def beam_search(model: NMT, test_iter, beam_size: int,
                max_decoding_time_step: int) -> List[List[Hypothesis]]:
    """ Run beam search to construct hypotheses for a list of src-language sentences.
    @param model (NMT): NMT Model
    @param test_iter: iterator over batches of source sentences (words) from the test set.
    @param beam_size (int): beam_size (# of hypotheses to hold for a translation at every step)
    @param max_decoding_time_step (int): maximum sentence length that Beam search can produce
    @returns hypotheses (List[List[Hypothesis]]): List of Hypothesis translations for every source sentence.
    """
    was_training = model.training
    model.eval()

    hypotheses = []
    with torch.no_grad():
        for _, data in enumerate(test_iter):
            # Unpack a torchtext-style batch; the field names src/trg and the
            # (tensor, lengths) source format are assumptions, not shown in this snippet.
            (src_sents, src_lengths), (_, _) = data.src, data.trg
            example_hyps = model.beam_search(
                src_sents,
                src_lengths,
                beam_size=beam_size,
                max_decoding_time_step=max_decoding_time_step)

            hypotheses.append(example_hyps)

    if was_training: model.train(was_training)

    return hypotheses
Example #2
def beam_search(model: NMT, test_data_src: List[List[str]], beam_size: int,
                max_decoding_time_step: int) -> List[List[Hypothesis]]:
    """ Run beam search to construct hypotheses for a list of src-language sentences.
    @param model (NMT): NMT Model
    @param test_data_src (List[List[str]]): List of sentences (words) in source language, from test set.
    @param beam_size (int): beam_size (# of hypotheses to hold for a translation at every step)
    @param max_decoding_time_step (int): maximum sentence length that Beam search can produce
    @returns hypotheses (List[List[Hypothesis]]): List of Hypothesis translations for every source sentence.
    """
    was_training = model.training
    model.eval()

    hypotheses = []
    with torch.no_grad():
        for src_sent in tqdm(test_data_src, desc='Decoding', file=sys.stdout):
            example_hyps = model.beam_search(
                src_sent,
                beam_size=beam_size,
                max_decoding_time_step=max_decoding_time_step)

            hypotheses.append(example_hyps)

    if was_training: model.train(was_training)

    return hypotheses
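
The wrappers above return one list of Hypothesis objects per source sentence. As a point of reference, here is a minimal sketch of how the decoded output might be consumed, assuming the common Hypothesis namedtuple with `value` (target tokens) and `score` (log-likelihood) fields; the writer function and its detokenization-by-join are illustrative only:

from collections import namedtuple

# Assumed shape of the Hypothesis type used throughout these examples.
Hypothesis = namedtuple('Hypothesis', ['value', 'score'])

def write_top_hypotheses(hypotheses, output_path):
    """Write the highest-scoring translation for each source sentence, one per line."""
    with open(output_path, 'w') as f:
        for hyps in hypotheses:  # hyps: List[Hypothesis] for one source sentence
            best = max(hyps, key=lambda h: h.score)
            f.write(' '.join(best.value) + '\n')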
Example #3
def beam_search(model: NMT, test_data_src: List[List[str]], beam_size: int,
                max_decoding_time_step: int) -> List[List[Hypothesis]]:
    """ 对源句子列表使用beam search去构建假设.
    @param model (NMT): NMT 模型
    @param test_data_src (List[List[str]]): 源句子列表, 测试集中的.
    @param beam_size (int): beam_size (每一步的候选数)
    @param max_decoding_time_step (int): Beam search 能产生的最大句子长度
    @returns hypotheses (List[List[Hypothesis]]): 每个源句子的beam_size个假设.
    """
    was_training = model.training
    model.eval()

    hypotheses = []  # hypotheses for all source sentences
    with torch.no_grad():
        for src_sent in tqdm(test_data_src, desc='Decoding', file=sys.stdout):
            example_hyps = model.beam_search(
                src_sent,
                beam_size=beam_size,
                max_decoding_time_step=max_decoding_time_step)

            hypotheses.append(example_hyps)  # add this sentence's hypotheses to the list

    if was_training: model.train(was_training)

    return hypotheses
Example #4
def init_training(args):
    from functools import partial
    import pickle
    pickle.load = partial(pickle.load, encoding="latin1")
    pickle.Unpickler = partial(pickle.Unpickler, encoding="latin1")
    # model = torch.load(model_file, map_location=lambda storage, loc: storage, pickle_module=pickle)
    vocab = torch.load(args.vocab,
                       map_location=lambda storage, loc: storage,
                       pickle_module=pickle)

    model = NMT(args, vocab)
    model.train()

    if args.uniform_init:
        print('uniformly initialize parameters [-%f, +%f]' %
              (args.uniform_init, args.uniform_init),
              file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-args.uniform_init, args.uniform_init)

    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt['<pad>']] = 0
    nll_loss = nn.NLLLoss(weight=vocab_mask, reduction='sum')
    cross_entropy_loss = nn.CrossEntropyLoss(weight=vocab_mask,
                                             reduction='sum')

    if args.cuda:
        # model = nn.DataParallel(model).cuda()
        model = model.cuda()
        nll_loss = nll_loss.cuda()
        cross_entropy_loss = cross_entropy_loss.cuda()

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    return vocab, model, optimizer, nll_loss, cross_entropy_loss
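
The pickle monkey-patching above is the standard workaround for loading objects that were serialized under Python 2, whose byte strings cannot be decoded by Python 3's default unpickler. A self-contained illustration of the same idea with plain pickle (the file path is hypothetical):

import pickle

# 'latin1' maps every byte value to a code point, so legacy Python 2 byte
# strings survive unpickling under Python 3.
with open('legacy_vocab.pkl', 'rb') as f:
    legacy_obj = pickle.load(f, encoding='latin1')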
Example #5
def beam_search(model: NMT, test_iterator: BucketIterator, beam_size: int,
                max_decoding_time_step: int) -> List[List[Hypothesis]]:
    """ Run beam search to construct hypotheses for a list of src-language sentences.
    @param model (NMT): NMT Model
    @param test_iterator (BucketIterator): iterator over source-language batches from the test set.
    @param beam_size (int): beam_size (# of hypotheses to hold for a translation at every step)
    @param max_decoding_time_step (int): maximum sentence length that Beam search can produce
    @returns hypotheses (List[List[Hypothesis]]): List of Hypothesis translations for every source sentence.
    """
    was_training = model.training
    model.eval()

    hypotheses = []
    with torch.no_grad():
        # for src_sent in tqdm(test_data_src, desc='Decoding', file=sys.stdout):
        for i, batch in enumerate(test_iterator):
            src_sents, src_sents_lens = batch.src
            src_sents = src_sents.permute(1, 0)
            for j in range(len(src_sents_lens)):
                src_sent = src_sents[j]
                example_hyps = model.beam_search(
                    src_sent,
                    src_sents_lens[j],
                    beam_size=beam_size,
                    max_decoding_time_step=max_decoding_time_step)
                hypotheses.append(example_hyps)

    if was_training: model.train(was_training)

    return hypotheses
Example #6
def train(args: Dict):
    """ Train the NMT Model.
    @param args (Dict): args from cmd line
    """
    train_data_src = read_corpus(args['--train-src'], source='src')
    train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')

    dev_data_src = read_corpus(args['--dev-src'], source='src')
    dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    train_batch_size = int(args['--batch-size'])
    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    vocab = Vocab.load(args['--vocab'])

    model = NMT(embed_size=int(args['--embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                vocab=vocab)
    model.train()

    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt['<pad>']] = 0

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
Example #7
def beam_search(model: NMT, test_data_src: List[List[str]], beam_size: int, max_decoding_time_step: int)\
        -> List[List[Hypothesis]]:
    """ Run beam search to construct hypotheses for a list of src-language sentences.
    :param NMT model: NMT Model
    :param List[List[str]] test_data_src: List of sentences (words) in source language, from test set
    :param int beam_size: beam_size (number of hypotheses to keep for a translation at every step)
    :param int max_decoding_time_step: maximum sentence length that beam search can produce
    :returns List[List[Hypothesis]] hypotheses: List of Hypothesis translations for every source sentence
    """
    was_training = model.training
    model.eval()

    hypotheses = []
    with torch.no_grad():
        for src_sent in tqdm(test_data_src, desc='Decoding', file=sys.stdout):
            example_hyps = model.beam_search(
                src_sent,
                beam_size=beam_size,
                max_decoding_time_step=max_decoding_time_step)
            hypotheses.append(example_hyps)

    if was_training: model.train(was_training)

    return hypotheses
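
Once decoding is done, the top hypothesis per sentence is typically scored against the references with corpus-level BLEU. A hedged sketch using NLTK; the function name and the <s>/</s> stripping convention are assumptions, not taken from the snippets above:

from nltk.translate.bleu_score import corpus_bleu

def compute_corpus_level_bleu_score(references, top_hypotheses):
    """references: List[List[str]] of tokenized gold target sentences;
    top_hypotheses: List[Hypothesis], best hypothesis per source sentence."""
    # Strip sentence-boundary tokens from the references if present.
    refs = [ref[1:-1] if ref and ref[0] == '<s>' else ref for ref in references]
    return corpus_bleu([[ref] for ref in refs],
                       [hyp.value for hyp in top_hypotheses])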
Example #8
def train(args: Dict):
    """ Train the NMT Model.
    @param args (Dict): args from cmd line
    """
    train_data_src = read_corpus(args['--train-src'], source='src')
    train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')
    # print("train data")
    # print(train_data_src)
    # print(len(train_data_tgt))
    dev_data_src = read_corpus(args['--dev-src'], source='src')
    dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')
    #print("dev data")
    #print(dev_data_src)
    #print(len(dev_data_tgt))
    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))
    #print(train_data)
    #print(dev_data)
    train_batch_size = int(args['--batch-size'])

    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    vocab = Vocab.load(args['--vocab'])
    # print("vocab")
    # print(vocab.src)
    # print(vocab.tgt)
    model = NMT(embed_size=int(args['--embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                vocab=vocab, no_char_decoder=args['--no-char-decoder'])
    model.train()

    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt['<pad>']] = 0

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    print('use device: %s' % device, file=sys.stderr)

    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    while True:
        epoch += 1

        for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True):
            train_iter += 1
            #print(src_sents)
            #print(src_sents.shape)
            #print(tgt_sents)
            #print(tgt_sents.shape)
            optimizer.zero_grad()

            batch_size = len(src_sents)

            example_losses = -model(src_sents, tgt_sents) # (batch_size,)
            batch_loss = example_losses.sum()
            loss = batch_loss / batch_size

            loss.backward()

            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)

            optimizer.step()

            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter,
                                                                                         report_loss / report_examples,
                                                                                         math.exp(report_loss / report_tgt_words),
                                                                                         cum_examples,
                                                                                         report_tgt_words / (time.time() - train_time),
                                                                                         time.time() - begin_time), file=sys.stderr)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # perform validation
            if train_iter % valid_niter == 0:
                print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' % (epoch, train_iter,
                                                                                         cum_loss / cum_examples,
                                                                                         np.exp(cum_loss / cum_tgt_words),
                                                                                         cum_examples), file=sys.stderr)

                cum_loss = cum_examples = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl and bleu
                dev_ppl = evaluate_ppl(model, dev_data, batch_size=128)   # dev batch size can be a bit larger
                valid_metric = -dev_ppl

                print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl), file=sys.stderr)

                is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    print('save currently the best model to [%s]' % model_save_path, file=sys.stderr)
                    model.save(model_save_path)

                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(), model_save_path + '.optim')
                elif patience < int(args['--patience']):
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == int(args['--patience']):
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == int(args['--max-num-trial']):
                            print('early stop!', file=sys.stderr)
                            exit(0)

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * float(args['--lr-decay'])
                        print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)

                        # load model
                        params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers', file=sys.stderr)
                        optimizer.load_state_dict(torch.load(model_save_path + '.optim'))

                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        patience = 0

            if epoch == int(args['--max-epoch']):
                print('reached maximum number of epochs!', file=sys.stderr)
                exit(0)
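
The training loops in these examples iterate over batch_iter(train_data, batch_size, shuffle), which is expected to yield (src_sents, tgt_sents) pairs with each batch sorted by decreasing source length (as packed RNN sequences require). The helper itself is not shown; a minimal sketch under those assumptions:

import math
import numpy as np

def batch_iter(data, batch_size, shuffle=False):
    """Yield (src_sents, tgt_sents) batches from a list of (src, tgt) pairs,
    each batch sorted by descending source-sentence length."""
    batch_num = math.ceil(len(data) / batch_size)
    index_array = list(range(len(data)))
    if shuffle:
        np.random.shuffle(index_array)
    for i in range(batch_num):
        indices = index_array[i * batch_size: (i + 1) * batch_size]
        examples = sorted((data[idx] for idx in indices),
                          key=lambda e: len(e[0]), reverse=True)
        yield [e[0] for e in examples], [e[1] for e in examples]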
Example #9
def train():
    text = Text(config.src_corpus, config.tar_corpus)
    train_data = Data(config.train_path_src, config.train_path_tar)
    dev_data = Data(config.dev_path_src, config.dev_path_tar)
    train_loader = DataLoader(dataset=train_data,
                              batch_size=config.batch_size,
                              shuffle=True,
                              collate_fn=utils.get_batch)
    dev_loader = DataLoader(dataset=dev_data,
                            batch_size=config.dev_batch_size,
                            shuffle=True,
                            collate_fn=utils.get_batch)
    parser = OptionParser()
    parser.add_option("--embed_size",
                      dest="embed_size",
                      default=config.embed_size)
    parser.add_option("--hidden_size",
                      dest="hidden_size",
                      default=config.hidden_size)
    parser.add_option("--window_size_d",
                      dest="window_size_d",
                      default=config.window_size_d)
    parser.add_option("--encoder_layer",
                      dest="encoder_layer",
                      default=config.encoder_layer)
    parser.add_option("--decoder_layers",
                      dest="decoder_layers",
                      default=config.decoder_layers)
    parser.add_option("--dropout_rate",
                      dest="dropout_rate",
                      default=config.dropout_rate)
    (options, args) = parser.parse_args()
    device = torch.device("cuda:0" if config.cuda else "cpu")
    #model_path = "/home/wangshuhe/shuhelearn/ShuHeLearning/NMT_attention/result/01.31_drop0.3_54_21.46508598886769_checkpoint.pth"
    #print(f"load model from {model_path}", file=sys.stderr)
    #model = NMT.load(model_path)
    model = NMT(text, options, device)
    #model = model.cuda()
    #model_path = "/home/wangshuhe/shuhelearn/ShuHeLearning/NMT_attention/result/140_164.29781984744628_checkpoint.pth"
    #print(f"load model from {model_path}", file=sys.stderr)
    #model = NMT.load(model_path)
    #model = torch.nn.DataParallel(model)
    model = model.to(device)
    model = model.cuda()
    model.train()
    optimizer = Optim(torch.optim.Adam(model.parameters()))
    #optimizer = Optim(torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-9), config.hidden_size, config.warm_up_step)
    #print(optimizer.lr)
    epoch = 0
    valid_num = 1
    hist_valid_ppl = []

    print("begin training!")
    while (True):
        epoch += 1
        max_iter = int(math.ceil(len(train_data) / config.batch_size))
        with tqdm(total=max_iter, desc="train") as pbar:
            for src_sents, tar_sents, tar_words_num_to_predict in train_loader:
                optimizer.zero_grad()
                batch_size = len(src_sents)

                now_loss = -model(src_sents, tar_sents)
                now_loss = now_loss.sum()
                loss = now_loss / batch_size
                loss.backward()

                _ = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   config.clip_grad)
                #optimizer.updata_lr()
                optimizer.step_and_updata_lr()

                pbar.set_postfix({
                    "epwwoch":
                    epoch,
                    "avg_loss":
                    loss.item(),
                    "ppl":
                    math.exp(now_loss.item() / tar_words_num_to_predict),
                    "lr":
                    optimizer.lr
                })
                #pbar.set_postfix({"epoch": epoch, "avg_loss": loss.item(), "ppl": math.exp(now_loss.item()/tar_words_num_to_predict)})
                pbar.update(1)
        #print(optimizer.lr)
        if (epoch % config.valid_iter == 0):
            #if (epoch >= config.valid_iter//2):
            if (valid_num % 5 == 0):
                valid_num = 0
                optimizer.updata_lr()
            valid_num += 1
            print("now begin validation ...", file=sys.stderr)
            eav_ppl = evaluate_ppl(model, dev_data, dev_loader)
            print("validation ppl %.2f" % (eav_ppl), file=sys.stderr)
            flag = len(hist_valid_ppl) == 0 or eav_ppl < min(hist_valid_ppl)
            if (flag):
                print("current model is the best!, save to [%s]" %
                      (config.model_save_path),
                      file=sys.stderr)
                hist_valid_ppl.append(eav_ppl)
                model.save(
                    os.path.join(
                        config.model_save_path,
                        f"02.08_window35drop0.2_{epoch}_{eav_ppl}_checkpoint.pth"
                    ))
                torch.save(
                    optimizer.optimizer.state_dict(),
                    os.path.join(
                        config.model_save_path,
                        f"02.08_window35drop0.2_{epoch}_{eav_ppl}_optimizer.optim"
                    ))
        if (epoch == config.max_epoch):
            print("reach the maximum number of epochs!", file=sys.stderr)
            return
Example #10
def train(args: Dict):
    """ Train the NMT Model.
    @param args (Dict): args from cmd line
    """
    sample_rate = 22000
    resample_rate = 8000
    train_records = 8
    max_epoch = 10
    vocab = Vocab.load('dataset/vocab_full.json')
    # train_voices_files, corpus = get_voice_files_and_corpus('dataset/train/wavs', train_records)
    # voices = load_voices_files(train_voices_files, sample_rate, resample_rate)
    # train_data = list(zip(voices, corpus))

    dev_files, dev_corpus = get_voice_files_and_corpus('dataset/dev', 2)
    dev_data = list(
        zip(load_voices_files(dev_files, sample_rate, resample_rate),
            dev_corpus))

    epoch_size = 4
    train_batch_size = 2

    clip_grad = 5.0
    valid_niter = 100
    log_every = 10
    model_save_path = 'model.bin'

    model = NMT(embed_size=1024, hidden_size=2048, vocab=vocab)
    model.train()
    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt['<pad>']] = 0

    # device = torch.device("cuda:0")
    device = torch.device("cpu")
    print('use device: %s' % device, file=sys.stderr)

    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')
    data_queue = Queue()
    batch_queue = Queue(1)
    loss_queue = Queue(2)

    train_data_to_queue_process = Process(target=load_train_data,
                                          args=('dataset/train', train_records,
                                                epoch_size, data_queue))
    train_data_to_queue_process.start()

    batch_iter_to_queue_process = Process(target=batch_iter_to_queue2,
                                          args=(data_queue, batch_queue,
                                                loss_queue, max_epoch,
                                                train_batch_size, True))
    batch_iter_to_queue_process.start()
    epoch, voices, tgt_sents = batch_queue.get(True)
    current_epoch = -1

    while voices is not None and tgt_sents is not None:

        train_iter += 1
        optimizer.zero_grad()

        # voices = load_voices_files(voice_files, sample_rate, resample_rate)
        # voices = voice_files
        batch_size = len(voices)

        example_losses = -model(voices, tgt_sents)  # (batch_size,)
        batch_loss = example_losses.sum()
        loss = batch_loss / batch_size

        loss.backward()

        # clip gradient
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   clip_grad)

        optimizer.step()

        batch_losses_val = batch_loss.item()
        report_loss += batch_losses_val
        cum_loss += batch_losses_val

        tgt_words_num_to_predict = sum(
            len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
        report_tgt_words += tgt_words_num_to_predict
        cum_tgt_words += tgt_words_num_to_predict
        report_examples += batch_size
        cum_examples += batch_size
        loss_queue.put(report_loss / report_examples)

        if train_iter % log_every == 0:
            print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
                'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter,
                                                                                    report_loss / report_examples,
                                                                                    math.exp(report_loss / report_tgt_words),
                                                                                    cum_examples,
                                                                                    report_tgt_words / (time.time() - train_time),
                                                                                    time.time() - begin_time), file=sys.stderr)

            train_time = time.time()
            report_loss = report_tgt_words = report_examples = 0.

        # perform validation
        if train_iter % valid_niter == 0:
            print(
                'epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d'
                % (epoch, train_iter, cum_loss / cum_examples,
                   np.exp(cum_loss / cum_tgt_words), cum_examples),
                file=sys.stderr)

            cum_loss = cum_examples = cum_tgt_words = 0.
            valid_num += 1

            print('begin validation ...', file=sys.stderr)

            # compute dev. ppl and bleu
            dev_ppl = evaluate_ppl(
                model, dev_data,
                batch_size=128)  # dev batch size can be a bit larger
            valid_metric = -dev_ppl

            print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl),
                  file=sys.stderr)

            is_better = len(hist_valid_scores
                            ) == 0 or valid_metric > max(hist_valid_scores)
            hist_valid_scores.append(valid_metric)

            if is_better:
                patience = 0
                print('save currently the best model to [%s]' %
                      model_save_path,
                      file=sys.stderr)
                model.save(model_save_path)

                # also save the optimizers' state
                torch.save(optimizer.state_dict(), model_save_path + '.optim')
            elif patience < 10:
                patience += 1
                print('hit patience %d' % patience, file=sys.stderr)

                if patience == 5:
                    num_trial += 1
                    print('hit #%d trial' % num_trial, file=sys.stderr)
                    if num_trial == 3:
                        print('early stop!', file=sys.stderr)
                        exit(0)

                    # decay lr, and restore from previously best checkpoint
                    lr = optimizer.param_groups[0]['lr'] * 0.5
                    print(
                        'load previously best model and decay learning rate to %f'
                        % lr,
                        file=sys.stderr)

                    # load model
                    params = torch.load(
                        model_save_path,
                        map_location=lambda storage, loc: storage)
                    model.load_state_dict(params['state_dict'])
                    model = model.to(device)

                    print('restore parameters of the optimizers',
                          file=sys.stderr)
                    optimizer.load_state_dict(
                        torch.load(model_save_path + '.optim'))

                    # set new lr
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr

                    # reset patience
                    patience = 0

        epoch, voices, tgt_sents = batch_queue.get()
    batch_iter_to_queue_process.join()
    train_data_to_queue_process.join()
Example #11
def train(args):
    """ Train the NMT Model.
    """
    train_data_src = read_corpus(args.train_src, source='src')
    train_data_tgt = read_corpus(args.train_tgt, source='tgt')
    dev_data_src = read_corpus(args.dev_src, source='src')
    dev_data_tgt = read_corpus(args.dev_tgt, source='tgt')

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    vocab = Vocab.load(args.vocab_file)
    model = NMT(embed_size=args.embed_size,
                hidden_size=args.hidden_size,
                dropout_rate=args.dropout,
                vocab=vocab)
    model.train()

    if np.abs(args.uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' % (args.uniform_init, args.uniform_init))
        for p in model.parameters():
            p.data.uniform_(-args.uniform_init, args.uniform_init)

    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt['<pad>']] = 0

    device = torch.device("cuda:0" if args.cuda else "cpu")
    print('use device: %s' % device)

    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    while True:
        epoch += 1
        batch_num = math.ceil(len(train_data) / args.batch_size)
        current_iter = 0
        for src_sents, tgt_sents in batch_iter(train_data, batch_size=args.batch_size, shuffle=True):
            current_iter += 1
            train_iter += 1

            optimizer.zero_grad()
            batch_size = len(src_sents)
            example_losses = -model(src_sents, tgt_sents)
            batch_loss = example_losses.sum()
            loss = batch_loss / batch_size
            loss.backward()

            # clip gradient
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad)

            optimizer.step()

            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            # omitting leading `<s>`
            tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents)
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % args.log_every == 0:
                print('epoch %d (%d / %d), iter %d, avg. loss %.2f, avg. ppl %.2f '
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' %
                      (epoch, current_iter, batch_num, train_iter,
                       report_loss / report_examples,
                       math.exp(report_loss / report_tgt_words),
                       cum_examples,
                       report_tgt_words / (time.time() - train_time),
                       time.time() - begin_time))

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # perform validation
            if train_iter % args.valid_niter == 0:
                print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' % (epoch, train_iter,
                      cum_loss / cum_examples,
                      np.exp(cum_loss / cum_tgt_words),
                      cum_examples))

                cum_loss = cum_examples = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...')

                # compute dev. ppl and bleu
                dev_ppl = evaluate_ppl(model, dev_data, batch_size=128)   # dev batch size can be a bit larger
                valid_metric = -dev_ppl

                print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl))

                is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    print('epoch %d, iter %d: save currently the best model to [%s]' %
                          (epoch, train_iter, args.model_path))
                    model.save(args.model_path)
                    torch.save(optimizer.state_dict(), args.model_path + '.optim')
                elif patience < args.patience:
                    patience += 1
                    print('hit patience %d' % patience)

                    if patience == args.patience:
                        num_trial += 1
                        print('hit #%d trial' % num_trial)
                        if num_trial == args.max_num_trial:
                            print('early stop!')
                            exit(0)

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * args.lr_decay
                        print('load previously best model and decay learning rate to %f' % lr)

                        # load model
                        params = torch.load(args.model_path, map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers')
                        optimizer.load_state_dict(torch.load(args.model_path + '.optim'))

                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        patience = 0

            if epoch == args.max_epoch:
                print('reached maximum number of epochs!')
                return
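
Each validation step above calls evaluate_ppl(model, dev_data, batch_size=...), which is also not shown. A sketch consistent with how it is used (the model is assumed to return per-example log-likelihoods; batch_iter as sketched after Example #8):

import numpy as np
import torch

def evaluate_ppl(model, dev_data, batch_size=32):
    """Perplexity of the model over dev_data, a list of (src_sents, tgt_sents) pairs."""
    was_training = model.training
    model.eval()
    cum_loss = 0.
    cum_tgt_words = 0.
    with torch.no_grad():
        for src_sents, tgt_sents in batch_iter(dev_data, batch_size):
            cum_loss += -model(src_sents, tgt_sents).sum().item()
            cum_tgt_words += sum(len(s[1:]) for s in tgt_sents)  # omit leading <s>
    if was_training:
        model.train()
    return np.exp(cum_loss / cum_tgt_words)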
Example #12
def train(index):
    torch.manual_seed(1)
    if (config.cuda):
        torch.cuda.manual_seed(1)
    device = torch.device(f"cuda:{index}" if config.cuda else "cpu")
    dist_rank = index
    torch.distributed.init_process_group(backend='nccl', init_method='tcp://localhost:23456', rank=dist_rank, world_size=1)
    is_master_node = (dist_rank == 0)
    
    args = dict()
    args['embed_size'] = config.embed_size
    args['d_model'] = config.d_model
    args['nhead'] = config.nhead
    args['num_encoder_layers'] = config.num_encoder_layers
    args['num_decoder_layers'] = config.num_decoder_layers
    args['dim_feedforward'] = config.dim_feedforward
    args['dropout'] = config.dropout
    args['smoothing_eps'] = config.smoothing_eps
    
    text = Text(config.src_corpus, config.tar_corpus)
    model = NMT(text, args, device)
    model = make_data_parallel(model, device)
    
    train_data = Data(config.train_path_src, config.train_path_tar)
    dev_data = Data(config.dev_path_src, config.dev_path_tar)
    train_sampler = DistributedSampler(train_data)
    dev_sampler = DistributedSampler(dev_data)
    train_loader = DataLoader(dataset=train_data, batch_size=int(config.train_batch_size/8), shuffle=False, num_workers=9, pin_memory=True, sampler=train_sampler, collate_fn=utils.get_batch)
    dev_loader = DataLoader(dataset=dev_data, batch_size=int(config.dev_batch_size/8), shuffle=False, num_workers=9, pin_memory=True, sampler=dev_sampler, collate_fn=utils.get_batch)

    model.train()
    optimizer = Optim(torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-9), config.d_model, config.warm_up_step)

    epoch = 0
    history_valid_ppl = []
    print("begin training!", file=sys.stderr)
    while (True):
        epoch += 1
        train_loader.sampler.set_epoch(epoch)
        max_iter = int(math.ceil(len(train_data)/config.train_batch_size))
        with tqdm(total=max_iter, desc="train") as pbar:
            for batch_src, batch_tar, tar_word_num in train_loader:
                optimizer.zero_grad()
                now_batch_size = len(batch_src)
                batch_loss = -model(batch_src, batch_tar, smoothing=True)
                batch_loss = batch_loss.sum()
                loss = batch_loss / now_batch_size
                loss.backward()
                torch.distributed.barrier()
                optimizer.step_and_updata_lr()
                if (is_master_node):
                    pbar.set_postfix({"epoch": epoch, "avg_loss": '{%.2f}' % (loss.item()), "ppl": '{%.2f}' % (batch_loss.item()/tar_word_num)})
                    pbar.update(1)
        if (epoch % config.valid_iter == 0):
            print("now begin validation...", file=sys.stderr)
            torch.distributed.barrier()
            eval_ppl = evaluate_ppl(model, dev_data, dev_loader, config.dev_batch_size, is_master_node)
            print(eval_ppl)
            flag = len(history_valid_ppl) == 0 or eval_ppl < min(history_valid_ppl)
            if (flag):
                print(f"current model is the best! save to [{config.model_save_path}]", file=sys.stderr)
                history_valid_ppl.append(eval_ppl)
                model.save(os.path.join(config.model_save_path, f"02.19_{epoch}_{eval_ppl}_checkpoint.pth"))
                torch.save(optimizer.optimizer.state_dict(), os.path.join(config.model_save_path, f"02.19_{epoch}_{eval_ppl}_optimizer.optim"))
        if (epoch == config.max_epoch):
            print("reach the maximum number of epochs!", file=sys.stderr)
            return
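
Example #12 relies on a make_data_parallel helper that is not included here. Given that the process group is initialized just above, a plausible sketch (an assumption, not the author's code) is a thin DistributedDataParallel wrapper:

import torch.nn as nn

def make_data_parallel(model, device):
    """Move the model to `device` and wrap it for distributed training."""
    model = model.to(device)
    if device.type == 'cuda':
        model = nn.parallel.DistributedDataParallel(model, device_ids=[device.index])
    return model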
Example #13
def train(args: Dict):
    """ Train the NMT Model.
    @param args (Dict): args from cmd line
    """
    train_data_src = read_corpus(args['--train-src'], source='src')
    train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')

    dev_data_src = read_corpus(args['--dev-src'], source='src')
    dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    train_batch_size = int(args['--batch-size'])

    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    vocab = Vocab.load(args['--vocab'])

    existing_model = args['--existing-model-path']
    start_from_existing_model = existing_model and os.path.isfile(
        existing_model)
    if start_from_existing_model:
        print("load model from {}".format(existing_model), file=sys.stderr)
        model = NMT.load(existing_model,
                         no_char_decoder=args['--no-char-decoder'])
    else:
        print("Create a new model from hyper parameters")
        model = NMT(embed_size=int(args['--embed-size']),
                    hidden_size=int(args['--hidden-size']),
                    dropout_rate=float(args['--dropout']),
                    vocab=vocab,
                    no_char_decoder=args['--no-char-decoder'])
    model.train()

    print_model_param_count(model)

    # TODO: How to print all the parameters of this model? And is it useful?
    if not start_from_existing_model:
        uniform_init = float(args['--uniform-init'])
        if np.abs(uniform_init) > 0.:
            print('uniformly initialize parameters [-%f, +%f]' %
                  (uniform_init, uniform_init),
                  file=sys.stderr)
            for p in model.parameters():
                p.data.uniform_(-uniform_init, uniform_init)

    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt['<pad>']] = 0

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    print('use device: %s' % device, file=sys.stderr)

    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    avg_train_ppls = []
    avg_valid_ppls = []

    # output_file_path = 'outputs/loss_%s' % datetime.datetime.now().strftime("%m-%d-%Y-%I:%M%p")
    output_file_path = os.path.join(
        args['--ppl-save-dir'],
        'ppl.json') if args['--ppl-save-dir'] else 'ppl.json'

    while True:
        epoch += 1

        for src_sents, tgt_sents in batch_iter(train_data,
                                               batch_size=train_batch_size,
                                               shuffle=True):
            train_iter += 1

            optimizer.zero_grad()

            batch_size = len(src_sents)

            example_losses = -model(src_sents, tgt_sents)  # (batch_size,)
            batch_loss = example_losses.sum()
            loss = batch_loss / batch_size

            loss.backward()

            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       clip_grad)

            optimizer.step()

            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(
                len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter,
                                                                                         report_loss / report_examples,
                                                                                         np.exp(report_loss / report_tgt_words),
                                                                                         cum_examples,
                                                                                         report_tgt_words / (time.time() - train_time),
                                                                                         time.time() - begin_time), file=sys.stderr)
                avg_train_ppls.append(np.exp(report_loss / report_tgt_words))
                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # perform validation
            if train_iter % valid_niter == 0:
                # The printed values are the train loss
                print(
                    'epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d'
                    % (epoch, train_iter, cum_loss / cum_examples,
                       np.exp(cum_loss / cum_tgt_words), cum_examples),
                    file=sys.stderr)

                cum_loss = cum_examples = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl and bleu
                dev_ppl = evaluate_ppl(
                    model, dev_data,
                    batch_size=128)  # dev batch size can be a bit larger
                avg_valid_ppls.append(dev_ppl)

                valid_metric = -dev_ppl

                print('validation: iter %d, dev. ppl %f' %
                      (train_iter, dev_ppl),
                      file=sys.stderr)

                is_better = len(hist_valid_scores
                                ) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    print('save currently the best model to [%s]' %
                          model_save_path,
                          file=sys.stderr)
                    model.save(model_save_path)

                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(),
                               model_save_path + '.optim')
                elif patience < int(args['--patience']):
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == int(args['--patience']):
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == int(args['--max-num-trial']):
                            print('early stop!', file=sys.stderr)
                            output_losses(args, log_every, valid_niter,
                                          avg_train_ppls, avg_valid_ppls,
                                          output_file_path)
                            exit(0)

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * float(
                            args['--lr-decay'])
                        print(
                            'load previously best model and decay learning rate to %f'
                            % lr,
                            file=sys.stderr)

                        # load model
                        params = torch.load(
                            model_save_path,
                            map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers',
                              file=sys.stderr)
                        optimizer.load_state_dict(
                            torch.load(model_save_path + '.optim'))

                        # TODO: len(optimizer.param_groups) == 1 ? Or the below code seems odd
                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        patience = 0

            if epoch == int(args['--max-epoch']):
                print('reached maximum number of epochs!', file=sys.stderr)
                output_losses(args, log_every, valid_niter, avg_train_ppls,
                              avg_valid_ppls, output_file_path)
                exit(0)
        output_losses(args, log_every, valid_niter, avg_train_ppls,
                      avg_valid_ppls, output_file_path)
        if args['--is-google-colab'] and epoch % 2 == 0 and os.path.isfile(
                model_save_path):
            shutil.copy(model_save_path, args['--ppl-save-dir'])
            shutil.copy(model_save_path + '.optim', args['--ppl-save-dir'])
            print("copied model files to google drive!")
Example #14
def train():
    torch.manual_seed(1)
    if (config.cuda):
        torch.cuda.manual_seed(1)
    args = dict()
    args['embed_size'] = config.embed_size
    args['d_model'] = config.d_model
    args['nhead'] = config.nhead
    args['num_encoder_layers'] = config.num_encoder_layers
    args['num_decoder_layers'] = config.num_decoder_layers
    args['dim_feedforward'] = config.dim_feedforward
    args['dropout'] = config.dropout
    args['smoothing_eps'] = config.smoothing_eps
    text = Text(config.src_corpus, config.tar_corpus)
    train_data = Data(config.train_path_src, config.train_path_tar)
    dev_data = Data(config.dev_path_src, config.dev_path_tar)
    train_loader = DataLoader(dataset=train_data,
                              batch_size=config.train_batch_size,
                              shuffle=True,
                              collate_fn=utils.get_batch)
    dev_loader = DataLoader(dataset=dev_data,
                            batch_size=config.dev_batch_size,
                            shuffle=True,
                            collate_fn=utils.get_batch)
    #train_data_src, train_data_tar = utils.read_corpus(config.train_path)
    #dev_data_src, dev_data_tar = utils.read_corpus(config.dev_path)
    device = torch.device("cuda:0" if config.cuda else "cpu")
    model = NMT(text, args, device)
    #model = nn.DataParallel(model, device_ids=[0, 1])
    model = model.to(device)
    #model = model.module
    #model_path = "/home/wangshuhe/shuhelearn/ShuHeLearning/NMT_transformer/result/02.01_1_344.6820465077113_checkpoint.pth"
    #model = NMT.load(model_path)
    #model = model.to(device)
    model.train()
    optimizer = Optim(
        torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-9),
        config.d_model, config.warm_up_step)
    #optimizer = Optim(torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-9), config.warm_up_step, config.init_lr, config.lr)
    #optimizer = Optim(torch.optim.Adam(model.parameters()))

    epoch = 0
    history_valid_ppl = []
    print("begin training!", file=sys.stderr)
    while (True):
        epoch += 1
        max_iter = int(math.ceil(len(train_data) / config.train_batch_size))
        with tqdm(total=max_iter, desc="train") as pbar:
            #for batch_src, batch_tar, tar_word_num in utils.batch_iter(train_data_src, train_data_tar, config.train_batch_size):
            for batch_src, batch_tar, tar_word_num in train_loader:
                optimizer.zero_grad()
                now_batch_size = len(batch_src)
                batch_loss = -model(batch_src, batch_tar, smoothing=True)
                batch_loss = batch_loss.sum()
                loss = batch_loss / now_batch_size
                loss.backward()
                #optimizer.step()
                #optimizer.updata_lr()
                optimizer.step_and_updata_lr()
                pbar.set_postfix({
                    "epoch":
                    epoch,
                    "avg_loss":
                    '{%.2f}' % (loss.item()),
                    "ppl":
                    '{%.2f}' % (math.exp(batch_loss.item() / tar_word_num))
                })
                pbar.update(1)
        if (epoch % config.valid_iter == 0):
            print("now begin validation...", file=sys.stderr)
            eval_ppl = evaluate_ppl(model, dev_data, dev_loader,
                                    config.dev_batch_size)
            print(eval_ppl)
            flag = len(
                history_valid_ppl) == 0 or eval_ppl < min(history_valid_ppl)
            if (flag):
                print(
                    f"current model is the best! save to [{config.model_save_path}]",
                    file=sys.stderr)
                history_valid_ppl.append(eval_ppl)
                model.save(
                    os.path.join(config.model_save_path,
                                 f"02.10_{epoch}_{eval_ppl}_checkpoint.pth"))
                torch.save(
                    optimizer.optimizer.state_dict(),
                    os.path.join(config.model_save_path,
                                 f"02.10_{epoch}_{eval_ppl}_optimizer.optim"))
        if (epoch == config.max_epoch):
            print("reach the maximum number of epochs!", file=sys.stderr)
            return
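
Examples #12 and #14 wrap Adam in a custom Optim class constructed with d_model and warm_up_step and stepped via step_and_updata_lr, which suggests the Transformer warm-up ("Noam") schedule; Example #9 uses a simpler variant of the same wrapper. The class itself is not shown; a sketch that matches the interface used above (method names kept as called, including the original spelling):

class Optim:
    """Wraps an optimizer and applies lr = d_model**-0.5 * min(step**-0.5, step * warmup**-1.5)."""
    def __init__(self, optimizer, d_model, warm_up_step):
        self.optimizer = optimizer
        self.d_model = d_model
        self.warm_up_step = warm_up_step
        self._step = 0
        self.lr = 0.

    def zero_grad(self):
        self.optimizer.zero_grad()

    def step_and_updata_lr(self):
        self._step += 1
        self.lr = self.d_model ** -0.5 * min(self._step ** -0.5,
                                             self._step * self.warm_up_step ** -1.5)
        for group in self.optimizer.param_groups:
            group['lr'] = self.lr
        self.optimizer.step()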
Example #15
File: run.py Project: aaniin/cs224n
def train(args: Dict):
    """ Train the NMT Model.
    @param args (Dict): args from cmd line
    """
    train_data_src = read_corpus(args['--train-src'], source='src')
    train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')

    dev_data_src = read_corpus(args['--dev-src'], source='src')
    dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    train_batch_size = int(args['--batch-size'])
    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    vocab = Vocab.load(args['--vocab'])

    model = NMT(embed_size=int(args['--embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                vocab=vocab)
    model.train()

    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' %
              (uniform_init, uniform_init),
              file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt['<pad>']] = 0

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    print('use device: %s' % device, file=sys.stderr)

    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))

    # Set counters
    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    fwd_time = train_time = begin_time = time.time()

    # Begin training
    print('begin Maximum Likelihood training')

    while True:
        epoch += 1

        #  Loop over all data in selection batches
        for src_sents, tgt_sents in batch_iter(train_data,
                                               batch_size=train_batch_size,
                                               shuffle=True):

            # Sentences must be sorted by length (i.e., number of words)
            src_sents = sorted(src_sents, key=lambda e: len(e), reverse=True)
            tgt_sents = sorted(tgt_sents, key=lambda e: len(e), reverse=True)

            train_iter += 1
            # Zero out gradients, pytorch accumulates them
            optimizer.zero_grad()

            # Get loss
            train_batch_losses = (-model.forward(src_sents, tgt_sents))
            batch_loss = train_batch_losses.sum()
            loss = batch_loss / train_batch_size

            # Get gradients
            loss.backward()

            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       clip_grad)

            # step
            optimizer.step()

            # Report progress
            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            # Get some report metric
            tgt_words_num_to_predict = sum(
                len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += train_batch_size
            cum_examples += train_batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
                        'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter,
                                                                                            report_loss / report_examples,
                                                                                            math.exp(report_loss / report_tgt_words),
                                                                                            cum_examples,
                                                                                            report_tgt_words / (time.time() - train_time),
                                                                                            time.time() - begin_time), file=sys.stderr)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

                # Test saving and loading the model
                # test_save_load_model(model=model,optimizer=optimizer)

            # perform validation
            if train_iter % valid_niter == 0:
                print(
                    'epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f, cum. examples %d'
                    % (epoch, train_iter, cum_loss / cum_examples,
                       np.exp(cum_loss / cum_tgt_words), cum_examples),
                    file=sys.stderr)

                cum_loss = cum_examples = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl and bleu
                #dev_ppl = evaluate_ppl(model, dev_data, batch_size=128)   # dev batch size can be a bit larger
                dev_ppl = evaluate_ppl(model,
                                       dev_data,
                                       batch_size=train_batch_size *
                                       2)  # dev batch size can be a bit larger
                valid_metric = -dev_ppl

                print('validation: iter %d, dev. ppl %f' %
                      (train_iter, dev_ppl),
                      file=sys.stderr)

                is_better = len(hist_valid_scores
                                ) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    print('save currently the best model to [%s]' %
                          model_save_path,
                          file=sys.stderr)
                    model.save(model_save_path)

                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(),
                               model_save_path + '.optim')
                elif patience < int(args['--patience']):
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == int(args['--patience']):
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == int(args['--max-num-trial']):
                            print('early stop!', file=sys.stderr)
                            exit(0)

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * float(
                            args['--lr-decay'])
                        print(
                            'load previously best model and decay learning rate to %f'
                            % lr,
                            file=sys.stderr)

                        # load model
                        #params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
                        # See https://github.com/pytorch/pytorch/issues/7415 and
                        # https://discuss.pytorch.org/t/on-a-cpu-device-how-to-load-checkpoint-saved-on-gpu-device/349 and
                        # https://github.com/pytorch/pytorch/issues/9139
                        params = torch.load(model_save_path,
                                            map_location='cpu')
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers',
                              file=sys.stderr)
                        # optimizer.load_state_dict(torch.load(model_save_path + '.optim')
                        optimizer.load_state_dict(
                            torch.load(model_save_path + '.optim',
                                       map_location='cpu'))
                        optimizer_to(optimizer, device)

                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        patience = 0

                if epoch == int(args['--max-epoch']):
                    print('reached maximum number of epochs!', file=sys.stderr)
                    exit(0)
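

# Note: `optimizer_to` (used after restoring a checkpoint above) is not defined in
# this snippet. A minimal sketch of such a helper, assuming it only needs to move
# the optimizer's state tensors onto the target device, might look like this:
def optimizer_to(optimizer, device):
    """Move every tensor held in the optimizer state to `device` (sketch)."""
    for state in optimizer.state.values():
        for k, v in state.items():
            if torch.is_tensor(v):
                state[k] = v.to(device)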
Example #16
0
def experiement(args: Dict, test_only, device):
    """ Train and Test the NMT Model.
    @param args (Dict): args from cmd line
    """
    # train_data_src = read_corpus(args['--train-src'], source='src')
    # train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')
    #
    # dev_data_src = read_corpus(args['--dev-src'], source='src')
    # dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')
    #
    # train_data = list(zip(train_data_src, train_data_tgt))
    # dev_data = list(zip(dev_data_src, dev_data_tgt))

    train_batch_size = int(args['--batch-size'])
    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    use_pos_embed = bool(args['--use-pos-embed'])
    use_copy = bool(args['--use-copy'])

    SRC, TRG, train_iterator, dev_iterator, test_iterator = load_data(
        args['--train-data'], args['--dev-data'], args['--test-data'], device,
        train_batch_size, (use_pos_embed or use_copy))

    vocab = Vocab(SRC, TRG)

    model = NMT(src_embed_size=int(args['--src-embed-size']),
                dst_embed_size=int(args['--dst-embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                vocab=vocab,
                use_pos_embed=use_pos_embed,
                use_copy=use_copy)

    model.load_pretrained_embeddings(vocab)

    # print("args: {}".format(args))

    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' %
              (uniform_init, uniform_init),
              file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    # def init_weights(m):
    #     for name, param in m.named_parameters():
    #         if 'weight' in name:
    #             nn.init.normal_(param.data, mean=0, std=0.01)
    #         else:
    #             nn.init.constant_(param.data, 0)
    #
    # model.apply(init_weights)

    # vocab_mask = torch.ones(len(vocab.tgt))
    # vocab_mask[vocab.tgt['<pad>']] = 0

    print('use device: %s' % device, file=sys.stderr)
    print(model)

    para_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'The model has {para_count:,} trainable parameters')
    print("file path: {}".format(model_save_path))

    if test_only:
        model.eval()
        decode(args, test_iterator, vocab, device)
        exit(0)

    # perform training
    model.train()
    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    while True:
        epoch += 1

        # perform training
        model.train()
        # for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True):
        for i, batch in enumerate(train_iterator):
            train_iter += 1

            optimizer.zero_grad()
            src_sents, src_sents_lens = batch.src
            tgt_sents = batch.trg
            batch_size = src_sents.shape[1]
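            # torchtext batches default to shape (seq_len, batch), so dim 1 is the batch size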

            example_losses = -model(src_sents, src_sents_lens,
                                    tgt_sents)  # (batch_size,)
            batch_loss = example_losses.sum()
            loss = batch_loss / batch_size

            loss.backward()

            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       clip_grad)

            optimizer.step()

            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(
                len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

        # if train_iter % log_every == 0:
        # print("")
        print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
              'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter, report_loss / report_examples,
                                                                                         math.exp(report_loss / report_tgt_words),
                                                                                         cum_examples,
                                                                                         report_tgt_words / (time.time() - train_time),
                                                                                         time.time() - begin_time), file=sys.stderr)

        train_time = time.time()
        report_loss = report_tgt_words = report_examples = 0.

        # perform validation
        # model.eval()
        # if train_iter % valid_niter == 0:
        # print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' % (epoch, train_iter,
        #                                                                                      cum_loss / cum_examples,
        #                                                                                      np.exp(cum_loss / cum_tgt_words),
        #                                                                                      cum_examples), file=sys.stderr)

        cum_loss = cum_examples = cum_tgt_words = 0.
        valid_num += 1

        # print('begin validation ...', file=sys.stderr)

        # compute dev. ppl and bleu
        dev_ppl = evaluate_ppl(
            model, dev_iterator,
            batch_size=128)  # dev batch size can be a bit larger
        valid_metric = -dev_ppl

        print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl),
              file=sys.stderr)

        is_better = len(
            hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
        hist_valid_scores.append(valid_metric)

        if is_better:
            patience = 0
            # print('save currently the best model to [%s]' % model_save_path, file=sys.stderr)
            model.save(model_save_path)

            # also save the optimizers' state
            torch.save(optimizer.state_dict(), model_save_path + '.optim')
        elif patience < int(args['--patience']):
            patience += 1
            print('hit patience %d' % patience, file=sys.stderr)

            if patience == int(args['--patience']):
                num_trial += 1
                print('hit #%d trial' % num_trial, file=sys.stderr)
                if num_trial == int(args['--max-num-trial']):
                    print('early stop!', file=sys.stderr)
                    # exit(0)
                    break

                # decay lr, and restore from previously best checkpoint
                lr = optimizer.param_groups[0]['lr'] * float(
                    args['--lr-decay'])
                print(
                    'load previously best model and decay learning rate to %f'
                    % lr,
                    file=sys.stderr)

                # load model
                params = torch.load(model_save_path,
                                    map_location=lambda storage, loc: storage)
                model.load_state_dict(params['state_dict'])
                model = model.to(device)

                print('restore parameters of the optimizers', file=sys.stderr)
                optimizer.load_state_dict(
                    torch.load(model_save_path + '.optim'))

                # set new lr
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr

                # reset patience
                patience = 0

        if epoch == int(args['--max-epoch']):
            print('reached maximum number of epochs!', file=sys.stderr)
            break

    # perform testing
    model.eval()
    decode(args, test_iterator, vocab, device)
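

# Note: `evaluate_ppl` is called throughout these examples but not shown. A minimal
# sketch in the spirit of the standard NMT assignment code (an assumption -- the exact
# signature varies slightly between the examples, e.g. some pass an iterator instead
# of a list of sentence pairs) could be:
def evaluate_ppl(model, dev_data, batch_size=32):
    """Return perplexity = exp(total loss / total target words) on dev_data (sketch)."""
    was_training = model.training
    model.eval()

    cum_loss = 0.
    cum_tgt_words = 0.
    with torch.no_grad():
        for src_sents, tgt_sents in batch_iter(dev_data, batch_size=batch_size):
            loss = -model(src_sents, tgt_sents).sum()
            cum_loss += loss.item()
            cum_tgt_words += sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`

    if was_training:
        model.train()
    return np.exp(cum_loss / cum_tgt_words)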
Example #17
0
def train(args: Dict):
    """ Train the NMT Model.
    @param args (Dict): args from cmd line
    """
    train_data_src = read_corpus(args['--train-src'], source='src')
    train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')

    dev_data_src = read_corpus(args['--dev-src'], source='src')
    dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    train_batch_size = int(args['--batch-size'])
    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    # Prefer to do the entire train/val/test split in the code itself, as opposed to the previous script.

    #data preprocessing for Qs and As.
    spacy_en = spacy.load('en')

    def tokenizer(text):  # create a tokenizer function
        return [tok.text for tok in spacy_en.tokenizer(text)]

    TEXT = Field(sequential=True,
                 tokenize=tokenizer,
                 lower=True,
                 include_lengths=True,
                 init_token='<s>',
                 eos_token='</s>')
    analogies_datafields = [("abc", TEXT), ("d", TEXT)]

    train, val, test = TabularDataset.splits(
        path="data",  # the root directory where the data lies
        train='ngram_train.csv',
        validation="ngram_val.csv",
        test='ngram_test.csv',
        format='csv',
        skip_header=False,  # set to True if your csv has a header row, so it doesn't get processed as data
        fields=analogies_datafields)

    pretrained_vecs = torchtext.vocab.Vectors('../GloVe-1.2/life_vectors.txt')
    TEXT.build_vocab(
        vectors=pretrained_vecs)  # specials=['<pad>', '<s>', '</s>']

    if args['--cuda'] == 'cpu':
        torch_text_device = -1
    else:
        torch_text_device = 0

    training_iter, val_iter, test_iter = Iterator.splits(
        (train, val, test),
        sort_key=lambda x: len(x.abc),
        batch_sizes=(100, 20, 1),
        device=torch_text_device,
        sort_within_batch=True)

    model = NMT(embed_size=int(args['--embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                vocab=TEXT.vocab)
    model.train()  #sets training = True

    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' %
              (uniform_init, uniform_init),
              file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    print('use device: %s' % device, file=sys.stderr)

    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    writer = SummaryWriter('logs')
    is_better_count = 0  # TODO: remove this workaround and debug why training does not stop on its own
    while True:
        epoch += 1

        for _, data in enumerate(training_iter):
            (src_sents, src_lengths), (tgt_sents, _) = data.abc, data.d

            train_iter += 1

            optimizer.zero_grad()

            batch_size = src_sents.shape[1]

            example_losses = model(src_sents, src_lengths,
                                   tgt_sents)  # (batch_size,)
            batch_loss = example_losses.sum()
            loss = batch_loss / batch_size

            loss.backward()

            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       clip_grad)

            optimizer.step()

            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(
                len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter,
                                                                                         report_loss / report_examples,
                                                                                         math.exp(report_loss / report_tgt_words),
                                                                                         cum_examples,
                                                                                         report_tgt_words / (time.time() - train_time),
                                                                                         time.time() - begin_time), file=sys.stderr)

                writer.add_scalar('Train/AvgLoss',
                                  report_loss / report_examples, epoch)
                writer.add_scalar('Train/AvgPPL',
                                  math.exp(report_loss / report_tgt_words),
                                  epoch)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # perform validation
            if train_iter % valid_niter == 0:
                print(
                    'epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d'
                    % (epoch, train_iter, cum_loss / cum_examples,
                       np.exp(cum_loss / cum_tgt_words), cum_examples),
                    file=sys.stderr)

                cum_loss = cum_examples = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl and bleu
                dev_ppl, val_loss = evaluate_ppl(
                    model, val_iter)  # dev batch size can be a bit larger
                valid_metric = -dev_ppl

                print('validation: iter %d, dev. ppl %f, dev loss %f' %
                      (train_iter, dev_ppl, val_loss),
                      file=sys.stderr)
                writer.add_scalar('Val/AvgPPL', dev_ppl, epoch)
                writer.add_scalar('Val/AvgLoss', val_loss, epoch)

                is_better = len(hist_valid_scores
                                ) == 0 or valid_metric > max(hist_valid_scores)
                print(hist_valid_scores)
                print(valid_metric)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    print('save currently the best model to [%s]' %
                          model_save_path,
                          file=sys.stderr)
                    model.save(model_save_path)
                    is_better_count = is_better_count + 1
                    print(is_better_count)
                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(),
                               model_save_path + '.optim')
                    if is_better_count > 3:
                        print('model improved more than 3 times, stopping early',
                              file=sys.stderr)
                        writer.close()
                        exit(0)

                elif patience < int(args['--patience']):
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == int(args['--patience']):
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == int(args['--max-num-trial']):
                            print('early stop!', file=sys.stderr)
                            exit(0)

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * float(
                            args['--lr-decay'])
                        print(
                            'load previously best model and decay learning rate to %f'
                            % lr,
                            file=sys.stderr)

                        # load model
                        params = torch.load(
                            model_save_path,
                            map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers',
                              file=sys.stderr)
                        optimizer.load_state_dict(
                            torch.load(model_save_path + '.optim'))

                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        patience = 0

                if epoch == int(args['--max-epoch']):
                    print('reached maximum number of epochs!', file=sys.stderr)
                    writer.close()
                    exit(0)
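

# Note: `read_corpus` (used above to load the raw parallel data) is not shown here.
# A minimal sketch, assuming the usual convention of space-tokenised lines with
# <s>/</s> wrapped around target sentences only, might be:
def read_corpus(file_path, source):
    """Read a tokenised corpus file into a list of word lists (sketch)."""
    data = []
    for line in open(file_path, encoding='utf-8'):
        sent = line.strip().split(' ')
        if source == 'tgt':
            sent = ['<s>'] + sent + ['</s>']
        data.append(sent)
    return data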
Example #18
0
def train(args: Dict):
    """ Train the NMT Model.
    @param args (Dict): args from cmd line
    """
    long_logfile = "long_logfiles/" + str(time.time()) + "long.txt"
    train_logfile = "train_logfiles/" + str(time.time()) + "train.txt"
    dev_logfile = "dev_logfiles/" + str(time.time()) + "dev.txt"
    f_long = open(long_logfile, "w")
    f_train = open(train_logfile, "w")
    # TODO: add hyperparameters
    args_tuples = [(arg, args[arg]) for arg in args]
    f_train.write("#args_tuples: %s\n" % args_tuples)
    for (arg, val) in args_tuples:
        f_train.write("#%s: %s\n" % (arg, val))
    f_train.write("#epoch, train iter, train score\n")
    f_dev = open(dev_logfile, "w")
    f_dev.write("#epoch, train iter, dev score, dev accuracy\n")

    binary = int(args["--num-classes"]) == 2

    train_data = load_train_data(perct=float(args["--train-perct"]),
                                 binary=binary)
    dev_data = load_dev_data(dev_perct=float(args["--dev-perct"]),
                             binary=binary)

    train_batch_size = int(args["--batch-size"])
    clip_grad = float(args["--clip-grad"])
    valid_niter = int(args["--valid-niter"])
    log_every = int(args["--log-every"])
    model_save_path = args["--save-to"]

    embed_size = int(args["--embed-size"])

    # TODO: load train data_augmenter based on args
    data_augmenter = str(args["--data-aug"]).lower()
    print_and_write("Using data augmentation method: %s" % data_augmenter,
                    f_long)
    if data_augmenter == "gaussian":
        data_augmenter = GaussianNoiseDataAugmenter(
            float(args["--data-aug-amount"]), int(args["--data-aug-nx"]))
    elif data_augmenter == "identity":
        data_augmenter = NoisyIdentityDataAugmenter(
            float(args["--data-aug-amount"]), int(args["--data-aug-nx"]))
    elif data_augmenter == "swapdim":
        data_augmenter = EmbedDimensionSwapDataAugmenter(
            int(args["--data-aug-amount"]), int(args["--data-aug-nx"]))
    else:
        data_augmenter = BaseDataAugmenter()

    # perform augmentation
    train_data_aug = data_augmenter.augment(train_data)
    print_and_write(
        "train size: %d, after aug %d" %
        (len(train_data[0]), len(train_data_aug)),
        f_long,
    )

    model = NMT(embed_size=embed_size,
                hidden_size=int(args["--hidden-size"]),
                num_classes=int(args["--num-classes"]),
                dropout_rate=float(args["--dropout"]))
    model.train()

    uniform_init = float(args["--uniform-init"])
    if np.abs(uniform_init) > 0.0:
        print_and_write(
            "uniformly initialize parameters [-%f, +%f]" %
            (uniform_init, uniform_init),
            f_long,
        )
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    device = torch.device("cuda:0" if args["--cuda"] else "cpu")
    print_and_write("use device: %s" % device, f_long)
    model = model.to(device)
    print_and_write("confirming model device %s" % model.device, f_long)

    optimizer = torch.optim.Adam(model.parameters(), lr=float(args["--lr"]))

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = 0
    cum_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print_and_write("begin Maximum Likelihood training", f_long)

    while True:
        epoch += 1

        for sentences, sentiments in batch_iter(train_data_aug,
                                                batch_size=train_batch_size,
                                                shuffle=True):
            train_iter += 1

            optimizer.zero_grad()

            example_losses = -model(sentences, sentiments)  # (batch_size,)
            # use the returned count, in case data augmentation makes the number
            # of returned examples larger than the input batch size
            batch_size = len(example_losses)
            batch_loss = example_losses.sum()
            loss = batch_loss / batch_size

            loss.backward()

            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       clip_grad)

            optimizer.step()

            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                # train_accuracy = model.compute_accuracy(sentences, sentiments)
                print_and_write(
                    "epoch %d, iter %d, avg. loss %.2f, "
                    "cum. examples %d, time elapsed %.2f sec" % (
                        epoch,
                        train_iter,
                        report_loss / report_examples,
                        cum_examples,
                        time.time() - begin_time,
                    ),
                    f_long,
                )
                f_train.write(
                    "%d, %d, %.2f\n" %
                    (epoch, train_iter, report_loss / report_examples))

                train_time = time.time()
                report_loss = report_examples = 0.0

            # perform validation
            if train_iter % valid_niter == 0:
                cum_loss = cum_examples = 0.0
                valid_num += 1

                print_and_write("begin validation ...", f_long)

                # compute dev
                dev_score, dev_accuracy = evaluate_dev(
                    model, dev_data,
                    batch_size=5000)  # dev batch size can be a bit larger
                valid_metric = -dev_score  # maybe use accuracy instead?

                print_and_write(
                    "validation: iter %d, dev. score %f, dev. accuracy %f" %
                    (train_iter, dev_score, dev_accuracy),
                    f_long,
                )
                f_dev.write("%d, %d, %f, %f\n" %
                            (epoch, train_iter, dev_score, dev_accuracy))

                is_better = len(hist_valid_scores
                                ) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                # train_score = evaluate_dev(model, train_data, batch_size=100000)

                if is_better:
                    patience = 0
                    print_and_write(
                        "save currently the best model to [%s]" %
                        model_save_path,
                        f_long,
                    )
                    model.save(model_save_path)

                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(),
                               model_save_path + ".optim")
                elif patience < int(args["--patience"]):
                    patience += 1
                    print_and_write("hit patience %d" % patience, f_long)

                    if patience == int(args["--patience"]):
                        num_trial += 1
                        print_and_write("hit #%d trial" % num_trial, f_long)
                        if num_trial == int(args["--max-num-trial"]):
                            print_and_write("early stop!", f_long)
                            exit(0)

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]["lr"] * float(
                            args["--lr-decay"])
                        print_and_write(
                            "load previously best model and decay learning rate to %f"
                            % lr,
                            f_long,
                        )

                        # load model
                        params = torch.load(
                            model_save_path,
                            map_location=lambda storage, loc: storage)
                        model.load_state_dict(params["state_dict"])
                        model = model.to(device)

                        print_and_write("restore parameters of the optimizers",
                                        f_long)
                        optimizer.load_state_dict(
                            torch.load(model_save_path + ".optim"))

                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group["lr"] = lr

                        # reset patience
                        patience = 0

                if epoch == int(args["--max-epoch"]):
                    print_and_write("reached maximum number of epochs!",
                                    f_long)
                    exit(0)
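

# Note: `print_and_write` is used above for dual console/file logging but is not
# defined in this snippet. A minimal sketch of such a helper (an assumption) could be:
def print_and_write(message, log_file):
    """Print a message to stderr and append it to an open log file (sketch)."""
    print(message, file=sys.stderr)
    log_file.write(message + "\n")
    log_file.flush()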
Example #19
0
def train(args: Dict):
    """ Train the NMT Model.
    :param Dict args: arguments from command line
    """
    train_data_src = read_corpus(args['--train-src'], source='src')
    train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')

    dev_data_src = read_corpus(args['--dev-src'], source='src')
    dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')

    # Lists of (src_sent, tgt_sent) tuples
    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    train_batch_size = int(args['--batch-size'])

    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    vocab = Vocab.load(args['--vocab'])

    model = NMT(embed_size=int(args['--embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                vocab=vocab,
                no_char_decoder=args['--no-char-decoder'])
    model.train()  # Set to train mode

    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' %
              (uniform_init, uniform_init),
              file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)  # Initialize in-place

    # vocab_mask = torch.ones(len(vocab.tgt))
    # vocab_mask[vocab.tgt['<pad>']] = 0

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    print('use device: %s' % device, file=sys.stderr)
    if device.type == 'cuda':
        print('device name: ', torch.cuda.get_device_name(device))
    print('device available: ', torch.cuda.is_available())

    model = model.to(device)  # Send model parameters to the chosen device

    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))

    # Initialize necessary variables
    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0

    # To keep track of previous scores
    hist_valid_scores = []

    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    # Load the previous model parameters if they exist
    if os.path.isfile(model_save_path):
        print('Load previous best model...', file=sys.stderr)
        # load model
        params = torch.load(model_save_path,
                            map_location=lambda storage, loc: storage)
        model.load_state_dict(params['state_dict'])
        model = model.to(device)

        print('restore parameters of the optimizers', file=sys.stderr)
        optimizer.load_state_dict(torch.load(model_save_path + '.optim'))

        # score the loaded model and record it as the current best, otherwise a worse model could overwrite it
        dev_ppl = evaluate_ppl(
            model, dev_data,
            batch_size=64)  # dev batch size can be a bit larger
        valid_metric = -dev_ppl
        hist_valid_scores.append(valid_metric)

    while True:
        epoch += 1
        # Iterate over lazily generated batches (lists) of sentences (each sentence is a list of words)
        for src_sents, tgt_sents in batch_iter(train_data,
                                               batch_size=train_batch_size,
                                               shuffle=True):
            train_iter += 1
            # Zero gradients, otherwise they would be accumulated across batches
            optimizer.zero_grad()

            batch_size = len(src_sents)

            # Calculate the losses for each example in the batch (i.e. forward propagation)
            example_losses = -model(src_sents, tgt_sents)  # Dim: (batch_size,)

            # Average losses over the entire batch
            batch_loss = example_losses.sum()
            loss = batch_loss / batch_size

            # Compute gradients
            loss.backward()

            # Clip gradients
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       clip_grad)

            # Update parameters
            optimizer.step()

            # Get a number from a tensor containing a single scalar
            batch_losses_val = batch_loss.item()

            # Add to `report_loss` (zeroed every `log_every` iterations, for logging)
            report_loss += batch_losses_val

            # Add to `cum_loss` (zeroed every `valid_niter` iterations, for validation)
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(
                len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter,
                                                                                         report_loss / report_examples,
                                                                                         math.exp(report_loss / report_tgt_words),
                                                                                         cum_examples,
                                                                                         report_tgt_words / (time.time() - train_time),
                                                                                         time.time() - begin_time), file=sys.stderr)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # Perform validation
            if train_iter % valid_niter == 0:
                print(
                    'epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d'
                    % (epoch, train_iter, cum_loss / cum_examples,
                       np.exp(cum_loss / cum_tgt_words), cum_examples),
                    file=sys.stderr)

                cum_loss = cum_examples = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl
                dev_ppl = evaluate_ppl(
                    model, dev_data,
                    batch_size=64)  # dev batch size can be a bit larger
                valid_metric = -dev_ppl

                print('validation: iter %d, dev. ppl %f' %
                      (train_iter, dev_ppl),
                      file=sys.stderr)

                # Lower perplexity is better
                is_better = len(hist_valid_scores
                                ) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    print('save currently the best model to [%s]' %
                          model_save_path,
                          file=sys.stderr)
                    model.save(model_save_path)

                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(),
                               model_save_path + '.optim')
                elif patience < int(args['--patience']):
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == int(args['--patience']):
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == int(args['--max-num-trial']):
                            print('early stop!', file=sys.stderr)
                            exit(0)

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * float(
                            args['--lr-decay'])
                        print(
                            'load previously best model and decay learning rate to %f'
                            % lr,
                            file=sys.stderr)

                        # load model
                        params = torch.load(
                            model_save_path,
                            map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers',
                              file=sys.stderr)
                        optimizer.load_state_dict(
                            torch.load(model_save_path + '.optim'))

                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        patience = 0

            if epoch == int(args['--max-epoch']):
                print('reached maximum number of epochs!', file=sys.stderr)
                exit(0)
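

# Note: `batch_iter` (used above to yield shuffled mini-batches) is not shown in this
# snippet. A minimal sketch following the usual convention of yielding batches sorted
# by source-sentence length, longest first (an assumption), might be:
def batch_iter(data, batch_size, shuffle=False):
    """Yield (src_sents, tgt_sents) batches from a list of sentence pairs (sketch)."""
    batch_num = math.ceil(len(data) / batch_size)
    index_array = list(range(len(data)))

    if shuffle:
        np.random.shuffle(index_array)

    for i in range(batch_num):
        indices = index_array[i * batch_size:(i + 1) * batch_size]
        examples = [data[idx] for idx in indices]
        examples = sorted(examples, key=lambda e: len(e[0]), reverse=True)
        src_sents = [e[0] for e in examples]
        tgt_sents = [e[1] for e in examples]
        yield src_sents, tgt_sents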
Example #20
0
def train(model_config,
          data_config,
          output_path,
          device,
          epoch_size,
          max_epoch,
          batch_size,
          repeats,
          decade_rate,
          clip_grad,
          log_every,
          valid_every,
          learning_rate=0.0005):
    print('use device: %s' % device, file=sys.stderr)
    vocab = Vocab.load(data_config["vacab_file"])
    model = NMT(vocab=vocab, **model_config)
    model = model.to(torch.device(device))
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    data_config.pop("vacab_file", None)
    data_loader = DataLoader(**data_config)
    batch_queue, loss_queue = data_loader.load_train_data(
        epoch_size, max_epoch, batch_size, repeats, decade_rate)
    dev_data = data_loader.load_dev_data()

    hist_valid_scores = []
    train_losses = []
    train_iter = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0

    if os.path.isfile(output_path + '/speech-to-text.model'):
        print('loading saved model...')
        params = torch.load(output_path + '/speech-to-text.model',
                            map_location=lambda storage, loc: storage)
        model.load_state_dict(params['state_dict'])
        print('restoring parameters of the optimizers', file=sys.stderr)
        optimizer.load_state_dict(
            torch.load(output_path + '/speech-to-text.optim'))
        dev_ppl = evaluate_ppl(
            model, dev_data,
            batch_size=128)  # dev batch size can be a bit larger
        valid_metric = -dev_ppl
        hist_valid_scores.append(valid_metric)
        print("saved model ppl: ", dev_ppl)

    model.train()

    train_time = begin_time = time.time()
    epoch, voices, tgt_sents = batch_queue.get(True)
    while voices is not None and tgt_sents is not None:
        train_iter += 1
        optimizer.zero_grad()
        # print("received voices:", len(voices))
        # print("tgt_sents[0]:", len(tgt_sents[0]), tgt_sents[0])
        # print("tgt_sents[1]:", len(tgt_sents[1]), tgt_sents[1])
        batch_size = len(voices)
        sample_losses = -model(voices, tgt_sents)
        batch_loss = sample_losses.sum()
        loss = batch_loss / batch_size
        loss.backward()

        # clip gradient
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   clip_grad)
        optimizer.step()

        batch_losses_val = batch_loss.item()
        report_loss += batch_losses_val
        cum_loss += batch_losses_val

        tgt_words_num_to_predict = sum(
            len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
        report_tgt_words += tgt_words_num_to_predict
        cum_tgt_words += tgt_words_num_to_predict
        report_examples += batch_size
        cum_examples += batch_size
        loss_queue.put(report_loss / report_examples)
        train_losses.append({
            'epoch': epoch,
            'iter': train_iter,
            'loss': report_loss / report_examples,
            'ppl': math.exp(report_loss / report_tgt_words),
            'cum': cum_examples,
            'speed': report_tgt_words / (time.time() - train_time)
        })

        if train_iter % log_every == 0:
            print(
                'epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f '
                'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec'
                % (epoch, train_iter, report_loss / report_examples,
                   math.exp(report_loss / report_tgt_words), cum_examples,
                   report_tgt_words /
                   (time.time() - train_time), time.time() - begin_time),
                file=sys.stderr)

            train_time = time.time()
            report_loss = report_tgt_words = report_examples = 0.
        # perform validation
        if train_iter % valid_every == 0:
            print(
                'epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d'
                % (epoch, train_iter, cum_loss / cum_examples,
                   np.exp(cum_loss / cum_tgt_words), cum_examples),
                file=sys.stderr)

            cum_loss = cum_examples = cum_tgt_words = 0.
            valid_num += 1

            print('begin validation ...', file=sys.stderr)

            # compute dev. ppl and bleu
            dev_ppl = evaluate_ppl(
                model, dev_data,
                batch_size=128)  # dev batch size can be a bit larger
            valid_metric = -dev_ppl

            print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl),
                  file=sys.stderr)

            is_better = len(hist_valid_scores
                            ) == 0 or valid_metric > max(hist_valid_scores)
            hist_valid_scores.append(valid_metric)

            if is_better:
                patience = 0
                print('save currently the best model to [%s]' % output_path,
                      file=sys.stderr)
                model.save(output_path + '/speech-to-text.model')
                torch.save(optimizer.state_dict(),
                           output_path + '/speech-to-text.optim')

        epoch, voices, tgt_sents = batch_queue.get(True)
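
# Note: the training loop above runs until the producer feeding `batch_queue` puts a
# sentinel of the form (epoch, None, None). A tiny self-contained illustration of that
# protocol (an assumption about the unseen producer, not the real DataLoader) is:
def _illustrate_sentinel_protocol():
    from queue import Queue
    q = Queue()
    q.put((0, ['voice_a'], [['<s>', 'hello', '</s>']]))  # a normal batch
    q.put((0, None, None))                               # sentinel: ends the loop
    epoch, voices, tgt_sents = q.get(True)
    while voices is not None and tgt_sents is not None:
        print('got batch for epoch', epoch)
        epoch, voices, tgt_sents = q.get(True)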
Example #21
0
def train(train_data,
          dev_data,
          vocab,
          embed_size=256,
          hidden_size=256,
          dropout_rate=0.2,
          uniform_init=0.1,
          device='cpu',
          lr=0.001,
          batch_size=32,
          clip_grad=5.0,
          log_every=10,
          valid_niter=2000,
          save_path='model.bin',
          patience=5,
          lr_decay=0.5,
          max_trials=5,
          max_epochs=30):
    """ Train the NMT model
        Params:
            train_data (list of (src_sent, tgt_sent)): list of tuples containing source and target 
                sentences for training
            dev_data (list of (src_sent, tgt_sent)): list of tuples containing source and target 
                sentences for dev
            vocab (Vocab): Vocab object for source and target
            embed_size (int): Embedding dimensionality. Default = 256
            hidden_size (int): Dimensionality for hidden states. Default = 256
            dropout_rate (float): Dropout probability. Default: 0.2
            uniform_init (float): If > 0: uniformly initialize all parameters
            device (str): device to perform the calc on. Default = 'cpu'
            lr (float): learning rate. Default = 0.001
            batch_size (int): batch size. Default = 32
            clip_grad (float): used in gradient clipping. Default = 5.0
            log_every (int): number of iterations to print stats. Default = 10
            valid_niter (int): number of iterations to perform validation. Default = 2000
            save_path (str): path to save the best model. Default: 'model.bin' in current dir
            patience (int): number of validations without improvement before decaying the learning rate. Default = 5
            lr_decay (float): learning rate decay. Default = 0.5
            max_trials (int): maximum number of learning-rate decays (trials) before training stops. Default = 5
            max_epochs (int): max number of epochs. Default = 30
        Return:
    """
    # Create NMT model and put it in train mode
    model = NMT(embed_size, hidden_size, vocab, dropout_rate)
    model.train()

    # Uniformly initialize model parameters if required
    if np.abs(uniform_init) > 0.:
        print(f'uniformly init parameters [-{uniform_init}, +{uniform_init}]',
              file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    # Create target vocab mask with 0 for 'padding' index and 1 otherwise
    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt['<pad>']] = 0
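    # Note: vocab_mask is built but not referenced again in this function; it would
    # typically be passed to the loss so padding tokens do not contribute.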

    # Set model device
    device = torch.device(device)
    model = model.to(device)
    print(f'Using device: {device}', file=sys.stderr)

    # Choose optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Initializations
    num_trial = 0
    train_iter = 0
    current_patience = 0
    cum_loss = 0
    report_loss = 0
    cum_tgt_words = 0
    report_tgt_words = 0
    cum_examples = 0
    report_examples = 0
    epoch = 0
    valid_num = 0
    hist_valid_scores = []
    train_time = time.time()
    begin_time = time.time()

    print('begin Maximum Likelihood training')

    while True:
        epoch += 1
        # Iterate over the batches in the training data
        for src_sents, tgt_sents in batch_iter(train_data,
                                               batch_size,
                                               shuffle=True):
            train_iter += 1
            optimizer.zero_grad()
            current_batch_size = len(src_sents)

            # Calculate loss and backpropagate
            example_losses = -model(src_sents,
                                    tgt_sents)  # (current_batch_size,)
            batch_loss = example_losses.sum()
            loss = batch_loss / current_batch_size  # average loss
            loss.backward()

            # clip gradient and update parameters
            _ = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)
            optimizer.step()

            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(
                len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += current_batch_size
            cum_examples += current_batch_size

            if train_iter % log_every == 0:
                print(f'epoch {epoch}, iter {train_iter}, ' \
                      f'avg. loss {report_loss / report_examples:.2f}, '\
                      f'avg. ppl {math.exp(report_loss / report_tgt_words):.2f}, ' \
                      f'cum. examples {cum_examples}, ' \
                      f'speed {report_tgt_words / (time.time() - train_time):.2f} words/sec, ' \
                      f'time elapsed {(time.time() - begin_time):.2f} sec', file=sys.stderr)
                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # perform validation
            if train_iter % valid_niter == 0:
                print(f'epoch {epoch}, iter {train_iter}, cum. loss {cum_loss / cum_examples:.2f}, '\
                      f'cum. ppl {np.exp(cum_loss / cum_tgt_words):.2f} cum. examples {cum_examples}',
                      file=sys.stderr)
                cum_loss = cum_examples = cum_tgt_words = 0.
                valid_num += 1
                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl and bleu
                dev_ppl = evaluate_ppl(
                    model, dev_data,
                    batch_size=128)  # dev batch size can be a bit larger
                valid_metric = -dev_ppl

                print(f'validation: iter {train_iter}, dev. ppl {dev_ppl}',
                      file=sys.stderr)

                is_better = len(hist_valid_scores
                                ) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    # save model and optimizer state
                    print(f'save the best model to [{save_path}]',
                          file=sys.stderr)
                    model.save(save_path)
                    torch.save(optimizer.state_dict(), save_path + '.optim')
                    current_patience = 0

                elif current_patience < patience:
                    current_patience += 1
                    print(f'hit patience {current_patience}', file=sys.stderr)

                    if current_patience == patience:
                        num_trial += 1
                        print(f'hit #{num_trial} trial', file=sys.stderr)
                        if num_trial == max_trials:
                            print('early stop!', file=sys.stderr)
                            exit(0)

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * lr_decay
                        print(
                            f'load previously best model and decay learning rate to {lr}',
                            file=sys.stderr)

                        # load model
                        params = torch.load(
                            save_path,
                            map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers',
                              file=sys.stderr)
                        optimizer.load_state_dict(
                            torch.load(save_path + '.optim'))

                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        current_patience = 0

                if epoch == max_epochs:
                    print('reached maximum number of epochs!', file=sys.stderr)
                    exit(0)
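

# A hypothetical way to drive the train() function above (the paths, file names and
# the Vocab.load / read_corpus helpers are assumptions for illustration, not part of
# this example):
if __name__ == '__main__':
    vocab = Vocab.load('vocab.json')
    train_data = list(zip(read_corpus('train.src', source='src'),
                          read_corpus('train.tgt', source='tgt')))
    dev_data = list(zip(read_corpus('dev.src', source='src'),
                        read_corpus('dev.tgt', source='tgt')))
    train(train_data, dev_data, vocab,
          device='cuda:0' if torch.cuda.is_available() else 'cpu',
          batch_size=32, max_epochs=30)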