def sample_ngram_adapt(args):
    src_sents = read_corpus(args.src, 'src')
    tgt_sents = read_corpus(args.tgt, 'src')  # do not read in <s> and </s>
    f_out = open(args.output, 'w')

    vocab = torch.load(args.vocab)
    tgt_vocab = vocab.tgt

    max_len = max([len(tgt_sent) for tgt_sent in tgt_sents]) + 1

    for src_sent, tgt_sent in zip(src_sents, tgt_sents):
        src_sent = ' '.join(src_sent)

        tgt_len = len(tgt_sent)
        tgt_samples = []

        # generate `args.sample_size` samples; the first is the reference itself
        tgt_samples.append(tgt_sent)

        for sid in range(args.sample_size - 1):
            max_n = min(tgt_len - 1, 4)
            bias_n = int(max_n * tgt_len / max_len) + 1
            assert 1 <= bias_n <= 4, 'bias_n={}, not in [1,4], max_n={}, tgt_len={}, max_len={}'.format(
                bias_n, max_n, tgt_len, max_len)

            p = [1.0 / (max_n + 5)] * max_n
            p[bias_n - 1] = 1 - p[0] * (max_n - 1)
            assert abs(sum(p) - 1) < 1e-10, 'sum(p) != 1'

            n = np.random.choice(np.arange(1, int(max_n + 1)), p=p)
            # we do not replace the last token: it must be a period!
            assert n < tgt_len, 'n={}, tgt_len={}'.format(n, tgt_len)
            idx = np.random.randint(tgt_len - n)
            ngram = tgt_sent[idx: idx + n]
            new_ngram = get_new_ngram(ngram, n, tgt_vocab)

            sampled_tgt_sent = list(tgt_sent)
            sampled_tgt_sent[idx: idx + n] = new_ngram

            tgt_samples.append(sampled_tgt_sent)

        # compute bleu scores and rank the samples by bleu scores
        bleu_scores = []
        for tgt_sample in tgt_samples:
            bleu_score = sentence_bleu([tgt_sent], tgt_sample)
            bleu_scores.append(bleu_score)

        tgt_ranks = sorted(range(len(tgt_samples)), key=lambda i: bleu_scores[i], reverse=True)

        # convert list of tokens into a string
        tgt_samples = [' '.join(tgt_sample) for tgt_sample in tgt_samples]

        print('*' * 50, file=f_out)
        print('source: ' + src_sent, file=f_out)
        print('%d samples' % len(tgt_samples), file=f_out)
        for i in tgt_ranks:
            print('%s ||| %f' % (tgt_samples[i], bleu_scores[i]), file=f_out)
        print('*' * 50, file=f_out)

    f_out.close()
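# NOTE: `get_new_ngram` is used by the n-gram samplers in this file but defined elsewhere.
# A minimal sketch of the assumed behavior: replace an n-gram with n words drawn uniformly
# from the target vocabulary, skipping ids 0-2, which the hamming-distance samplers below
# also treat as reserved special tokens (hypothetical implementation):
def get_new_ngram(ngram, n, vocab):
    new_ngram_wids = np.random.randint(3, len(vocab), size=n)  # skip reserved ids 0-2
    return [vocab.id2word[wid] for wid in new_ngram_wids]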
def sample(args):
    train_data_src = read_corpus(args.src_file, source='src')
    train_data_tgt = read_corpus(args.tgt_file, source='tgt')
    train_data = zip(train_data_src, train_data_tgt)

    # load model params
    print('load model from [%s]' % args.model_bin, file=sys.stderr)
    params = torch.load(args.model_bin, map_location=lambda storage, loc: storage)
    vocab = params['vocab']
    opt = params['args']
    state_dict = params['state_dict']

    # build model
    model = NMT(opt, vocab)
    model.load_state_dict(state_dict)
    model.eval()
    model = model.cuda()

    # sampling
    print('begin sampling')

    train_iter = cum_samples = 0
    for src_sents, tgt_sents in data_iter(train_data, batch_size=1):
        train_iter += 1
        samples = model.sample(src_sents, sample_size=5, to_word=True)
        cum_samples += sum(len(sample) for sample in samples)

        for i, tgt_sent in enumerate(tgt_sents):
            print('*' * 80)
            print('target:' + ' '.join(tgt_sent))
            tgt_samples = samples[i]
            print('samples:')
            for sid, sample in enumerate(tgt_samples, 1):
                print('[%d] %s' % (sid, ' '.join(sample[1:-1])))
            print('*' * 80)
def test_data_generator(self, batch_size=64):
    self.dev_src = "data/nmt_iwslt/test.de-en.de"
    self.dev_tgt = "data/nmt_iwslt/test.de-en.en"
    eval_data_src = read_corpus(self.dev_src, source='src')
    eval_data_tgt = read_corpus(self.dev_tgt, source='tgt')
    return data_iter(zip(eval_data_src, eval_data_tgt), batch_size=batch_size)

def train_data_generator(self, batch_size=64):
    self.train_src = "data/nmt_iwslt/train.de-en.de.wmixerprep"
    self.train_tgt = "data/nmt_iwslt/train.de-en.en.wmixerprep"
    train_data_src = read_corpus(self.train_src, source='src')
    train_data_tgt = read_corpus(self.train_tgt, source='tgt')
    return data_iter(zip(train_data_src, train_data_tgt), batch_size=batch_size)
def test(args):
    test_data_src = read_corpus(args.test_src, source='src')
    test_data_tgt = read_corpus(args.test_tgt, source='tgt')
    test_data = list(zip(test_data_src, test_data_tgt))

    if args.load_model:
        print('load model from [%s]' % args.load_model)
        params = torch.load(args.load_model, map_location=lambda storage, loc: storage)
        vocab = params['vocab']
        saved_args = params['args']
        state_dict = params['state_dict']

        model = NMT(saved_args, vocab)
        model.load_state_dict(state_dict)
    else:
        vocab = torch.load(args.vocab)
        model = NMT(args, vocab)

    model.eval()

    if args.cuda:
        # model = nn.DataParallel(model).cuda()
        model = model.cuda()

    hypotheses = decode(model, test_data)
    top_hypotheses = [hyps[0] for hyps in hypotheses]

    bleu_score = get_bleu([tgt for src, tgt in test_data], top_hypotheses)
    word_acc = get_acc([tgt for src, tgt in test_data], top_hypotheses, 'word_acc')
    sent_acc = get_acc([tgt for src, tgt in test_data], top_hypotheses, 'sent_acc')
    print('Corpus Level BLEU: %f, word level acc: %f, sentence level acc: %f' % (
        bleu_score, word_acc, sent_acc), file=sys.stderr)

    if args.save_to_file:
        print('save decoding results to %s' % args.save_to_file)
        with open(args.save_to_file, 'w') as f:
            for hyps in hypotheses:
                f.write(' '.join(hyps[0][1:-1]) + '\n')

        if args.save_nbest:
            nbest_file = args.save_to_file + '.nbest'
            print('save nbest decoding results to %s' % nbest_file)
            with open(nbest_file, 'w') as f:
                for src_sent, tgt_sent, hyps in zip(test_data_src, test_data_tgt, hypotheses):
                    print('Source: %s' % ' '.join(src_sent), file=f)
                    print('Target: %s' % ' '.join(tgt_sent), file=f)
                    print('Hypotheses:', file=f)
                    for i, hyp in enumerate(hyps, 1):
                        print('[%d] %s' % (i, ' '.join(hyp)), file=f)
                    print('*' * 30, file=f)
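# NOTE: `get_acc` (and `get_bleu`) are imported from elsewhere in the repo and not shown in
# this section. A plausible sketch of `get_acc`, assuming sentences may carry <s>/</s>
# markers that should be ignored (hypothetical implementation; only the interface --
# acc_type in {'word_acc', 'sent_acc'} -- is taken from the call sites above):
def _strip_markers(sent):
    return [w for w in sent if w not in ('<s>', '</s>')]

def get_acc(references, hypotheses, acc_type='word_acc'):
    assert acc_type in ('word_acc', 'sent_acc')
    num_correct = total = 0
    for ref, hyp in zip(references, hypotheses):
        ref, hyp = _strip_markers(ref), _strip_markers(hyp)
        if acc_type == 'word_acc':
            num_correct += sum(1 for r, h in zip(ref, hyp) if r == h)
            total += max(len(ref), len(hyp))
        else:  # sent_acc: exact sentence match
            num_correct += int(ref == hyp)
            total += 1
    return num_correct / total if total else 0.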
def sample_ngram(args):
    src_sents = read_corpus(args.src, 'src')
    tgt_sents = read_corpus(args.tgt, 'src')  # do not read in <s> and </s>
    f_out = open(args.output, 'w')

    vocab = torch.load(args.vocab)
    tgt_vocab = vocab.tgt

    for src_sent, tgt_sent in zip(src_sents, tgt_sents):
        src_sent = ' '.join(src_sent)

        tgt_len = len(tgt_sent)
        tgt_samples = []

        # generate `args.sample_size` samples; the first is the reference itself
        tgt_samples.append(tgt_sent)

        for sid in range(args.sample_size - 1):
            n = np.random.randint(1, min(tgt_len, 5))
            # we do not replace the last token: it must be a period!
            idx = np.random.randint(tgt_len - n)
            ngram = tgt_sent[idx: idx + n]
            new_ngram = get_new_ngram(ngram, n, tgt_vocab)

            sampled_tgt_sent = list(tgt_sent)
            sampled_tgt_sent[idx: idx + n] = new_ngram

            tgt_samples.append(sampled_tgt_sent)

        # compute bleu scores and rank the samples by bleu scores
        bleu_scores = []
        for tgt_sample in tgt_samples:
            bleu_score = sentence_bleu([tgt_sent], tgt_sample)
            bleu_scores.append(bleu_score)

        tgt_ranks = sorted(range(len(tgt_samples)), key=lambda i: bleu_scores[i], reverse=True)

        # convert list of tokens into a string
        tgt_samples = [' '.join(tgt_sample) for tgt_sample in tgt_samples]

        print('*' * 50, file=f_out)
        print('source: ' + src_sent, file=f_out)
        print('%d samples' % len(tgt_samples), file=f_out)
        for i in tgt_ranks:
            print('%s ||| %f' % (tgt_samples[i], bleu_scores[i]), file=f_out)
        print('*' * 50, file=f_out)

    f_out.close()
def sample(args):
    train_data_src = read_corpus(args.train_src, source='src')
    train_data_tgt = read_corpus(args.train_tgt, source='tgt')
    train_data = zip(train_data_src, train_data_tgt)

    if args.load_model:
        print('load model from [%s]' % args.load_model)
        params = torch.load(args.load_model, map_location=lambda storage, loc: storage)
        vocab = params['vocab']
        opt = params['args']
        state_dict = params['state_dict']

        model = NMT(opt, vocab)
        model.load_state_dict(state_dict)
    else:
        vocab = torch.load(args.vocab)
        model = NMT(args, vocab)

    model.eval()

    if args.cuda:
        # model = nn.DataParallel(model).cuda()
        model = model.cuda()

    print('begin sampling')

    check_every = 10
    train_iter = cum_samples = 0
    train_time = time.time()
    for src_sents, tgt_sents in data_iter(train_data, batch_size=args.batch_size):
        train_iter += 1
        samples = model.sample(src_sents, sample_size=args.sample_size, to_word=True)
        cum_samples += sum(len(sample) for sample in samples)

        if train_iter % check_every == 0:
            elapsed = time.time() - train_time
            print('sampling speed: %d/s' % (cum_samples / elapsed))
            cum_samples = 0
            train_time = time.time()

        for i, tgt_sent in enumerate(tgt_sents):
            print('*' * 80)
            print('target:' + ' '.join(tgt_sent))
            tgt_samples = samples[i]
            print('samples:')
            for sid, sample in enumerate(tgt_samples, 1):
                print('[%d] %s' % (sid, ' '.join(sample[1:-1])))
            print('*' * 80)
def main():
    ###############################################################################
    # Load data
    ###############################################################################
    d = util.Dictionary()
    if args.task == "train":
        logging.info("Reading train...")
        trncorpus = util.read_corpus(args.ftrn, d, True)
        d.freeze()  # no new word types allowed
        vocab_size = d.size()
        # save dict
        d.save_dict(fprefix + ".dict")
        logging.info("Reading dev...")
        devcorpus = util.read_corpus(args.fdev, d, False)
    elif args.task == "test":
        logging.info("Reading test...")
        d.load_dict(args.fdct)
        d.freeze()
        vocab_size = d.size()
        # load test corpus
        tstcorpus = util.read_corpus(args.ftst, d, False)

    ###############################################################################
    # Build the model
    ###############################################################################
    if args.task == "train":
        model_fname = fprefix + ".model"
        pretrained_model = None
        if args.fmod:
            # load pre-trained model
            pretrained_model = model.load_model(args.fmod, vocab_size, args.nclass,
                                                args.inputdim, args.hiddendim,
                                                args.nlayer, args.droprate)
            logging.info("Successfully loaded pretrained model.")
        trained = model.train(trncorpus, devcorpus, vocab_size, args.nclass,
                              args.inputdim, args.hiddendim, args.nlayer,
                              args.trainer, args.lr, args.droprate, args.niter,
                              args.logfreq, args.verbose, model_fname, pretrained_model)
        dev_accuracy = model.evaluate(trained, devcorpus.docs)
        logging.info("Final Accuracy on dev: %s", dev_accuracy)
        model.save_model(trained, model_fname)
    else:
        trained_model = model.load_model(args.fmod, vocab_size, args.nclass,
                                         args.inputdim, args.hiddendim,
                                         args.nlayer, args.droprate)
        tst_accuracy = model.evaluate(trained_model, tstcorpus.docs)
        logging.info("Final Accuracy on test: %s", tst_accuracy)
def sample_from_hamming_distance_payoff_distribution(args):
    src_sents = read_corpus(args.src, 'src')
    tgt_sents = read_corpus(args.tgt, 'src')  # do not read in <s> and </s>
    f_out = open(args.output, 'w')

    vocab = torch.load(args.vocab)
    tgt_vocab = vocab.tgt

    payoff_prob, Z_qs = generate_hamming_distance_payoff_distribution(
        max(len(sent) for sent in tgt_sents),
        vocab_size=len(vocab.tgt),
        tau=args.temp)

    for src_sent, tgt_sent in zip(src_sents, tgt_sents):
        tgt_samples = []  # make sure the ground truth y* is in the samples
        tgt_sent_len = len(tgt_sent) - 3  # remove <s>, </s> and the ending period
        tgt_ref_tokens = tgt_sent[1:-1]
        bleu_scores = []

        # sample edit distances
        e_samples = np.random.choice(range(tgt_sent_len + 1),
                                     p=payoff_prob[tgt_sent_len],
                                     size=args.sample_size,
                                     replace=True)

        for i, e in enumerate(e_samples):
            if e > 0:
                # sample a new tgt_sent $y$
                old_word_pos = np.random.choice(range(1, tgt_sent_len + 1), size=e, replace=False)
                new_words = [vocab.tgt.id2word[wid]
                             for wid in np.random.randint(3, len(vocab.tgt), size=e)]
                new_tgt_sent = list(tgt_sent)
                for pos, word in zip(old_word_pos, new_words):
                    new_tgt_sent[pos] = word

                bleu_score = sentence_bleu([tgt_ref_tokens], new_tgt_sent[1:-1])
                bleu_scores.append(bleu_score)
            else:
                new_tgt_sent = list(tgt_sent)
                bleu_scores.append(1.)

            # print('y: %s' % ' '.join(new_tgt_sent))
            tgt_samples.append(new_tgt_sent)
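# NOTE: `generate_hamming_distance_payoff_distribution` is called above and in `train_raml`
# below but not defined in this section. A sketch consistent with its call sites (returns
# per-sentence-length distributions over edit distance plus their normalizers), assuming
# the RAML payoff distribution q(e | y*, L) ~ C(L, e) * (V - 1)^e * exp(-e / tau) from
# Norouzi et al. (2016); the exact rescaling used in the original code may differ:
import math
from scipy.special import comb

def generate_hamming_distance_payoff_distribution(max_sent_len, vocab_size, tau=1.):
    """Hypothetical sketch: distribution over Hamming distances e for each sentence length."""
    probs, Z_qs = dict(), dict()
    for sent_len in range(1, max_sent_len + 1):
        counts = [1.]  # e = 0: only the ground truth itself
        for e in range(1, sent_len + 1):
            counts.append(comb(sent_len, e) * ((vocab_size - 1) ** e) * math.exp(-e / tau))
        Z_qs[sent_len] = sum(counts)
        probs[sent_len] = [c / Z_qs[sent_len] for c in counts]
    return probs, Z_qs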
def load_data(path, split, suffix, skip_gap, feature_path, is_test=False):
    print('load data from {}'.format(path))
    if path is None:
        return None
    slist = suffix.split(' ')  # src mt src-mt.alignments tags pe ref src_tags features
    src_sents = read_corpus(path + '/%s.%s' % (split, slist[0]))
    hyp_sents_orig = read_corpus(path + '/%s.%s' % (split, slist[1]), lowercase=False)
    hyp_sents = [[w.lower() for w in hyp] for hyp in hyp_sents_orig]
    align_sents = read_alignment_matrix(path + '/%s.%s' % (split, slist[2]), src_sents, hyp_sents)
    if is_test:
        tag_sents = [[1] * len(hyp) for hyp in hyp_sents]
    else:
        tag_sents = read_tags(path + '/%s.%s' % (split, slist[3]), skip_gap)
    baseline_sents = read_baseline_features(feature_path + '/%s.%s' % (split, slist[7]))
    # baseline_sents = [[0 for h in hyp] for hyp in hyp_sents]
    return list(zip(src_sents, hyp_sents, align_sents, tag_sents, baseline_sents, hyp_sents_orig))
def train_raml(args):
    tau = args.temp

    train_data_src = read_corpus(args.train_src, source='src')
    train_data_tgt = read_corpus(args.train_tgt, source='tgt')
    train_data = list(zip(train_data_src, train_data_tgt))

    dev_data_src = read_corpus(args.dev_src, source='src')
    dev_data_tgt = read_corpus(args.dev_tgt, source='tgt')
    dev_data = list(zip(dev_data_src, dev_data_tgt))
    dev_data = dev_data[:args.dev_limit]

    vocab, model, optimizer, nll_loss, cross_entropy_loss = init_training(args)

    if args.raml_sample_mode == 'pre_sample':
        # dict of (src, [tgt: (sent, prob)])
        print('read in raml training data...', file=sys.stderr, end='')
        begin_time = time.time()
        raml_samples = read_raml_train_data(args.raml_sample_file, temp=tau)
        print('done[%d s].' % (time.time() - begin_time))
    elif args.raml_sample_mode.startswith('hamming_distance'):
        print('sample from hamming distance payoff distribution')
        payoff_prob, Z_qs = generate_hamming_distance_payoff_distribution(
            max(len(sent) for sent in train_data_tgt),
            vocab_size=len(vocab.tgt) - 3,
            tau=tau)

    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    report_weighted_loss = cum_weighted_loss = 0
    cum_examples = cum_batches = report_examples = epoch = valid_num = best_model_iter = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()

    _info = f"""
    begin RAML training
    - training: {len(train_data)} pairs, log: {args.train_log_file}
    - validation: {len(dev_data)} pairs, every {args.valid_niter} iterations, log: {args.validation_log_file}
    - batch size: {args.batch_size}
    - 1 epoch = {len(train_data)} pairs = {int(len(train_data) / args.batch_size)} iterations
    """
    print(_info)
    log_data = {'args': args}
    if args.notify_slack:
        slack.post(f"""
        {_info}
        {args}
        """)

    # smoothing function for BLEU
    sm_func = None
    if args.smooth_bleu:
        sm_func = SmoothingFunction().method3

    with open(args.train_log_file, "w") as train_output, \
            open(args.validation_log_file, "w") as validation_output:
        while True:
            epoch += 1
            for src_sents, tgt_sents in data_iter(train_data, batch_size=args.batch_size):
                train_iter += 1

                raml_src_sents = []
                raml_tgt_sents = []
                raml_tgt_weights = []

                if args.raml_sample_mode == 'pre_sample':
                    for src_sent in src_sents:
                        sent = ' '.join(src_sent)
                        tgt_samples_all = raml_samples[sent]

                        if args.sample_size >= len(tgt_samples_all):
                            tgt_samples = tgt_samples_all
                        else:
                            tgt_samples_id = np.random.choice(range(1, len(tgt_samples_all)),
                                                              size=args.sample_size - 1,
                                                              replace=False)
                            # make sure the ground truth y* is in the samples
                            tgt_samples = [tgt_samples_all[0]] + [tgt_samples_all[i] for i in tgt_samples_id]

                        raml_src_sents.extend([src_sent] * len(tgt_samples))
                        raml_tgt_sents.extend([['<s>'] + sent.split(' ') + ['</s>']
                                               for sent, weight in tgt_samples])
                        raml_tgt_weights.extend([weight for sent, weight in tgt_samples])
                elif args.raml_sample_mode in ['hamming_distance', 'hamming_distance_impt_sample']:
                    for src_sent, tgt_sent in zip(src_sents, tgt_sents):
                        tgt_samples = []  # make sure the ground truth y* is in the samples
                        tgt_sent_len = len(tgt_sent) - 3  # remove <s>, </s> and the ending period
                        tgt_ref_tokens = tgt_sent[1:-1]
                        bleu_scores = []
                        # print('y*: %s' % ' '.join(tgt_sent))

                        # sample edit distances
                        e_samples = np.random.choice(range(tgt_sent_len + 1),
                                                     p=payoff_prob[tgt_sent_len],
                                                     size=args.sample_size,
                                                     replace=True)

                        # make sure the ground truth y* is in the samples
                        if args.raml_bias_groundtruth and (0 not in e_samples):
                            e_samples[0] = 0

                        for i, e in enumerate(e_samples):
                            if e > 0:
                                # sample a new tgt_sent $y$
                                old_word_pos = np.random.choice(range(1, tgt_sent_len + 1),
                                                                size=e, replace=False)
                                new_words = [vocab.tgt.id2word[wid]
                                             for wid in np.random.randint(3, len(vocab.tgt), size=e)]
                                new_tgt_sent = list(tgt_sent)
                                for pos, word in zip(old_word_pos, new_words):
                                    new_tgt_sent[pos] = word
                            else:
                                new_tgt_sent = list(tgt_sent)

                            # if importance sampling is enabled, compute the bleu score
                            if args.raml_sample_mode == 'hamming_distance_impt_sample':
                                if e > 0:
                                    # remove <s> and </s>
                                    bleu_score = sentence_bleu([tgt_ref_tokens],
                                                               new_tgt_sent[1:-1],
                                                               smoothing_function=sm_func)
                                    bleu_scores.append(bleu_score)
                                else:
                                    bleu_scores.append(1.)

                            # print('y: %s' % ' '.join(new_tgt_sent))
                            tgt_samples.append(new_tgt_sent)

                        # if importance sampling is enabled, compute importance weights
                        if args.raml_sample_mode == 'hamming_distance_impt_sample':
                            tgt_sample_weights = [math.exp(bleu_score / tau) / math.exp(-e / tau)
                                                  for e, bleu_score in zip(e_samples, bleu_scores)]
                            normalizer = sum(tgt_sample_weights)
                            tgt_sample_weights = [w / normalizer for w in tgt_sample_weights]
                        else:
                            tgt_sample_weights = [1.] * args.sample_size

                        raml_src_sents.extend([src_sent] * len(tgt_samples))
                        raml_tgt_sents.extend(tgt_samples)
                        raml_tgt_weights.extend(tgt_sample_weights)

                        if args.debug:
                            print('*' * 30)
                            print('Target: %s' % ' '.join(tgt_sent))
                            for tgt_sample, e, bleu_score, weight in zip(
                                    tgt_samples, e_samples, bleu_scores, tgt_sample_weights):
                                print('Sample: %s ||| e: %d ||| bleu: %f ||| weight: %f' % (
                                    ' '.join(tgt_sample), e, bleu_score, weight))
                            print()
                            break

                src_sents_var = to_input_variable(raml_src_sents, vocab.src, cuda=args.cuda)
                tgt_sents_var = to_input_variable(raml_tgt_sents, vocab.tgt, cuda=args.cuda)
                weights_var = Variable(torch.FloatTensor(raml_tgt_weights), requires_grad=False)
                if args.cuda:
                    weights_var = weights_var.cuda()

                batch_size = len(raml_src_sents)  # batch_size = args.batch_size * args.sample_size
                src_sents_len = [len(s) for s in raml_src_sents]
                pred_tgt_word_num = sum(len(s[1:]) for s in raml_tgt_sents)  # omitting leading `<s>`

                optimizer.zero_grad()

                # (tgt_sent_len, batch_size, tgt_vocab_size)
                scores = model(src_sents_var, src_sents_len, tgt_sents_var[:-1])

                # (tgt_sent_len * batch_size, tgt_vocab_size)
                log_scores = F.log_softmax(scores.view(-1, scores.size(2)), dim=-1)
                # remove leading <s> in tgt sent, which is not used as the target
                flattened_tgt_sents = tgt_sents_var[1:].view(-1)
                # batch_size * tgt_sent_len
                tgt_log_scores = torch.gather(log_scores, 1,
                                              flattened_tgt_sents.unsqueeze(1)).squeeze(1)
                unweighted_loss = -tgt_log_scores * (1. - torch.eq(flattened_tgt_sents, 0).float())
                weighted_loss = unweighted_loss * weights_var.repeat(scores.size(0))
                weighted_loss = weighted_loss.sum()
                weighted_loss_val = weighted_loss.item()
                nll_loss_val = unweighted_loss.sum().item()
                # weighted_log_scores = log_scores * weights.view(-1, scores.size(2))
                # weighted_loss = nll_loss(weighted_log_scores, flattened_tgt_sents)

                loss = weighted_loss / batch_size
                # nll_loss_val = nll_loss(log_scores, flattened_tgt_sents).item()

                loss.backward()
                # clip gradient
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad)
                optimizer.step()

                report_weighted_loss += weighted_loss_val
                cum_weighted_loss += weighted_loss_val
                report_loss += nll_loss_val
                cum_loss += nll_loss_val
                report_tgt_words += pred_tgt_word_num
                cum_tgt_words += pred_tgt_word_num
                report_examples += batch_size
                cum_examples += batch_size
                cum_batches += batch_size

                if train_iter % args.log_every == 0 or train_iter % args.notify_slack_every == 0:
                    _log = 'epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
                           'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (
                               epoch, train_iter,
                               report_weighted_loss / report_examples,
                               np.exp(report_loss / report_tgt_words),
                               cum_examples,
                               report_tgt_words / (time.time() - train_time),
                               time.time() - begin_time)
                    print(_log)
                    print(_log, file=train_output)
                    _list_dict_update(log_data, {
                        'epoch': epoch,
                        'train_iter': train_iter,
                        'loss': report_weighted_loss / report_examples,
                        'ppl': np.exp(report_loss / report_tgt_words),
                        'examples': cum_examples,
                        'speed': report_tgt_words / (time.time() - train_time),
                        'elapsed': time.time() - begin_time
                    }, 'train')

                    train_time = time.time()
                    report_loss = report_weighted_loss = report_tgt_words = report_examples = 0.

                    if train_iter % args.notify_slack_every == 0 and args.notify_slack:
                        print('post slack')
                        slack.post(_log)

                # perform validation
                if train_iter % args.valid_niter == 0:
                    print('epoch %d, iter %d, cum. loss %.2f, '
                          'cum. ppl %.2f cum. examples %d' % (
                              epoch, train_iter,
                              cum_weighted_loss / cum_batches,
                              np.exp(cum_loss / cum_tgt_words),
                              cum_examples),
                          file=sys.stderr)

                    cum_loss = cum_weighted_loss = cum_batches = cum_tgt_words = 0.
                    valid_num += 1

                    print('begin validation ...')
                    model.eval()

                    # compute dev. ppl and bleu
                    dev_loss = evaluate_loss(model, dev_data, cross_entropy_loss)
                    dev_ppl = np.exp(dev_loss)

                    if args.valid_metric in ['bleu', 'word_acc', 'sent_acc']:
                        dev_hyps = decode(model, dev_data, f=validation_output, verbose=False)
                        dev_hyps = [hyps[0] for hyps in dev_hyps]
                        if args.valid_metric == 'bleu':
                            valid_metric = get_bleu([tgt for src, tgt in dev_data], dev_hyps)
                        else:
                            valid_metric = get_acc([tgt for src, tgt in dev_data], dev_hyps,
                                                   acc_type=args.valid_metric)
                        _log = 'validation: iter %d, dev. ppl %f, dev. %s %f' % (
                            train_iter, dev_ppl, args.valid_metric, valid_metric)
                        print(_log)
                        print(_log, file=validation_output)
                        if args.notify_slack:
                            slack.post(_log)
                    else:
                        valid_metric = -dev_ppl
                        print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl),
                              file=sys.stderr)

                    if 'dev_data' not in log_data:  # store the dev set once
                        log_data['dev_data'] = dev_data
                    _list_dict_update(log_data, {
                        'epoch': epoch,
                        'train_iter': train_iter,
                        'loss': dev_loss,
                        'ppl': dev_ppl,
                        args.valid_metric: valid_metric,
                        'hyps': dev_hyps,
                    }, 'validation', is_save=True)

                    model.train()

                    is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
                    is_better_than_last = len(hist_valid_scores) == 0 or valid_metric > hist_valid_scores[-1]
                    hist_valid_scores.append(valid_metric)

                    if valid_num > args.save_model_after:
                        model_file = args.save_to + '.iter%d.bin' % train_iter
                        print('save model to [%s]' % model_file)
                        model.save(model_file)

                    if (not is_better_than_last) and args.lr_decay:
                        lr = optimizer.param_groups[0]['lr'] * args.lr_decay
                        print('decay learning rate to %f' % lr)
                        optimizer.param_groups[0]['lr'] = lr

                    if is_better:
                        patience = 0
                        best_model_iter = train_iter

                        if valid_num > args.save_model_after:
                            print('save currently the best model ..')
                            model_file_abs_path = os.path.abspath(model_file)
                            symlin_file_abs_path = os.path.abspath(args.save_to + '.bin')
                            os.system('ln -sf %s %s' % (model_file_abs_path, symlin_file_abs_path))
                    else:
                        patience += 1
                        print('hit patience %d' % patience)

                        if patience == args.patience:
                            _log = f"""
                            hit patience {patience}
                            early stop!
                            the best model is from iteration [{best_model_iter}]
                            """
                            print(_log)
                            if args.notify_slack:
                                slack.post(_log)
                            exit(0)

            if args.debug:
                print(f'debug epoch:{epoch} exit')
                model_file = args.save_to + '.bin'
                print('save model to [%s]' % model_file)
                model.save(model_file)
                exit(0)
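# NOTE: `_list_dict_update` is called by `train_raml` above but not defined in this section.
# A sketch matching its call pattern: append a metrics dict to a per-phase list inside
# `log_data`, optionally persisting the whole log; the pickle destination is an assumption:
import pickle

def _list_dict_update(log_data, entry, phase, is_save=False):
    log_data.setdefault(phase, []).append(entry)
    if is_save:
        with open('log_data.pkl', 'wb') as f:  # hypothetical output path
            pickle.dump(log_data, f)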
def train(args):
    train_data_src = read_corpus(args.train_src, source='src')
    train_data_tgt = read_corpus(args.train_tgt, source='tgt')

    dev_data_src = read_corpus(args.dev_src, source='src')
    dev_data_tgt = read_corpus(args.dev_tgt, source='tgt')

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))
    dev_data = dev_data[:args.dev_limit]

    vocab, model, optimizer, nll_loss, cross_entropy_loss = init_training(args)

    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = cum_batches = report_examples = epoch = valid_num = best_model_iter = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()

    _info = f"""
    begin Maximum Likelihood training
    - training: {len(train_data)} pairs, log: {args.train_log_file}
    - validation: {len(dev_data)} pairs, every {args.valid_niter} iterations, log: {args.validation_log_file}
    - batch size: {args.batch_size}
    - 1 epoch = {len(train_data)} pairs = {int(len(train_data) / args.batch_size)} iterations
    """
    print(_info)
    if args.notify_slack:
        slack.post(f"""
        {_info}
        {args}
        """)

    with open(args.train_log_file, "w") as train_output, \
            open(args.validation_log_file, "w") as validation_output:
        while True:
            epoch += 1
            for src_sents, tgt_sents in data_iter(train_data, batch_size=args.batch_size):
                train_iter += 1

                src_sents_var = to_input_variable(src_sents, vocab.src, cuda=args.cuda)
                tgt_sents_var = to_input_variable(tgt_sents, vocab.tgt, cuda=args.cuda)

                batch_size = len(src_sents)
                src_sents_len = [len(s) for s in src_sents]
                pred_tgt_word_num = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`

                optimizer.zero_grad()

                # (tgt_sent_len, batch_size, tgt_vocab_size)
                scores = model(src_sents_var, src_sents_len, tgt_sents_var[:-1])
                word_loss = cross_entropy_loss(scores.view(-1, scores.size(2)),
                                               tgt_sents_var[1:].view(-1))
                loss = word_loss / batch_size
                word_loss_val = word_loss.item()
                loss_val = loss.item()

                loss.backward()
                # clip gradient
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad)
                optimizer.step()

                report_loss += word_loss_val
                cum_loss += word_loss_val
                report_tgt_words += pred_tgt_word_num
                cum_tgt_words += pred_tgt_word_num
                report_examples += batch_size
                cum_examples += batch_size
                cum_batches += batch_size

                if train_iter % args.log_every == 0:
                    _log = 'epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
                           'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (
                               epoch, train_iter,
                               report_loss / report_examples,
                               np.exp(report_loss / report_tgt_words),
                               cum_examples,
                               report_tgt_words / (time.time() - train_time),
                               time.time() - begin_time)
                    print(_log)
                    print(_log, file=train_output)

                    train_time = time.time()
                    report_loss = report_tgt_words = report_examples = 0.

                # perform validation
                if train_iter % args.valid_niter == 0:
                    print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' % (
                        epoch, train_iter,
                        cum_loss / cum_batches,
                        np.exp(cum_loss / cum_tgt_words),
                        cum_examples),
                          file=sys.stderr)

                    cum_loss = cum_batches = cum_tgt_words = 0.
                    valid_num += 1

                    print('begin validation ...', file=sys.stderr)
                    model.eval()

                    # compute dev. ppl and bleu
                    dev_loss = evaluate_loss(model, dev_data, cross_entropy_loss)
                    dev_ppl = np.exp(dev_loss)

                    if args.valid_metric in ['bleu', 'word_acc', 'sent_acc']:
                        dev_hyps = decode(model, dev_data)
                        dev_hyps = [hyps[0] for hyps in dev_hyps]
                        if args.valid_metric == 'bleu':
                            valid_metric = get_bleu([tgt for src, tgt in dev_data], dev_hyps)
                        else:
                            valid_metric = get_acc([tgt for src, tgt in dev_data], dev_hyps,
                                                   acc_type=args.valid_metric)
                        _log = 'validation: iter %d, dev. ppl %f, dev. %s %f' % (
                            train_iter, dev_ppl, args.valid_metric, valid_metric)
                        print(_log, file=sys.stderr)
                        print(_log, file=validation_output)
                        if args.notify_slack:
                            slack.post(_log)
                    else:
                        valid_metric = -dev_ppl
                        print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl),
                              file=sys.stderr)

                    model.train()

                    is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
                    is_better_than_last = len(hist_valid_scores) == 0 or valid_metric > hist_valid_scores[-1]
                    hist_valid_scores.append(valid_metric)

                    if valid_num > args.save_model_after:
                        model_file = args.save_to + '.iter%d.bin' % train_iter
                        print('save model to [%s]' % model_file, file=sys.stderr)
                        model.save(model_file)

                    if (not is_better_than_last) and args.lr_decay:
                        lr = optimizer.param_groups[0]['lr'] * args.lr_decay
                        print('decay learning rate to %f' % lr, file=sys.stderr)
                        optimizer.param_groups[0]['lr'] = lr

                    if is_better:
                        patience = 0
                        best_model_iter = train_iter

                        if valid_num > args.save_model_after:
                            print('save currently the best model ..', file=sys.stderr)
                            model_file_abs_path = os.path.abspath(model_file)
                            symlin_file_abs_path = os.path.abspath(args.save_to + '.bin')
                            os.system('ln -sf %s %s' % (model_file_abs_path, symlin_file_abs_path))
                    else:
                        patience += 1
                        print('hit patience %d' % patience, file=sys.stderr)

                        if patience == args.patience:
                            print('early stop!', file=sys.stderr)
                            print('the best model is from iteration [%d]' % best_model_iter,
                                  file=sys.stderr)
                            exit(0)
def compute_lm_prob(args):
    """
    given source-target sentence pairs, compute ppl and log-likelihood
    """
    test_data_src = read_corpus(args.test_src, source='src')
    test_data_tgt = read_corpus(args.test_tgt, source='tgt')
    test_data = zip(test_data_src, test_data_tgt)

    if args.load_model:
        print('load model from [%s]' % args.load_model)
        params = torch.load(args.load_model, map_location=lambda storage, loc: storage)
        vocab = params['vocab']
        saved_args = params['args']
        state_dict = params['state_dict']

        model = NMT(saved_args, vocab)
        model.load_state_dict(state_dict)
    else:
        vocab = torch.load(args.vocab)
        model = NMT(args, vocab)

    model.eval()

    if args.cuda:
        # model = nn.DataParallel(model).cuda()
        model = model.cuda()

    f = open(args.save_to_file, 'w')
    for src_sent, tgt_sent in test_data:
        src_sents = [src_sent]
        tgt_sents = [tgt_sent]

        batch_size = len(src_sents)
        src_sents_len = [len(s) for s in src_sents]
        pred_tgt_word_nums = [len(s[1:]) for s in tgt_sents]  # omitting leading `<s>`

        # (sent_len, batch_size)
        src_sents_var = to_input_variable(src_sents, model.vocab.src, cuda=args.cuda, is_test=True)
        tgt_sents_var = to_input_variable(tgt_sents, model.vocab.tgt, cuda=args.cuda, is_test=True)

        # (tgt_sent_len, batch_size, tgt_vocab_size)
        scores = model(src_sents_var, src_sents_len, tgt_sents_var[:-1])

        # (tgt_sent_len * batch_size, tgt_vocab_size)
        log_scores = F.log_softmax(scores.view(-1, scores.size(2)), dim=-1)
        # remove leading <s> in tgt sent, which is not used as the target
        # (batch_size * tgt_sent_len)
        flattened_tgt_sents = tgt_sents_var[1:].view(-1)
        # (batch_size * tgt_sent_len)
        tgt_log_scores = torch.gather(log_scores, 1, flattened_tgt_sents.unsqueeze(1)).squeeze(1)
        # 0-index is the <pad> symbol
        tgt_log_scores = tgt_log_scores * (1. - torch.eq(flattened_tgt_sents, 0).float())
        # (tgt_sent_len, batch_size)
        tgt_log_scores = tgt_log_scores.view(-1, batch_size)  # .permute(1, 0)
        # (batch_size,); do not squeeze here, or a batch of one collapses to a 0-dim tensor
        tgt_sent_scores = tgt_log_scores.sum(dim=0)
        tgt_sent_word_scores = [tgt_sent_scores[i].item() / pred_tgt_word_nums[i]
                                for i in range(batch_size)]

        for src_sent, tgt_sent, score in zip(src_sents, tgt_sents, tgt_sent_word_scores):
            f.write('%s ||| %s ||| %f\n' % (' '.join(src_sent), ' '.join(tgt_sent), score))

    f.close()
parser.add_argument('--train_src', type=str,
                    help='path to the source side of the training sentences')
parser.add_argument('--train_trg', type=str,
                    help='path to the target side of the training sentences')
parser.add_argument('--output', default='vocab.bin', type=str,
                    help='output vocabulary file')
parser.add_argument('--share_vocab', action='store_true', default=False)

args = parser.parse_args()

print('read in source sentences: %s' % args.train_src)
print('read in target sentences: %s' % args.train_trg)

src_sents = read_corpus(args.train_src, source='src')
trg_sents = read_corpus(args.train_trg, source='src')

vocab = Vocab(src_sents, trg_sents, args.src_vocab_size, args.trg_vocab_size,
              remove_singleton=not args.include_singleton, share_vocab=args.share_vocab)
print('generated vocabulary, source %d words, target %d words' % (len(vocab.src), len(vocab.trg)))

torch.save(vocab, args.output)
print('vocabulary saved to %s' % args.output)
parser.add_argument('--train_src', type=str, required=True,
                    help='file of source sentences')
parser.add_argument('--train_tgt', type=str, required=True,
                    help='file of target sentences')
parser.add_argument('--output', default='vocab.bin', type=str,
                    help='output vocabulary file')

args = parser.parse_args()

print('read in source sentences: %s' % args.train_src)
print('read in target sentences: %s' % args.train_tgt)

src_sents = read_corpus(args.train_src, source='src')
tgt_sents = read_corpus(args.train_tgt, source='tgt')

vocab = Vocab(src_sents, tgt_sents, args.src_vocab_size, args.tgt_vocab_size,
              remove_singleton=not args.include_singleton)
print('generated vocabulary, source %d words, target %d words' % (len(vocab.src), len(vocab.tgt)))

torch.save(vocab, args.output)
print('vocabulary saved to %s' % args.output)
def train_raml(args):
    vocab = torch.load(args.vocab)

    train_data_src = read_corpus(args.train_src, source='src')
    train_data_tgt = read_corpus(args.train_tgt, source='tgt')
    train_data = zip(train_data_src, train_data_tgt)

    dev_data_src = read_corpus(args.dev_src, source='src')
    dev_data_tgt = read_corpus(args.dev_tgt, source='tgt')
    dev_data = zip(dev_data_src, dev_data_tgt)

    # dict of (src, [tgt: (sent, prob)])
    print('read in raml training data...', file=sys.stderr, end='')
    begin_time = time.time()
    raml_samples = read_raml_train_data(args.raml_sample_file, temp=args.temp)
    print('done[%d s].' % (time.time() - begin_time))

    vocab, model, optimizer, nll_loss, cross_entropy_loss = init_training(args)

    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    report_weighted_loss = cum_weighted_loss = 0
    cum_examples = cum_batches = report_examples = epoch = valid_num = best_model_iter = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin RAML training')

    while True:
        epoch += 1
        for src_sents, tgt_sents in data_iter(train_data, batch_size=args.batch_size):
            train_iter += 1

            raml_src_sents = []
            raml_tgt_sents = []
            raml_tgt_weights = []

            for src_sent in src_sents:
                tgt_samples_all = raml_samples[' '.join(src_sent)]

                if args.sample_size >= len(tgt_samples_all):
                    tgt_samples = tgt_samples_all
                else:
                    tgt_samples_id = np.random.choice(range(1, len(tgt_samples_all)),
                                                      size=args.sample_size - 1, replace=False)
                    # make sure the ground truth y* is in the samples
                    tgt_samples = [tgt_samples_all[0]] + [tgt_samples_all[i] for i in tgt_samples_id]

                raml_src_sents.extend([src_sent] * len(tgt_samples))
                raml_tgt_sents.extend([['<s>'] + sent.split(' ') + ['</s>']
                                       for sent, weight in tgt_samples])
                raml_tgt_weights.extend([weight for sent, weight in tgt_samples])

            src_sents_var = to_input_variable(raml_src_sents, vocab.src, cuda=args.cuda)
            tgt_sents_var = to_input_variable(raml_tgt_sents, vocab.tgt, cuda=args.cuda)
            weights_var = Variable(torch.FloatTensor(raml_tgt_weights), requires_grad=False)
            if args.cuda:
                weights_var = weights_var.cuda()

            batch_size = len(raml_src_sents)  # batch_size = args.batch_size * args.sample_size
            src_sents_len = [len(s) for s in raml_src_sents]
            pred_tgt_word_num = sum(len(s[1:]) for s in raml_tgt_sents)  # omitting leading `<s>`

            optimizer.zero_grad()

            # (tgt_sent_len, batch_size, tgt_vocab_size)
            scores = model(src_sents_var, src_sents_len, tgt_sents_var[:-1])
            log_scores = F.log_softmax(scores.view(-1, scores.size(2)))
            # weights = weights_var.view(1, weights_var.size(0), 1).expand_as(scores).contiguous()
            flattened_tgt_sents = tgt_sents_var[1:].view(-1)
            # batch_size * tgt_sent_len
            tgt_log_scores = torch.gather(log_scores, 1, flattened_tgt_sents.unsqueeze(1)).squeeze(1)
            unweighted_loss = -tgt_log_scores * (1. - torch.eq(flattened_tgt_sents, 0).float())
            weighted_loss = unweighted_loss * weights_var.repeat(scores.size(0))
            weighted_loss = weighted_loss.sum()
            weighted_loss_val = weighted_loss.data[0]
            nll_loss_val = unweighted_loss.sum().data[0]
            # weighted_log_scores = log_scores * weights.view(-1, scores.size(2))
            # weighted_loss = nll_loss(weighted_log_scores, flattened_tgt_sents)

            loss = weighted_loss / batch_size
            # nll_loss_val = nll_loss(log_scores, flattened_tgt_sents).data[0]

            loss.backward()
            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm(model.parameters(), args.clip_grad)
            optimizer.step()

            report_weighted_loss += weighted_loss_val
            cum_weighted_loss += weighted_loss_val
            report_loss += nll_loss_val
            cum_loss += nll_loss_val
            report_tgt_words += pred_tgt_word_num
            cum_tgt_words += pred_tgt_word_num
            report_examples += batch_size
            cum_examples += batch_size
            cum_batches += batch_size

            if train_iter % args.log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, '
                      'avg. ppl %.2f cum. examples %d, '
                      'speed %.2f words/sec, time elapsed %.2f sec' % (
                          epoch, train_iter,
                          report_weighted_loss / report_examples,
                          np.exp(report_loss / report_tgt_words),
                          cum_examples,
                          report_tgt_words / (time.time() - train_time),
                          time.time() - begin_time),
                      file=sys.stderr)

                train_time = time.time()
                report_loss = report_weighted_loss = report_tgt_words = report_examples = 0.

            # perform validation
            if train_iter % args.valid_niter == 0:
                print('epoch %d, iter %d, cum. loss %.2f, '
                      'cum. ppl %.2f cum. examples %d' % (
                          epoch, train_iter,
                          cum_weighted_loss / cum_batches,
                          np.exp(cum_loss / cum_tgt_words),
                          cum_examples),
                      file=sys.stderr)

                cum_loss = cum_weighted_loss = cum_batches = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)
                model.eval()

                # compute dev. ppl and bleu
                dev_loss = evaluate_loss(model, dev_data, cross_entropy_loss)
                dev_ppl = np.exp(dev_loss)

                if args.valid_metric in ['bleu', 'word_acc', 'sent_acc']:
                    dev_hyps = decode(model, dev_data)
                    dev_hyps = [hyps[0] for hyps in dev_hyps]
                    if args.valid_metric == 'bleu':
                        valid_metric = get_bleu([tgt for src, tgt in dev_data], dev_hyps)
                    else:
                        valid_metric = get_acc([tgt for src, tgt in dev_data], dev_hyps,
                                               acc_type=args.valid_metric)
                    print('validation: iter %d, dev. ppl %f, dev. %s %f' % (
                        train_iter, dev_ppl, args.valid_metric, valid_metric),
                          file=sys.stderr)
                else:
                    valid_metric = -dev_ppl
                    print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl),
                          file=sys.stderr)

                model.train()

                is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
                is_better_than_last = len(hist_valid_scores) == 0 or valid_metric > hist_valid_scores[-1]
                hist_valid_scores.append(valid_metric)

                if valid_num > args.save_model_after:
                    model_file = args.save_to + '.iter%d.bin' % train_iter
                    print('save model to [%s]' % model_file, file=sys.stderr)
                    model.save(model_file)

                if (not is_better_than_last) and args.lr_decay:
                    lr = optimizer.param_groups[0]['lr'] * args.lr_decay
                    print('decay learning rate to %f' % lr, file=sys.stderr)
                    optimizer.param_groups[0]['lr'] = lr

                if is_better:
                    patience = 0
                    best_model_iter = train_iter

                    if valid_num > args.save_model_after:
                        print('save currently the best model ..', file=sys.stderr)
                        model_file_abs_path = os.path.abspath(model_file)
                        symlin_file_abs_path = os.path.abspath(args.save_to + '.bin')
                        os.system('ln -sf %s %s' % (model_file_abs_path, symlin_file_abs_path))
                else:
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == args.patience:
                        print('early stop!', file=sys.stderr)
                        print('the best model is from iteration [%d]' % best_model_iter,
                              file=sys.stderr)
                        exit(0)
    eval_tgt = tgt[int(len(tgt) * train_ratio):int(len(tgt) * (train_ratio + eval_ratio))]
    test_tgt = tgt[int(len(tgt) * (train_ratio + eval_ratio)):]

    write_corpus(os.path.join("data", "train_" + src_file), train_src)
    write_corpus(os.path.join("data", "eval_" + src_file), eval_src)
    write_corpus(os.path.join("data", "test_" + src_file), test_src)
    write_corpus(os.path.join("data", "train_" + tgt_file), train_tgt)
    write_corpus(os.path.join("data", "eval_" + tgt_file), eval_tgt)
    write_corpus(os.path.join("data", "test_" + tgt_file), test_tgt)
    return


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_src', type=str, default='data/news-commentary-v11.de-en.en',
                        help='file of source sentences')
    parser.add_argument('--train_tgt', type=str, default='data/news-commentary-v11.de-en.de',
                        help='file of target sentences')
    parser.add_argument('--src_size', default=20000, type=int,
                        help='number of source sentences to keep')
    parser.add_argument('--tgt_size', default=20000, type=int,
                        help='number of target sentences to keep')
    args = parser.parse_args()

    print('read in source sentences: %s' % args.train_src)
    print('read in target sentences: %s' % args.train_tgt)

    src_sents = read_corpus(args.train_src, source='src', generate=True)[:args.src_size]
    tgt_sents = read_corpus(args.train_tgt, source='tgt', generate=True)[:args.tgt_size]
    assert len(src_sents) == len(tgt_sents)

    make_data(src_sents, tgt_sents, args.train_src.split("/")[-1], args.train_tgt.split("/")[-1])
def validate_output_file(hypo_f, refer_f):
    hypo_data = read_corpus(hypo_f, "src")
    refer_data = read_corpus(refer_f, "tgt")
    return compute_bleu_for_sentences(refer_data, hypo_data)
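# NOTE: `compute_bleu_for_sentences` is not shown in this section. A minimal sketch, assuming
# it wraps NLTK's corpus-level BLEU with a single reference per hypothesis (hypothetical):
from nltk.translate.bleu_score import corpus_bleu

def compute_bleu_for_sentences(references, hypotheses):
    # corpus_bleu expects a list of reference *lists* per hypothesis
    return corpus_bleu([[ref] for ref in references], hypotheses)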
def train(args):
    train_data_src = read_corpus(args.train_src, source='src')
    train_data_tgt = read_corpus(args.train_tgt, source='tgt')

    dev_data_src = read_corpus(args.dev_src, source='src')
    dev_data_tgt = read_corpus(args.dev_tgt, source='tgt')

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    vocab, model, optimizer, nll_loss, cross_entropy_loss = init_training(args)

    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = cum_batches = report_examples = epoch = valid_num = best_model_iter = 0
    if args.load_model:
        import re
        train_iter = int(re.search(r'(?<=iter)\d+', args.load_model).group(0))
        print('start from train_iter = %d' % train_iter)
        valid_num = train_iter // args.valid_niter
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    while True:
        epoch += 1
        print('start of epoch {:d}'.format(epoch))
        for src_sents, tgt_sents in data_iter(train_data, batch_size=args.batch_size):
            train_iter += 1

            src_sents_var = to_input_variable(src_sents, vocab.src, cuda=args.cuda)
            tgt_sents_var = to_input_variable(tgt_sents, vocab.tgt, cuda=args.cuda)
            # src_sents_var = to_input_variable(src_sents, vocab.src, cuda=False)
            # tgt_sents_var = to_input_variable(tgt_sents, vocab.tgt, cuda=False)

            batch_size = len(src_sents)
            src_sents_len = [len(s) for s in src_sents]
            pred_tgt_word_num = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`

            optimizer.zero_grad()

            # (tgt_sent_len, batch_size, tgt_vocab_size)
            scores, _ = model(src_sents_var, src_sents_len, tgt_sents_var[:-1])

            # if args.cuda:
            #     tgt_sents_var = tgt_sents_var.cuda()
            word_loss = cross_entropy_loss(scores.view(-1, scores.size(2)),
                                           tgt_sents_var[1:].view(-1))
            loss = word_loss / batch_size
            word_loss_val = word_loss.data[0]
            loss_val = loss.data[0]

            loss.backward()
            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm(model.parameters(), args.clip_grad)
            optimizer.step()

            report_loss += word_loss_val
            cum_loss += word_loss_val
            report_tgt_words += pred_tgt_word_num
            cum_tgt_words += pred_tgt_word_num
            report_examples += batch_size
            cum_examples += batch_size
            cum_batches += batch_size

            if train_iter % args.log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f '
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (
                          epoch, train_iter,
                          report_loss / report_examples,
                          np.exp(report_loss / report_tgt_words),
                          cum_examples,
                          report_tgt_words / (time.time() - train_time),
                          time.time() - begin_time),
                      file=sys.stderr)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # perform validation
            if train_iter % args.valid_niter == 0:
                print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' % (
                    epoch, train_iter,
                    cum_loss / cum_batches,
                    np.exp(cum_loss / cum_tgt_words),
                    cum_examples),
                      file=sys.stderr)

                cum_loss = cum_batches = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)
                model.eval()

                # compute dev. ppl and bleu
                dev_loss = evaluate_loss(model, dev_data, cross_entropy_loss)
                dev_ppl = np.exp(dev_loss)

                if args.valid_metric in ['bleu', 'word_acc', 'sent_acc']:
                    dev_hyps = decode(model, dev_data)
                    dev_hyps = [hyps[0] for hyps in dev_hyps]
                    print(dev_hyps[:3])
                    if args.valid_metric == 'bleu':
                        valid_metric = get_bleu([tgt for src, tgt in dev_data], dev_hyps, 'valid')
                    else:
                        valid_metric = get_acc([tgt for src, tgt in dev_data], dev_hyps,
                                               acc_type=args.valid_metric)
                    print('validation: iter %d, dev. ppl %f, dev. %s %f' % (
                        train_iter, dev_ppl, args.valid_metric, valid_metric),
                          file=sys.stderr)
                else:
                    valid_metric = -dev_ppl
                    print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl),
                          file=sys.stderr)

                model.train()

                is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
                is_better_than_last = len(hist_valid_scores) == 0 or valid_metric > hist_valid_scores[-1]
                hist_valid_scores.append(valid_metric)

                if (not is_better_than_last) and args.lr_decay and epoch > 10:
                    lr = optimizer.param_groups[0]['lr'] * args.lr_decay
                    print('decay learning rate to %f' % lr, file=sys.stderr)
                    optimizer.param_groups[0]['lr'] = lr

                if is_better:
                    patience = 0
                    best_model_iter = train_iter

                    if valid_num > args.save_model_after:
                        model_file = args.save_to + '.iter%d.bin' % train_iter
                        print('save model to [%s]' % model_file, file=sys.stderr)
                        model.save(model_file)
                        # print('save currently the best model ..', file=sys.stderr)
                        # model_file_abs_path = os.path.abspath(model_file)
                        # symlin_file_abs_path = os.path.abspath(args.save_to + '.bin')
                        # os.system('ln -sf %s %s' % (model_file_abs_path, symlin_file_abs_path))
                else:
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == args.patience:
                        print('early stop!', file=sys.stderr)
                        print('the best model is from iteration [%d]' % best_model_iter,
                              file=sys.stderr)
                        exit(0)

                if abs(optimizer.param_groups[0]['lr'] - 0.0) <= 1e-5:
                    print('stop! because lr is too small', file=sys.stderr)
                    print('the best model is from iteration [%d]' % best_model_iter,
                          file=sys.stderr)
                    exit(0)
def sample_ngram(args):
    src_sents = read_corpus(args.src, 'src')
    tgt_sents = read_corpus(args.tgt, 'src')  # do not read in <s> and </s>
    f_out = open(args.output, 'w')

    vocab = torch.load(args.vocab)
    tgt_vocab = vocab.tgt

    smooth_bleu = args.smooth_bleu
    sm_func = None
    if smooth_bleu:
        sm_func = SmoothingFunction().method3

    for src_sent, tgt_sent in zip(src_sents, tgt_sents):
        src_sent = ' '.join(src_sent)

        tgt_len = len(tgt_sent)
        tgt_samples = []
        tgt_samples_distort_rates = []  # how many unigrams are replaced

        # generate `args.sample_size` samples; the first is the reference itself
        tgt_samples.append(tgt_sent)
        tgt_samples_distort_rates.append(0)

        for sid in range(args.sample_size - 1):
            n = np.random.randint(1, min(tgt_len, args.max_ngram_size + 1))
            # we do not replace the last token: it must be a period!
            idx = np.random.randint(tgt_len - n)
            ngram = tgt_sent[idx: idx + n]
            new_ngram = get_new_ngram(ngram, n, tgt_vocab)

            sampled_tgt_sent = list(tgt_sent)
            sampled_tgt_sent[idx: idx + n] = new_ngram

            # compute the probability of this sample
            # prob = 1. / args.max_ngram_size * 1. / (tgt_len - 1 + n) * 1 / (len(tgt_vocab) ** n)

            tgt_samples.append(sampled_tgt_sent)
            tgt_samples_distort_rates.append(n)

        # compute bleu scores or edit distances and rank the samples by bleu scores
        rewards = []
        for tgt_sample, tgt_sample_distort_rate in zip(tgt_samples, tgt_samples_distort_rates):
            if args.reward == 'bleu':
                reward = sentence_bleu([tgt_sent], tgt_sample, smoothing_function=sm_func)
            elif args.reward == 'rouge':
                rouge = Rouge()
                scores = rouge.get_scores(hyps=[' '.join(tgt_sample)],
                                          refs=[' '.join(tgt_sent)], avg=True)
                reward = sum([value['f'] for key, value in scores.items()])
            else:
                reward = -tgt_sample_distort_rate

            rewards.append(reward)

        tgt_ranks = sorted(range(len(tgt_samples)), key=lambda i: rewards[i], reverse=True)

        # convert list of tokens into a string
        tgt_samples = [' '.join(tgt_sample) for tgt_sample in tgt_samples]

        print('*' * 50, file=f_out)
        print('source: ' + src_sent, file=f_out)
        print('%d samples' % len(tgt_samples), file=f_out)
        for i in tgt_ranks:
            print('%s ||| %f' % (tgt_samples[i], rewards[i]), file=f_out)
        print('*' * 50, file=f_out)

    f_out.close()
parser.add_argument('--train_tgt', type=str, required=True,
                    help='file of target sentences')
parser.add_argument('--output', default='vocab.bin', type=str,
                    help='output vocabulary file')

args = parser.parse_args()

print('read in source sentences: %s' % args.train_src)
print('read in target sentences: %s' % args.train_tgt)

src_sents = read_corpus(args.train_src, source='src')[:args.src_size]
tgt_sents = read_corpus(args.train_tgt, source='tgt')[:args.tgt_size]
if len(src_sents) != len(tgt_sents):
    # truncate both sides to the length of the shorter corpus
    min_len = min(len(src_sents), len(tgt_sents))
    src_sents = src_sents[:min_len]
    tgt_sents = tgt_sents[:min_len]

vocab = Vocab(src_sents, tgt_sents, args.src_vocab_size, args.tgt_vocab_size,
              remove_singleton=not args.include_singleton)
print('generated vocabulary, source %d words, target %d words' % (len(vocab.src), len(vocab.tgt)))

torch.save(vocab, args.output)
print('vocabulary saved to %s' % args.output)
parser.add_argument('--train_feature', type=str,
                    help='path to the training feature of the training sentences')
parser.add_argument('--output', default='vocab.bin', type=str,
                    help='output vocabulary file')
parser.add_argument('--share_vocab', action='store_true', default=False)
parser.add_argument('--lowercase', action='store_true', default=False)

args = parser.parse_args()

if args.train_bitext:
    print('read in parallel sentences: %s' % args.train_bitext)
    src_sents, trg_sents = read_bitext(args.train_bitext)
else:
    src_sents = read_corpus(args.train_src, source='src', lowercase=args.lowercase)
    trg_sents = read_corpus(args.train_trg, source='src', lowercase=args.lowercase)

vocab = Vocab(src_sents, trg_sents, args.src_vocab_size, args.trg_vocab_size,
              remove_singleton=not args.include_singleton,
              share_vocab=args.share_vocab, pos_file=args.train_feature)
print('generated vocabulary, source %d words, target %d words' % (len(vocab.src), len(vocab.trg)))