def main(): dummy_parser = argparse.ArgumentParser(description='train.py') opts.model_opts(dummy_parser) opts.train_opts(dummy_parser) dummy_opt = dummy_parser.parse_known_args([])[0] # engine = DBEngine(opt.db_file) with codecs.open(opt.source_file, "r", "utf-8") as corpus_file: sql_list = [json.loads(line)['sql'] for line in corpus_file] js_list = table.IO.read_anno_json(opt.anno) prev_best = (None, None) for fn_model in glob.glob(opt.model_path): print(fn_model) print(opt.anno) opt.model = fn_model translator = table.Translator(opt, dummy_opt.__dict__) data = table.IO.TableDataset(js_list, translator.fields, None, False) test_data = table.IO.OrderedIterator(dataset=data, device=opt.gpu, batch_size=opt.batch_size, train=False, sort=True, sort_within_batch=False) # inference r_list = [] for batch in test_data: r_list += translator.translate(batch) r_list.sort(key=lambda x: x.idx)
def _get_parser():
    parser = ArgumentParser(description='train.py')
    # Construct config
    opts.config_opts(parser)
    opts.model_opts(parser)
    opts.train_opts(parser)
    return parser
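# Hedged usage sketch for _get_parser() above (not from the source). It assumes
# the OpenNMT-py-style `opts` module seen throughout this file, and that
# '-data' is the only required flag; 'demo' / 'demo-model' are hypothetical.
if __name__ == '__main__':
    parser = _get_parser()
    opt = parser.parse_args(['-data', 'demo', '-save_model', 'demo-model'])
    print(opt.data, opt.save_model)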
def __init__(self, model, lang, gpu=False, wx=False): self.lang = lang self.is_ip_wx = wx parser = argparse.ArgumentParser( description='transliterate.py', formatter_class=argparse.ArgumentDefaultsHelpFormatter) opts.add_md_help_argument(parser) opts.translate_opts(parser) self.opt = parser.parse_args() self.trans_dict = dict() self.broken_words = dict() file_path = os.path.dirname(os.path.abspath(__file__)) if self.lang == 'hin': self.to_utf = WXC(order='wx2utf', lang='hin') self.non_alpha = re.compile(u'([^a-zA-Z]+)') self.alpha_letters = set(string.ascii_letters) self.com_abbr = { 'b': ['BI', 'be'], 'd': ['xI', 'xe'], 'g': ['jI'], 'k': ['ke', 'ki', 'kI'], 'h': ['hE', 'hEM'], 'ha': ['hE', 'hEM'], 'n': ['ina', 'ne'], 'm': ['meM', 'mEM'], 'p': ['pe'], 'q': ['kyoM'], 'r': ['Ora', 'ora'], 's': ['isa', 'se'], 'y': ['ye'] } if self.lang == 'eng': self.non_alpha = re.compile(u'([^a-z]+)') self.alpha_letters = set(string.ascii_letters[:26]) with open('%s/extras/COMMON_ABBR.eng' % file_path) as fp: self.com_abbr = {} for line in fp: k, v = line.split() self.com_abbr[k] = v.split('|') dummy_parser = argparse.ArgumentParser(description='train.py') opts.model_opts(dummy_parser) dummy_opt = dummy_parser.parse_known_args([])[0] if gpu: self.opt.gpu = 0 self.opt.cuda = self.opt.gpu > -1 self.opt.model = model self.opt.n_best = 5 self.opt.lang = lang if self.opt.cuda: torch.cuda.set_device(self.opt.gpu) # Load the model. self.fields, self.model, self.model_opt = onmt.ModelConstructor.load_test_model( self.opt, dummy_opt.__dict__)
def main(): dummy_parser = argparse.ArgumentParser(description='train.py') opts.model_opts(dummy_parser) dummy_opt = dummy_parser.parse_known_args([])[0] opt.cuda = opt.gpu > -1 if opt.cuda: torch.cuda.set_device(opt.gpu) # Load the model. fields, model, model_opt = \ onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__) # File to write scores to. out_file = codecs.open(opt.output, 'w', 'utf-8') # Test data data = onmt.io.build_dataset( fields, opt.data_type, opt.src, opt.tgt, src_dir=opt.src_dir, sample_rate=opt.sample_rate, window_size=opt.window_size, window_stride=opt.window_stride, window=opt.window, use_filter_pred=False, symbol_representation=opt.symbol_representation, revert_targets=opt.revert_targets) # Sort batch by decreasing lengths of sentence required by pytorch. # sort=False means "Use dataset's sortkey instead of iterator's". data_iter = onmt.io.OrderedIterator(dataset=data, device=opt.gpu, batch_size=opt.batch_size, train=False, sort=False, sort_within_batch=True, shuffle=False) # Evaluator scorer = onmt.translate.GNMTGlobalScorer(opt.alpha, opt.beta) evaluator = onmt.translate.Evaluator(model, fields, scorer, copy_attn=model_opt.copy_attn, cuda=opt.cuda) # Statistics #counter = count(1) #score_total, words_total = 0, 0 for batch in data_iter: scores = evaluator.evaluate_batch(batch, data) for score in scores: out_file.write(str(score)) out_file.write('\n') out_file.flush()
def main(): dummy_parser = argparse.ArgumentParser(description='train.py') opts.model_opts(dummy_parser) opts.train_opts(dummy_parser) dummy_opt = dummy_parser.parse_known_args([])[0] engine = DBEngine(opt.db_file) with codecs.open(opt.source_file, "r", "utf-8") as corpus_file: sql_list = [json.loads(line)['sql'] for line in corpus_file] js_list = table.IO.read_anno_json(opt.anno) prev_best = (None, None) for fn_model in glob.glob(opt.model_path): opt.model = fn_model translator = Translator(opt, dummy_opt.__dict__) data = table.IO.TableDataset(js_list, translator.fields, None, False) test_data = table.IO.OrderedIterator(dataset=data, device=opt.gpu, batch_size=opt.batch_size, train=False, sort=True, sort_within_batch=False) # inference if opt.beam_search: print('Using execution guidance for inference.') r_list = [] for batch in test_data: r_list += translator.translate(batch, js_list, sql_list) r_list.sort(key=lambda x: x.idx) assert len(r_list) == len( js_list), 'len(r_list) != len(js_list): {} != {}'.format( len(r_list), len(js_list)) # evaluation for pred, gold, sql_gold in zip(r_list, js_list, sql_list): pred.eval(gold, sql_gold, engine) print('Results:') for metric_name in ('all', 'exe'): c_correct = sum((x.correct[metric_name] for x in r_list)) print('{}: {} / {} = {:.2%}'.format(metric_name, c_correct, len(r_list), c_correct / len(r_list))) if metric_name == 'all' and (prev_best[0] is None or c_correct > prev_best[1]): prev_best = (fn_model, c_correct) if (opt.split == 'dev') and (prev_best[0] is not None): with codecs.open(os.path.join(opt.data_path, 'dev_best.txt'), 'w', encoding='utf-8') as f_out: f_out.write('{}\n'.format(prev_best[0]))
def main():
    parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(parser)
    opts.train_opts(parser)
    opts.data_opts(parser)
    opts.score_opts(parser)
    add_md_help_argument(parser)
    options = parser.parse_args()
def load_opt():
    parser = argparse.ArgumentParser(
        description='main.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    opts.model_opts(parser)
    opts.train_opts(parser)
    opt = parser.parse_args()
    return opt
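# Hedged usage sketch for load_opt() (an assumption, not from the source):
# it reads sys.argv, so flags come from the command line, e.g.
# python main.py -data demo -save_model demo-model
if __name__ == '__main__':
    opt = load_opt()
    print(opt)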
def main(anno_file_name, col_headers, raw_args=None, verbose=True): parser = argparse.ArgumentParser(description='evaluate.py') opts.translate_opts(parser) opt = parser.parse_args(raw_args) torch.cuda.set_device(opt.gpu) opt.db_file = os.path.join(opt.data_path, '{}.db'.format(opt.split)) opt.pre_word_vecs = os.path.join(opt.data_path, 'embedding') dummy_parser = argparse.ArgumentParser(description='train.py') opts.model_opts(dummy_parser) opts.train_opts(dummy_parser) dummy_opt = dummy_parser.parse_known_args([])[0] opt.anno = anno_file_name engine = DBEngine(opt.db_file) js_list = table.IO.read_anno_json(opt.anno) prev_best = (None, None) sql_query = [] for fn_model in glob.glob(opt.model_path): opt.model = fn_model translator = Translator(opt, dummy_opt.__dict__) data = table.IO.TableDataset(js_list, translator.fields, None, False) test_data = table.IO.OrderedIterator(dataset=data, device=opt.gpu, batch_size=opt.batch_size, train=False, sort=True, sort_within_batch=False) # inference r_list = [] for batch in test_data: r_list += translator.translate(batch) r_list.sort(key=lambda x: x.idx) pred = r_list[-1] sql_pred = { 'agg': pred.agg, 'sel': pred.sel, 'conds': pred.recover_cond_to_gloss(js_list[-1]) } if verbose: print('\n sql_pred: ', sql_pred, '\n') print('\n col_headers: ', col_headers, '\n') sql_query = Query(sql_pred['sel'], sql_pred['agg'], sql_pred['conds']) try: ans_pred = engine.execute_query(js_list[-1]['table_id'], Query.from_dict(sql_pred), lower=True, verbose=verbose) except Exception as e: ans_pred = None return sql_query.get_complete_query(col_headers), ans_pred
def main(): dummy_parser = argparse.ArgumentParser(description='train.py') opts.model_opts(dummy_parser) opts.train_opts(dummy_parser) dummy_opt = dummy_parser.parse_known_args([])[0] js_list = table.IO.read_anno_json(opt.anno, opt) metric_name_list = ['tgt'] prev_best = (None, None) for fn_model in glob.glob(opt.model_path): opt.model = fn_model print(fn_model) print(opt.anno) translator = table.Translator(opt, dummy_opt.__dict__) data = table.IO.TableDataset(js_list, translator.fields, 0, None, False) test_data = table.IO.OrderedIterator(dataset=data, device=opt.gpu, batch_size=opt.batch_size, train=False, sort=True, sort_within_batch=False) # inference r_list = [] for batch in test_data: r = translator.translate(batch) r_list += r r_list.sort(key=lambda x: x.idx) assert len(r_list) == len( js_list), 'len(r_list) != len(js_list): {} != {}'.format( len(r_list), len(js_list)) # evaluation for pred, gold in zip(r_list, js_list): pred.eval(gold) print('Results:') for metric_name in metric_name_list: c_correct = sum((x.correct[metric_name] for x in r_list)) acc = c_correct / len(r_list) print('{}: {} / {} = {:.2%}'.format(metric_name, c_correct, len(r_list), acc)) if metric_name == 'tgt' and (prev_best[0] is None or acc > prev_best[1]): prev_best = (fn_model, acc) if (opt.split == 'dev') and (prev_best[0] is not None): with codecs.open(os.path.join(opt.root_dir, opt.dataset, 'dev_best.txt'), 'w', encoding='utf-8') as f_out: f_out.write('{}\n'.format(prev_best[0]))
def main(): dummy_parser = argparse.ArgumentParser(description='train.py') opts.model_opts(dummy_parser) opts.train_opts(dummy_parser) dummy_opt = dummy_parser.parse_known_args([])[0] js_list = table.IO.read_anno_json(opt.anno, opt) # metric_name_list = ['tgt'] prev_best = (None, None) # print(opt.model_path) for fn_model in glob.glob(opt.model_path): opt.model = fn_model print(fn_model) with torch.no_grad(): translator = table.Translator(opt, dummy_opt.__dict__) data = table.IO.TableDataset(js_list, translator.fields, 0, None, False) test_data = table.IO.OrderedIterator(dataset=data, device=opt.gpu, batch_size=opt.batch_size, train=False, sort=True, sort_within_batch=False) # inference r_list = [] for batch in test_data: r = translator.translate(batch) r_list += r r_list.sort(key=lambda x: x.idx) assert len(r_list) == len( js_list), 'len(r_list) != len(js_list): {} != {}'.format( len(r_list), len(js_list)) metric, _ = com_metric(js_list, r_list) if opt.split == 'test': ref_dic, pre_dict = effect_len(js_list, r_list) for i in range(len(ref_dic)): js_list = ref_dic[i] r_list = pre_dict[i] print("the effect of length {}".format(i)) metric, _ = com_metric(js_list, r_list) if prev_best[0] is None or float(metric['Bleu_1']) > prev_best[1]: prev_best = (fn_model, metric['Bleu_1']) if (opt.split == 'dev') and (prev_best[0] is not None): with codecs.open(os.path.join(opt.root_dir, opt.dataset, 'dev_best.txt'), 'w', encoding='utf-8') as f_out: f_out.write('{}\n'.format(prev_best[0]))
def parse_args():
    parser = argparse.ArgumentParser(
        description='umt.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    opts.add_md_help_argument(parser)
    opts.model_opts(parser)
    opts.preprocess_opts(parser)
    opts.train_opts(parser)
    opt = parser.parse_args()

    if opt.word_vec_size != -1:
        opt.src_word_vec_size = opt.word_vec_size
        opt.tgt_word_vec_size = opt.word_vec_size

    if opt.layers != -1:
        opt.enc_layers = opt.layers
        opt.dec_layers = opt.layers

    opt.brnn = (opt.encoder_type == "brnn")

    # Seed the CPU RNGs (GPU seeding happens below).
    random.seed(opt.seed)
    torch.manual_seed(opt.seed)

    if torch.cuda.is_available() and not opt.gpuid:
        print("WARNING: You have a CUDA device, should run with -gpuid 0")

    if opt.gpuid:
        cuda.set_device(opt.gpuid[0])
        if opt.seed > 0:
            torch.cuda.manual_seed(opt.seed)

    if len(opt.gpuid) > 1:
        sys.stderr.write("Sorry, multigpu isn't supported yet, coming soon!\n")
        sys.exit(1)

    # Set up the Crayon logging server.
    if opt.exp_host != "":
        from pycrayon import CrayonClient
        cc = CrayonClient(hostname=opt.exp_host)
        experiments = cc.get_experiment_names()
        print(experiments)
        if opt.exp in experiments:
            cc.remove_experiment(opt.exp)

    return opt
def main():
    parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(parser)
    opts.train_opts(parser)
    opts.data_opts(parser)
    opts.score_opts(parser)
    options = parser.parse_args()
    print(options)

    argfile = options.save_model + '_arg.p'
    print('Saving arguments in ' + argfile)
    with open(argfile, "wb") as f:
        pickle.dump(options, f)

    train(options)
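# A minimal round-trip sketch (not from the source) for the `<save_model>_arg.p`
# file that main() pickles above; only the filename convention is taken from
# the snippet.
import pickle

def load_saved_args(save_model):
    # Reload the argparse.Namespace that main() dumped next to the model.
    with open(save_model + '_arg.p', 'rb') as f:
        return pickle.load(f)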
def __init__( self, modelfile='/data1/struct2text/s2s_models_v3/model_softmax_acc_97.30_ppl_1.41_e7.pt', dynamic_dict=True, attn_debug=True, share_vocab=True, replace_unk=True, verbose=True): #def __init__(self, modelfile='/data1/struct2text/s2s_models_v3/model_softmax_acc_78.18_ppl_9.60_e4.pt'): #def __init__(self, modelfile='/data1/struct2text/s2s_models_v3/model_softmax_acc_82.37_ppl_6.28_e8.pt'): #def __init__(self, modelfile='/data1/data1/Anirban/structure2text/model_softmax_1_acc_84.10_ppl_2.13_e1.pt'): print('Loading ' + modelfile) parser = argparse.ArgumentParser( description='seq2seq_predict', formatter_class=argparse.ArgumentDefaultsHelpFormatter) opts.add_md_help_argument(parser) opts.translate_opts(parser) #opt = parser.parse_args() opt, unknown = parser.parse_known_args() print('Unknown arguments ', unknown) opt.dynamic_dict = dynamic_dict opt.attn_debug = attn_debug opt.share_vocab = share_vocab opt.replace_unk = replace_unk opt.verbose = verbose dummy_parser = argparse.ArgumentParser(description='train.py') opts.model_opts(dummy_parser) dummy_opt = dummy_parser.parse_known_args([])[0] opt.cuda = opt.gpu > -1 if opt.cuda: torch.cuda.set_device(opt.gpu) opt.src = 'temp_seq2seq_pred_%f.txt' % time.time() opt.model = modelfile print('Loading seq2seq model...') # Load the model. fields, model, model_opt = \ onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__) self.opt = opt self.fields = fields self.model = model self.model_opt = model_opt
def main():
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt = parser.parse_args()
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # Add in default model arguments, possibly added since training.
    checkpoint = torch.load(opt.model,
                            map_location=lambda storage, loc: storage)
    model_opt = checkpoint['opt']
    src_dict = checkpoint['vocab'][1][1]
    tgt_dict = checkpoint['vocab'][0][1]

    fields = onmt.IO.load_fields(checkpoint['vocab'])
    for arg in dummy_opt.__dict__:
        if arg not in model_opt:
            model_opt.__dict__[arg] = dummy_opt.__dict__[arg]

    model = onmt.ModelConstructor.make_base_model(model_opt, fields,
                                                  use_gpu(opt), checkpoint)
    encoder = model.encoder
    decoder = model.decoder

    encoder_embeddings = encoder.embeddings.word_lut.weight.data.tolist()
    decoder_embeddings = decoder.embeddings.word_lut.weight.data.tolist()

    print("Writing source embeddings")
    write_embeddings(opt.output_dir + "/src_embeddings.txt", src_dict,
                     encoder_embeddings)

    print("Writing target embeddings")
    write_embeddings(opt.output_dir + "/tgt_embeddings.txt", tgt_dict,
                     decoder_embeddings)

    print('... done.')
    print('Converting model...')
def main():
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt = parser.parse_args()
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # Add in default model arguments, possibly added since training.
    checkpoint = torch.load(opt.model,
                            map_location=lambda storage, loc: storage)
    model_opt = checkpoint['opt']
    src_dict = checkpoint['vocab'][1][1]
    tgt_dict = checkpoint['vocab'][0][1]

    fields = onmt.io.load_fields_from_vocab(checkpoint['vocab'])
    for arg in dummy_opt.__dict__:
        if arg not in model_opt:
            model_opt.__dict__[arg] = dummy_opt.__dict__[arg]

    model = onmt.ModelConstructor.make_base_model(
        model_opt, fields, use_gpu(opt), checkpoint)
    encoder = model.encoder
    decoder = model.decoder

    encoder_embeddings = encoder.embeddings.word_lut.weight.data.tolist()
    decoder_embeddings = decoder.embeddings.word_lut.weight.data.tolist()

    print("Writing source embeddings")
    write_embeddings(opt.output_dir + "/src_embeddings.txt", src_dict,
                     encoder_embeddings)

    print("Writing target embeddings")
    write_embeddings(opt.output_dir + "/tgt_embeddings.txt", tgt_dict,
                     decoder_embeddings)

    print('... done.')
    print('Converting model...')
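# The "dummy parser" idiom used throughout these snippets, distilled into one
# helper: parse the model options with an empty argv to obtain their defaults,
# then backfill any option that an older checkpoint predates. A sketch under
# the assumption that `opts.model_opts` is the OpenNMT-py function seen above.
import argparse
import opts

def backfill_model_opt(model_opt):
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]
    # argparse.Namespace supports `in` for attribute-presence checks.
    for arg, default in dummy_opt.__dict__.items():
        if arg not in model_opt:
            model_opt.__dict__[arg] = default
    return model_opt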
def main():
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # Load the precomputed n-gram/context alignment statistics.
    pkl_file = open(opt.lists, 'rb')
    lists = pickle.load(pkl_file)
    pkl_file.close()

    # Load the model.
    fields, model, model_opt = \
        onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__)

    # File to write sentences to.
    out_file = codecs.open(opt.output, 'w', 'utf-8')

    # Test data
    data = onmt.io.build_dataset(fields, opt.data_type,
                                 opt.src, opt.tgt,
                                 src_dir=opt.src_dir,
                                 sample_rate=opt.sample_rate,
                                 window_size=opt.window_size,
                                 window_stride=opt.window_stride,
                                 window=opt.window,
                                 use_filter_pred=False)

    # Sort batch by decreasing lengths of sentence required by pytorch.
    # sort=False means "Use dataset's sortkey instead of iterator's".
    data_iter = onmt.io.TestOrderedIterator(
        dataset=data, device=opt.gpu,
        batch_size=opt.batch_size, train=False, sort=False,
        sort_within_batch=True, shuffle=False)

    # Translator
    scorer = onmt.translate.GNMTGlobalScorer(opt.alpha,
                                             opt.beta,
                                             opt.coverage_penalty,
                                             opt.length_penalty)

    translator = onmt.translate.Translator(
        model, fields,
        beam_size=opt.beam_size,
        n_best=opt.n_best,
        global_scorer=scorer,
        max_length=opt.max_length,
        copy_attn=model_opt.copy_attn,
        cuda=opt.cuda,
        beam_trace=opt.dump_beam != "",
        min_length=opt.min_length,
        stepwise_penalty=opt.stepwise_penalty,
        block_ngram_repeat=opt.block_ngram_repeat,
        ignore_when_blocking=opt.ignore_when_blocking)

    builder = onmt.translate.TranslationBuilder(
        data, translator.fields,
        opt.n_best, opt.replace_unk, opt.tgt)

    # Statistics
    counter = count(1)
    pred_score_total, pred_words_total = 0, 0
    gold_score_total, gold_words_total = 0, 0

    for batch, word_batch in data_iter:
        batch_data = translator.translate_batch(batch, word_batch, data, lists)
        translations = builder.from_batch(batch_data)

        for trans in translations:
            pred_score_total += trans.pred_scores[0]
            pred_words_total += len(trans.pred_sents[0])
            if opt.tgt:
                gold_score_total += trans.gold_score
                gold_words_total += len(trans.gold_sent) + 1

            n_best_preds = [" ".join(pred)
                            for pred in trans.pred_sents[:opt.n_best]]
            out_file.write('\n'.join(n_best_preds))
            out_file.write('\n')
            out_file.flush()

            if opt.verbose:
                sent_number = next(counter)
                output = trans.log(sent_number)
                os.write(1, output.encode('utf-8'))

    _report_score('PRED', pred_score_total, pred_words_total)
    if opt.tgt:
        _report_score('GOLD', gold_score_total, gold_words_total)
        if opt.report_bleu:
            _report_bleu()
        if opt.report_rouge:
            _report_rouge()

    if opt.dump_beam:
        import json
        json.dump(translator.beam_accum,
                  codecs.open(opt.dump_beam, 'w', 'utf-8'))
return predictions @app.route('/translate', methods=['POST']) def config(): req = request.get_json() res = [] for s in req: res.append(translate(s)) return jsonify(sum(res, [])) if __name__ == '__main__': opt = parser.parse_args() dummy_parser = argparse.ArgumentParser(description='train.py') opts.model_opts(dummy_parser) dummy_opt = dummy_parser.parse_known_args([])[0] opt.cuda = opt.gpu > -1 if opt.cuda: torch.cuda.set_device(opt.gpu) for model in opt.model: print("Loading model... " + model) modelopt = copy.copy(opt) modelopt.model = model hash = hash_byname(model) translators[hash] = onmt.Translator(modelopt, dummy_opt.__dict__) app.run(debug=False, host='0.0.0.0', port=8092)
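# Hedged client-side sketch for the /translate endpoint defined above; the
# host and port mirror the app.run() call, and the payload shape (a JSON list
# of source segments) is inferred from the loop over `req`. Not from the source.
import requests

resp = requests.post('http://localhost:8092/translate',
                     json=['a source sentence to translate'])
print(resp.json())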
def main(): dummy_parser = argparse.ArgumentParser(description='train.py') opts.model_opts(dummy_parser) dummy_opt = dummy_parser.parse_known_args([])[0] opt.cuda = opt.gpu > -1 if opt.cuda: torch.cuda.set_device(opt.gpu) # Load the model. fields, model, model_opt = \ onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__, stage1=True) model2 = None if opt.model2 is not None: fields2, model2, model_opt2 = \ onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__, stage1=False) # File to write sentences to. out_file = codecs.open(opt.output, 'w', 'utf-8') # Test data data = onmt.io.build_dataset(fields, opt.data_type, opt.src1, opt.tgt1, opt.src2, opt.tgt2, src_dir=opt.src_dir, sample_rate=opt.sample_rate, window_size=opt.window_size, window_stride=opt.window_stride, window=opt.window, use_filter_pred=False) def sort_minibatch_key(ex): """ Sort using length of source sentences and length of target sentence """ #Needed for packed sequence if hasattr(ex, "tgt1"): return len(ex.src1), len(ex.tgt1) return len(ex.src1) # Sort batch by decreasing lengths of sentence required by pytorch. # sort=False means "Use dataset's sortkey instead of iterator's". data_iter = onmt.io.OrderedIterator(dataset=data, device=opt.gpu, batch_size=opt.batch_size, train=False, sort=False, sort_key=sort_minibatch_key, sort_within_batch=True, shuffle=False) # Translator scorer = onmt.translate.GNMTGlobalScorer(opt.alpha, opt.beta, opt.coverage_penalty, opt.length_penalty) tgt_plan_map = None if opt.src2 is None: tgt_plan_map = {} for j, entry in enumerate(fields["tgt1"].vocab.itos): if j < 4: tgt_plan_map[j] = j else: tgt_plan_map[j] = int(entry) translator = onmt.translate.Translator( model, model2, fields, beam_size=opt.beam_size, n_best=opt.n_best, global_scorer=scorer, max_length=opt.max_length, copy_attn=model_opt.copy_attn and tgt_plan_map is None, cuda=opt.cuda, beam_trace=opt.dump_beam != "", min_length=opt.min_length, stepwise_penalty=opt.stepwise_penalty) builder = onmt.translate.TranslationBuilder(data, translator.fields, opt.n_best, opt.replace_unk, has_tgt=False) # Statistics counter = count(1) pred_score_total, pred_words_total = 0, 0 gold_score_total, gold_words_total = 0, 0 stage1 = opt.stage1 for batch in tqdm(data_iter): # NOTE batch_data = translator.translate_batch(batch, data, stage1) translations = builder.from_batch(batch_data, stage1) for trans in translations: pred_score_total += trans.pred_scores[0] pred_words_total += len(trans.pred_sents[0]) if opt.tgt2: gold_score_total += trans.gold_score gold_words_total += len(trans.gold_sent) if stage1: n_best_preds = [ " ".join([str(entry) for entry in pred]) for pred in trans.pred_sents[:opt.n_best] ] else: n_best_preds = [ " ".join(pred) for pred in trans.pred_sents[:opt.n_best] ] out_file.write('\n'.join(n_best_preds)) out_file.write('\n') out_file.flush() if opt.verbose: sent_number = next(counter) output = trans.log(sent_number) os.write(1, output.encode('utf-8')) _report_score('PRED', pred_score_total, pred_words_total) if opt.tgt2: _report_score('GOLD', gold_score_total, gold_words_total) if opt.report_bleu: _report_bleu() if opt.report_rouge: _report_rouge() if opt.dump_beam: import json json.dump(translator.beam_accum, codecs.open(opt.dump_beam, 'w', 'utf-8'))
def main():
    previous_words = None
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]
    print('dummy_opt: ', dummy_opt)

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    translator = onmt.Translator(opt, dummy_opt.__dict__)
    out_file = codecs.open(opt.output, 'w', 'utf-8')
    pred_score_total, pred_words_total = 0, 0
    gold_score_total, gold_words_total = 0, 0

    if opt.dump_beam != "":
        import json
        translator.initBeamAccum()

    data = onmt.IO.ONMTDataset(opt.src, opt.tgt, translator.fields,
                               use_filter_pred=False)

    test_data = onmt.IO.OrderedIterator(
        dataset=data, device=opt.gpu,
        batch_size=opt.batch_size, train=False,
        sort=False, shuffle=False)

    counter = count(1)
    for batch in test_data:
        pred_batch, gold_batch, pred_scores, gold_scores, attn, src \
            = translator.translate(batch, data)
        pred_score_total += sum(score[0] for score in pred_scores)
        pred_words_total += sum(len(x[0]) for x in pred_batch)
        if opt.tgt:
            gold_score_total += sum(gold_scores)
            gold_words_total += sum(len(x) for x in batch.tgt[1:])

        # davidstap
        # _, src_lengths = batch.src
        # encStates, context = translator.model.encoder(src, src_lengths)

        # z_batch: an iterator over the predictions, their scores,
        # the gold sentence, its score, and the source sentence for each
        # sentence in the batch. It has to be zip_longest instead of
        # plain-old zip because the gold_batch has length 0 if the target
        # is not included.
        z_batch = zip_longest(
            pred_batch, gold_batch,
            pred_scores, gold_scores,
            (sent.squeeze(1) for sent in src.split(1, dim=1)))

        for pred_sents, gold_sent, pred_score, gold_score, src_sent in z_batch:
            # src_sent is a torch.LongTensor
            n_best_preds = [" ".join(pred) for pred in pred_sents[:opt.n_best]]
            out_file.write('\n'.join(n_best_preds))
            out_file.write('\n')
            out_file.flush()

            if opt.verbose:
                sent_number = next(counter)
                words = get_src_words(
                    src_sent, translator.fields["src"].vocab.itos)
                if previous_words is not None:
                    print('BLEU: ', sentence_bleu([words], previous_words))
                    print()
                print('S1:', words)
                print('S2:', previous_words)
                previous_words = words

                best_pred = n_best_preds[0]
                # TODO: calculate BLEU between reference (best_pred) and
                # hypothesis (words)
                # TODO: calculate cosine_similarity(best_pred, words)
                best_score = pred_score[0]

                # 'words' is the input sentence, 'best_pred' the prediction.
                # Put the source sentence through translator.model.encoder to
                # find its context (see ModelConstructor / NMTModel):
                #   src (FloatTensor): source tensors with optional feature
                #       tensors of size (len x batch)
                #   tgt (FloatTensor): target tensors with optional feature
                #       tensors of size (len x batch)
                #   lengths ([int]): an array of the src lengths
                #   dec_state: a decoder state object
                src_lengths = len(words.split())
                # hidden, context = translator.model.encoder(src_sent,
                #                                            src_lengths)
                # euc_dist(context_r, context_pred)

                if opt.tgt:
                    tgt_sent = ' '.join(gold_sent)
                    os.write(1, bytes('GOLD %d: %s\n' %
                                      (sent_number, tgt_sent), 'UTF-8'))
                    print("GOLD SCORE: %.4f" % gold_score)

                if len(n_best_preds) > 1:
                    print('\nBEST HYP:')
                    for score, sent in zip(pred_score, n_best_preds):
                        os.write(1, bytes("[%.4f] %s\n" % (score, sent),
                                          'UTF-8'))

    report_score('PRED', pred_score_total, pred_words_total)
    if opt.tgt:
        report_score('GOLD', gold_score_total, gold_words_total)

    if opt.dump_beam:
        json.dump(translator.beam_accum,
                  codecs.open(opt.dump_beam, 'w', 'utf-8'))
def main(): opt = parser.parse_args() dummy_parser = argparse.ArgumentParser(description='train.py') opts.model_opts(dummy_parser) dummy_opt = dummy_parser.parse_known_args([])[0] opt.cuda = opt.gpu > -1 if opt.cuda: torch.cuda.set_device(opt.gpu) translator = onmt.Translator(opt, dummy_opt.__dict__) outF = codecs.open(opt.output, 'w', 'utf-8') predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0 srcBatch, tgtBatch = [], [] count = 0 if opt.dump_beam != "": import json translator.initBeamAccum() data = onmt.IO.ONMTDataset(opt.src, opt.tgt, translator.fields, None) testData = onmt.IO.OrderedIterator( dataset=data, device=opt.gpu, batch_size=opt.batch_size, train=False, sort=False, shuffle=False) index = 0 for batch in testData: predBatch, predScore, goldScore, attn, src \ = translator.translate(batch, data) predScoreTotal += sum(score[0] for score in predScore) predWordsTotal += sum(len(x[0]) for x in predBatch) if opt.tgt: goldScoreTotal += sum(goldScore) goldWordsTotal += sum(len(x) for x in tgtBatch) for b in range(len(predBatch)): count += 1 try: # python2 outF.write(" ".join([i.decode('utf-8') for i in predBatch[b][0]]) + '\n') except AttributeError: # python3: can't do .decode on a str object outF.write(" ".join(predBatch[b][0]) + '\n') outF.flush() if opt.verbose: words = [] for f in src[:, b]: word = translator.fields["src"].vocab.itos[f] if word == onmt.IO.PAD_WORD: break words.append(word) os.write(1, bytes('SENT %d: %s\n' % (count, " ".join(words)), 'UTF-8')) index += 1 print(len(predBatch[b][0])) os.write(1, bytes('\n PRED %d: %s\n' % (count, " ".join(predBatch[b][0])), 'UTF-8')) print("PRED SCORE: %.4f" % predScore[b][0]) if opt.tgt: tgtSent = ' '.join(tgtBatch[b]) os.write(1, bytes('GOLD %d: %s\n' % (count, tgtSent), 'UTF-8')) print("GOLD SCORE: %.4f" % goldScore[b]) if opt.n_best > 1: print('\nBEST HYP:') for n in range(opt.n_best): os.write(1, bytes("[%.4f] %s\n" % (predScore[b][n], " ".join(predBatch[b][n])), 'UTF-8')) if opt.attn_debug: print('') for i, w in enumerate(predBatch[b][0]): print(w) _, ids = attn[b][0][i].sort(0, descending=True) for j in ids[:5].tolist(): print("\t%s\t%d\t%3f" % (srcBatch[b][j], j, attn[b][0][i][j])) srcBatch, tgtBatch = [], [] reportScore('PRED', predScoreTotal, predWordsTotal) if opt.tgt: reportScore('GOLD', goldScoreTotal, goldWordsTotal) if opt.dump_beam: json.dump(translator.beam_accum, codecs.open(opt.dump_beam, 'w', 'utf-8'))
def main(): dummy_parser = argparse.ArgumentParser(description='train.py') opts.model_opts(dummy_parser) dummy_opt = dummy_parser.parse_known_args([])[0] opt.cuda = opt.gpu > -1 if opt.cuda: torch.cuda.set_device(opt.gpu) # Load the model. fields, model, model_opt = \ onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__) # File to write sentences to. out_file = codecs.open(opt.output, 'w', 'utf-8') # Test data data = onmt.io.build_dataset(fields, opt.data_type, opt.src, opt.tgt, src_dir=opt.src_dir, sample_rate=opt.sample_rate, window_size=opt.window_size, window_stride=opt.window_stride, window=opt.window, use_filter_pred=False) # Sort batch by decreasing lengths of sentence required by pytorch. # sort=False means "Use dataset's sortkey instead of iterator's". data_iter = onmt.io.OrderedIterator( dataset=data, device=opt.gpu, batch_size=opt.batch_size, train=False, sort=False, sort_within_batch=True, shuffle=False) # Translator scorer = onmt.translate.GNMTGlobalScorer(opt.alpha, opt.beta, opt.coverage_penalty, opt.length_penalty) translator = onmt.translate.Translator( model, fields, beam_size=opt.beam_size, n_best=opt.n_best, global_scorer=scorer, max_length=opt.max_length, copy_attn=model_opt.copy_attn, cuda=opt.cuda, beam_trace=opt.dump_beam != "", min_length=opt.min_length, stepwise_penalty=opt.stepwise_penalty) builder = onmt.translate.TranslationBuilder( data, translator.fields, opt.n_best, opt.replace_unk, opt.tgt) # Statistics counter = count(1) pred_score_total, pred_words_total = 0, 0 gold_score_total, gold_words_total = 0, 0 for batch in data_iter: batch_data = translator.translate_batch(batch, data) translations = builder.from_batch(batch_data) for trans in translations: pred_score_total += trans.pred_scores[0] pred_words_total += len(trans.pred_sents[0]) if opt.tgt: gold_score_total += trans.gold_score gold_words_total += len(trans.gold_sent) + 1 n_best_preds = [" ".join(pred) for pred in trans.pred_sents[:opt.n_best]] out_file.write('\n'.join(n_best_preds)) out_file.write('\n') out_file.flush() if opt.verbose: sent_number = next(counter) output = trans.log(sent_number) os.write(1, output.encode('utf-8')) _report_score('PRED', pred_score_total, pred_words_total) if opt.tgt: _report_score('GOLD', gold_score_total, gold_words_total) if opt.report_bleu: _report_bleu() if opt.report_rouge: _report_rouge() if opt.dump_beam: import json json.dump(translator.beam_accum, codecs.open(opt.dump_beam, 'w', 'utf-8'))
import argparse
import glob
import random

import torch
from torch import cuda

import opts

print(torch.cuda.is_available())
print(cuda.device_count())
print(cuda.current_device())

parser = argparse.ArgumentParser(
    description='train.py',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# opts.py
opts.add_md_help_argument(parser)
opts.model_opts(parser)
opts.train_opts(parser)

opt = parser.parse_args()

if opt.word_vec_size != -1:
    opt.src_word_vec_size = opt.word_vec_size
    opt.tgt_word_vec_size = opt.word_vec_size

if opt.layers != -1:
    opt.enc_layers = opt.layers
    opt.dec_layers = opt.layers

opt.brnn = (opt.encoder_type == "brnn")

if opt.seed > 0:
    random.seed(opt.seed)
    torch.manual_seed(opt.seed)
def main(): dummy_parser = argparse.ArgumentParser(description='train.py') opts.model_opts(dummy_parser) dummy_opt = dummy_parser.parse_known_args([])[0] opt.cuda = opt.gpu > -1 if opt.cuda: torch.cuda.set_device(opt.gpu) translator = onmt.Translator(opt, dummy_opt.__dict__) out_file = codecs.open(opt.output, 'w', 'utf-8') pred_score_total, pred_words_total = 0, 0 gold_score_total, gold_words_total = 0, 0 if opt.dump_beam != "": import json translator.initBeamAccum() data = onmt.IO.ONMTDataset(opt.src, opt.tgt, translator.fields, use_filter_pred=False) test_data = onmt.IO.OrderedIterator(dataset=data, device=opt.gpu, batch_size=opt.batch_size, train=False, sort=False, shuffle=False) counter = count(1) for batch in test_data: pred_batch, gold_batch, pred_scores, gold_scores, attn, src \ = translator.translate(batch, data) pred_score_total += sum(score[0] for score in pred_scores) pred_words_total += sum(len(x[0]) for x in pred_batch) if opt.tgt: gold_score_total += sum(gold_scores) gold_words_total += sum(len(x) for x in batch.tgt[1:]) # z_batch: an iterator over the predictions, their scores, # the gold sentence, its score, and the source sentence for each # sentence in the batch. It has to be zip_longest instead of # plain-old zip because the gold_batch has length 0 if the target # is not included. z_batch = zip_longest(pred_batch, gold_batch, pred_scores, gold_scores, (sent.squeeze(1) for sent in src.split(1, dim=1))) for pred_sents, gold_sent, pred_score, gold_score, src_sent in z_batch: n_best_preds = [" ".join(pred) for pred in pred_sents[:opt.n_best]] out_file.write('\n'.join(n_best_preds)) out_file.write('\n') out_file.flush() if opt.verbose: sent_number = next(counter) words = get_src_words(src_sent, translator.fields["src"].vocab.itos) os.write( 1, bytes('\nSENT %d: %s\n' % (sent_number, words), 'UTF-8')) best_pred = n_best_preds[0] best_score = pred_score[0] os.write( 1, bytes('PRED %d: %s\n' % (sent_number, best_pred), 'UTF-8')) print("PRED SCORE: %.4f" % best_score) if opt.tgt: tgt_sent = ' '.join(gold_sent) os.write( 1, bytes('GOLD %d: %s\n' % (sent_number, tgt_sent), 'UTF-8')) print("GOLD SCORE: %.4f" % gold_score) if len(n_best_preds) > 1: print('\nBEST HYP:') for score, sent in zip(pred_score, n_best_preds): os.write(1, bytes("[%.4f] %s\n" % (score, sent), 'UTF-8')) report_score('PRED', pred_score_total, pred_words_total) if opt.tgt: report_score('GOLD', gold_score_total, gold_words_total) if opt.dump_beam: json.dump(translator.beam_accum, codecs.open(opt.dump_beam, 'w', 'utf-8'))
def main():
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # Load the model.
    fields, model, model_opt = \
        onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__)

    # File to write sentences to.
    out_file = codecs.open(opt.output, 'w', 'utf-8')

    # Test data
    data = onmt.io.build_dataset(fields, opt.data_type,
                                 opt.src, opt.tgt,
                                 src_dir=opt.src_dir,
                                 sample_rate=opt.sample_rate,
                                 window_size=opt.window_size,
                                 window_stride=opt.window_stride,
                                 window=opt.window,
                                 use_filter_pred=False)

    # Sort batch by decreasing lengths of sentence required by pytorch.
    # sort=False means "Use dataset's sortkey instead of iterator's".
    data_iter = onmt.io.OrderedIterator(
        dataset=data, device=opt.gpu,
        batch_size=opt.batch_size, train=False, sort=False,
        sort_within_batch=True, shuffle=False)

    # Translator
    scorer = onmt.translate.GNMTGlobalScorer(opt.alpha, opt.beta)
    translator = onmt.translate.Translator(
        model, fields,
        beam_size=opt.beam_size,
        n_best=opt.n_best,
        global_scorer=scorer,
        max_length=opt.max_length,
        copy_attn=model_opt.copy_attn,
        cuda=opt.cuda,
        beam_trace=opt.dump_beam != "",
        dump_beam=opt.dump_beam,
        min_length=opt.min_length,
        antilm_lambda=opt.antilm_lambda,
        antilm_eta=opt.antilm_eta,
        antilm_equal_src=opt.antilm_equal_src,
        lambda_ADBS=opt.lambda_ADBS,
        affective_decoding=opt.affective_decoding,
        k=opt.k,
        sort_AS=opt.sort_AS,
        sort_similarity=opt.sort_similarity,
        penalize_repeats=opt.penalize_repeats)

    builder = onmt.translate.TranslationBuilder(
        data, translator.fields,
        opt.n_best, opt.replace_unk, opt.tgt)

    # Statistics
    counter = count(1)
    pred_score_total, pred_words_total = 0, 0
    gold_score_total, gold_words_total = 0, 0
    predictions = []

    # Test word embedding
    # print(model.decoder.embeddings.word_lut.weight.data[200:220, -10:])
    # print(model.decoder.embeddings.embedding_copy.weight.data[200:220, -10:])

    # Load adj vocab
    pretrained_adj = None
    if opt.adj_vocab:
        print("Loading adj vocab...")
        pretrained_adj = torch.load(opt.adj_vocab)

    word_freq = None
    if opt.weighted_AS:
        print("Loading unigram frequency...")
        with open(opt.weighted_AS, "rb") as f:
            word_freq = np.array(pickle.load(f))

    # Load word embedding matrix and VAD embedding matrix,
    # pass them to translate_batch()
    if opt.save_attn:
        pred_ids = []
        attns = []
        indices = []

    for batch in data_iter:
        batch_data = translator.translate_batch(batch, data)

        if opt.save_attn:
            pred_ids.append(batch_data["predictions"])
            attns.append(batch_data["attention"])
            indices.append(batch_data["batch"].indices)

        # Rerank beams
        if opt.rerank:
            batch_data = rerank(model, batch_data)
        predictions += batch_data["predictions"]

        translations = builder.from_batch(batch_data)

        for trans in translations:
            pred_score_total += trans.pred_scores[0]
            pred_words_total += len(trans.pred_sents[0])
            if opt.tgt:
                gold_score_total += trans.gold_score
                gold_words_total += len(trans.gold_sent)

            n_best_preds = [" ".join(pred)
                            for pred in trans.pred_sents[:opt.n_best]]
            if opt.display_1:
                n_best_preds = [" ".join(pred)
                                for pred in trans.pred_sents[:1]]
            out_file.write('\n'.join(n_best_preds))
            out_file.write('\n')
            out_file.flush()

            if opt.verbose:
                sent_number = next(counter)
                output = trans.log(sent_number)
                os.write(1, output.encode('utf-8'))

    if opt.save_attn:
        with open(opt.save_attn + ".pkl", "wb") as f:
            pickle.dump(attns, f)
        with open(opt.save_attn + "_predictions" + ".pkl", "wb") as f:
            pickle.dump(pred_ids, f)
        with open(opt.save_attn + "_indices", "wb") as f:
            pickle.dump(indices, f)

    pred_score = _report_score('PRED', pred_score_total, pred_words_total)
    out_file.write(pred_score + "\n")
    out_file.flush()

    # Evaluate predictions here
    metrics = evaluate_predictions(model, pretrained_adj, predictions,
                                   word_freq)
    out_file.write(metrics + "\n")
    out_file.flush()

    if opt.tgt:
        gold_score = _report_score('GOLD', gold_score_total,
                                   gold_words_total)
        if opt.report_bleu:
            _report_bleu()
        if opt.report_rouge:
            _report_rouge()
        out_file.write(gold_score + "\n")
        out_file.flush()

    if opt.dump_beam:
        import json
        json.dump(translator.beam_accum,
                  codecs.open(opt.dump_beam, 'w', 'utf-8'))
def main():
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # Load the model.
    fields, model, model_opt = \
        onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__)

    # File to write sentences to.
    out_file = codecs.open(opt.output, 'w', 'utf-8')

    # Test data
    data = onmt.io.build_dataset(fields,
                                 opt.src_data_type, opt.tgt_data_type,
                                 opt.src, opt.tgt,
                                 src_dir=opt.src_dir,
                                 sample_rate=opt.sample_rate,
                                 window_size=opt.window_size,
                                 window_stride=opt.window_stride,
                                 window=opt.window,
                                 use_filter_pred=False)

    # Sort batch by decreasing lengths of sentence required by pytorch.
    # sort=False means "Use dataset's sortkey instead of iterator's".
    if opt.src_data_type == "trigrams" and opt.tgt_data_type == "words":
        data_iter = onmt.io.IO.SourceTrigramsOrderedIterator(
            dataset=data, device=opt.gpu,
            batch_size=opt.batch_size, train=False, sort=False,
            sort_within_batch=True, shuffle=False)
    elif opt.src_data_type == "words" and opt.tgt_data_type == "characters":
        data_iter = onmt.io.IO.TargetCharactersOrderedIterator(
            dataset=data, device=opt.gpu,
            batch_size=opt.batch_size, train=False, sort=False,
            sort_within_batch=True, shuffle=False)
    elif opt.src_data_type == "trigrams" and opt.tgt_data_type == "characters":
        data_iter = onmt.io.IO.BothCharactersOrderedIterator(
            dataset=data, device=opt.gpu,
            batch_size=opt.batch_size, train=False, sort=False,
            sort_within_batch=True, shuffle=False)
    else:
        data_iter = onmt.io.OrderedIterator(
            dataset=data, device=opt.gpu,
            batch_size=opt.batch_size, train=False, sort=False,
            sort_within_batch=True, shuffle=False)

    # Translator
    scorer = onmt.translate.GNMTGlobalScorer(opt.alpha, opt.beta)
    translator = onmt.translate.Translator(
        model, fields,
        beam_size=opt.beam_size,
        n_best=opt.n_best,
        global_scorer=scorer,
        max_length=opt.max_length,
        copy_attn=model_opt.copy_attn,
        cuda=opt.cuda,
        beam_trace=opt.dump_beam != "",
        min_length=opt.min_length)

    builder = onmt.translate.TranslationBuilder(
        data, translator.fields,
        opt.n_best, opt.replace_unk, opt.tgt)

    # Statistics
    counter = count(1)
    pred_score_total, pred_words_total = 0, 0
    gold_score_total, gold_words_total = 0, 0

    for batch in data_iter:
        if opt.tgt_data_type == 'words':
            batch_data = translator.translate_batch(batch, data)
        else:
            batch_data = translator.beam_translate(batch, data)
            # batch_data = translator.greedy_translate(batch, data)

        translations = builder.from_batch(batch_data)

        for trans in translations:
            pred_score_total += trans.pred_scores[0]
            if not trans.pred_sents:
                trans.pred_sents = [' ']
            pred_words_total += len(trans.pred_sents[0])
            if opt.tgt:
                gold_score_total += trans.gold_score
                gold_words_total += len(trans.gold_sent)

            if opt.tgt_data_type == 'characters':
                n_best_preds = [''.join(pred) for pred in trans.pred_sents]
                output = []
                for w in n_best_preds:
                    if w == '$$':
                        output.append(" ")
                    else:
                        # output.append(w[1])
                        output.append(w)
                out_file.write(''.join(output))
            else:
                n_best_preds = [" ".join(pred)
                                for pred in trans.pred_sents[:opt.n_best]]
                out_file.write('\n'.join(n_best_preds))
            out_file.write('\n')
            out_file.flush()

            if opt.verbose:
                sent_number = next(counter)
                output = trans.log(sent_number)
                os.write(1, output.encode('utf-8'))

    _report_score('PRED', pred_score_total, pred_words_total)
    if opt.tgt:
        _report_score('GOLD', gold_score_total, gold_words_total)
        if opt.report_bleu:
            _report_bleu()
        if opt.report_rouge:
            _report_rouge()

    if opt.dump_beam:
        import json
        json.dump(translator.beam_accum,
                  codecs.open(opt.dump_beam, 'w', 'utf-8'))
def main():
    dummy_parser = argparse.ArgumentParser(description='train_mm.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # loading checkpoint just to find multimodal model type
    checkpoint = torch.load(opt.model,
                            map_location=lambda storage, loc: storage)
    opt.multimodal_model_type = checkpoint['opt'].multimodal_model_type
    del checkpoint

    if opt.batch_size > 1:
        print("Batch size > 1 not implemented! "
              "Falling back to batch_size = 1 ...")
        opt.batch_size = 1

    # load test image features
    test_file = tables.open_file(opt.path_to_test_img_feats, mode='r')
    if opt.multimodal_model_type in ['imgd', 'imge', 'imgw']:
        test_img_feats = test_file.root.global_feats[:]
    elif opt.multimodal_model_type in ['src+img']:
        test_img_feats = test_file.root.local_feats[:]
    else:
        raise Exception("Model type not implemented: %s" %
                        opt.multimodal_model_type)
    test_file.close()

    # Load the model.
    fields, model, model_opt = \
        onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__)

    # File to write sentences to.
    out_file = codecs.open(opt.output, 'w', 'utf-8')

    # Test data
    data = onmt.io.build_dataset(fields, opt.data_type,
                                 opt.src, opt.tgt,
                                 src_dir=opt.src_dir,
                                 sample_rate=opt.sample_rate,
                                 window_size=opt.window_size,
                                 window_stride=opt.window_stride,
                                 window=opt.window,
                                 use_filter_pred=False)

    # Sort batch by decreasing lengths of sentence required by pytorch.
    # sort=False means "Use dataset's sortkey instead of iterator's".
    data_iter = onmt.io.OrderedIterator(
        dataset=data, device=opt.gpu,
        batch_size=opt.batch_size, train=False, sort=False,
        sort_within_batch=True, shuffle=False)

    # Translator
    scorer = onmt.translate.GNMTGlobalScorer(opt.alpha, opt.beta)
    translator = onmt.translate.TranslatorMultimodal(
        model, fields,
        beam_size=opt.beam_size,
        n_best=opt.n_best,
        global_scorer=scorer,
        max_length=opt.max_length,
        copy_attn=model_opt.copy_attn,
        cuda=opt.cuda,
        beam_trace=opt.dump_beam != "",
        min_length=opt.min_length,
        test_img_feats=test_img_feats,
        multimodal_model_type=opt.multimodal_model_type)

    builder = onmt.translate.TranslationBuilder(
        data, translator.fields,
        opt.n_best, opt.replace_unk, opt.tgt)

    # Statistics
    counter = count(1)
    pred_score_total, pred_words_total = 0, 0
    gold_score_total, gold_words_total = 0, 0

    for sent_idx, batch in enumerate(data_iter):
        batch_data = translator.translate_batch(batch, data, sent_idx)
        translations = builder.from_batch(batch_data)

        for trans in translations:
            pred_score_total += trans.pred_scores[0]
            pred_words_total += len(trans.pred_sents[0])
            if opt.tgt:
                gold_score_total += trans.gold_score
                gold_words_total += len(trans.gold_sent)

            n_best_preds = [" ".join(pred)
                            for pred in trans.pred_sents[:opt.n_best]]
            out_file.write('\n'.join(n_best_preds))
            out_file.write('\n')
            out_file.flush()

            if opt.verbose:
                sent_number = next(counter)
                output = trans.log(sent_number)
                os.write(1, output.encode('utf-8'))

    _report_score('PRED', pred_score_total, pred_words_total)
    if opt.tgt:
        _report_score('GOLD', gold_score_total, gold_words_total)
        if opt.report_bleu:
            _report_bleu()
        if opt.report_rouge:
            _report_rouge()

    if opt.dump_beam:
        import json
        json.dump(translator.beam_accum,
                  codecs.open(opt.dump_beam, 'w', 'utf-8'))
def main(): opt = parser.parse_args() dummy_parser = argparse.ArgumentParser(description='train.py') opts.model_opts(dummy_parser) dummy_opt = dummy_parser.parse_known_args([])[0] opt.cuda = opt.gpu > -1 if opt.cuda: torch.cuda.set_device(opt.gpu) #Creates the translator!!! translator = onmt.Translator(opt, dummy_opt.__dict__) outF = codecs.open(opt.output, 'w', 'utf-8') predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0 count = 0 if opt.dump_beam != "": import json translator.initBeamAccum() #Process the data for the test data = onmt.IO.ONMTDataset(opt.src, opt.tgt, translator.fields, None, opt.inter) testData = onmt.IO.OrderedIterator( dataset=data, device=opt.gpu, batch_size=opt.batch_size, train=False, sort=False, shuffle=False) if opt.inter != None: inter_act=True else: inter_act=False index = 0 for batch in testData: #I will need to change the translator! predBatch, goldBatch, predScore, goldScore, attn, src \ = translator.translate(batch, data, inter_act) #print((attn[0][0])) #print (predBatch) if opt.save_attention: attn_numpy=attn[0][0].numpy() #print(attn_numpy.T.shape) pickle.dump(attn_numpy.T,open('attention_matrix.pkl','wb')) predScoreTotal += sum(score[0] for score in predScore) predWordsTotal += sum(len(x[0]) for x in predBatch) if opt.tgt: goldScoreTotal += sum(goldScore) goldWordsTotal += sum(len(x) for x in batch.tgt[1:]) for b in range(len(predBatch)): count += 1 try: # python2 (should be the same) for n in range(opt.n_best): outF.write(" ".join([i for i in predBatch[b][n]]) + '\n') except AttributeError: # python3: can't do .decode on a str object for n in range(opt.n_best): outF.write(" ".join(predBatch[b][n]) + '\n') outF.flush() if opt.verbose: words = [] for f in src[:, b]: word = translator.fields["src"].vocab.itos[f] if word == onmt.IO.PAD_WORD: break words.append(word) os.write(1, bytes('\nSENT %d: %s\n' % (count, " ".join(words)), 'UTF-8')) index += 1 os.write(1, bytes('PRED %d: %s\n' % (count, " ".join(predBatch[b][0])), 'UTF-8')) print("PRED SCORE: %.4f" % predScore[b][0]) if opt.tgt: tgtSent = ' '.join(goldBatch[b]) os.write(1, bytes('GOLD %d: %s\n' % (count, tgtSent), 'UTF-8')) print("GOLD SCORE: %.4f" % goldScore[b]) if opt.n_best > 1: print('\nBEST HYP:') for n in range(opt.n_best): os.write(1, bytes("[%.4f] %s\n" % (predScore[b][n], " ".join(predBatch[b][n])), 'UTF-8')) reportScore('PRED', predScoreTotal, predWordsTotal) if opt.tgt: reportScore('GOLD', goldScoreTotal, goldWordsTotal) if opt.dump_beam: json.dump(translator.beam_accum, codecs.open(opt.dump_beam, 'w', 'utf-8'))
import argparse
import copy
import unittest
import math

import torch
from torch.autograd import Variable

import onmt
import onmt.io
import opts
from onmt.ModelConstructor import make_embeddings, \
    make_encoder, make_decoder
from onmt.modules import ImageEncoder, AudioEncoder

parser = argparse.ArgumentParser(description='train.py')
opts.model_opts(parser)
opts.train_opts(parser)

# -data option is required, but not used in this test, so dummy.
opt = parser.parse_known_args(['-data', 'dummy'])[0]


class TestModel(unittest.TestCase):

    def __init__(self, *args, **kwargs):
        super(TestModel, self).__init__(*args, **kwargs)
        self.opt = opt

    # Helper to generate a vocabulary
    def get_vocab(self):
def main():
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # Load the model.
    fields, model, model_opt = \
        onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__)

    # File to write sentences to.
    out_file = codecs.open(opt.output, 'w', 'utf-8')

    # Test data
    data = onmt.io.build_dataset(
        fields, opt.data_type, opt.src, opt.tgt,
        src_dir=opt.src_dir,
        sample_rate=opt.sample_rate,
        window_size=opt.window_size,
        window_stride=opt.window_stride,
        window=opt.window,
        use_filter_pred=False,
        symbol_representation=opt.symbol_representation,
        revert_targets=opt.revert_targets)

    # Sort batch by decreasing lengths of sentence required by pytorch.
    # sort=False means "Use dataset's sortkey instead of iterator's".
    data_iter = onmt.io.OrderedIterator(
        dataset=data, device=opt.gpu,
        batch_size=opt.batch_size, train=False, sort=False,
        sort_within_batch=True, shuffle=False)

    # Translator
    scorer = onmt.translate.GNMTGlobalScorer(opt.alpha, opt.beta)
    translator = onmt.translate.Translator(
        model, fields,
        beam_size=opt.beam_size,
        n_best=opt.n_best,
        global_scorer=scorer,
        max_length=opt.max_length,
        copy_attn=model_opt.copy_attn,
        cuda=opt.cuda,
        beam_trace=opt.dump_beam != "",
        min_length=opt.min_length)

    builder = onmt.translate.TranslationBuilder(
        data, translator.fields,
        opt.n_best, opt.replace_unk, opt.tgt)

    # Statistics
    counter = count(1)
    pred_score_total, pred_words_total = 0, 0
    gold_score_total, gold_words_total = 0, 0

    src_sequence_join_character, tgt_sequence_join_character = \
        get_src_tgt_sequence_join_character(opt.symbol_representation)

    # Convert the number of variations that should be created for each input
    # to int; the default is n_best hypotheses per input.
    num_variations_for_input = collections.defaultdict(lambda: opt.n_best)
    if opt.num_variations:
        for i, v in enumerate(opt.num_variations.split(",")):
            num_variations_for_input[i] = int(v)
    print("%s variations per input" % str(num_variations_for_input))

    input_counter = 0
    for batch in data_iter:
        batch_data = translator.translate_batch(batch, data)
        translations = builder.from_batch(batch_data)

        for trans in translations:
            pred_score_total += trans.pred_scores[0]
            pred_words_total += len(trans.pred_sents[0])
            n_best_pred_scores = trans.pred_scores[:opt.n_best]
            if opt.tgt:
                gold_score_total += trans.gold_score
                gold_words_total += len(trans.gold_sent)

            if opt.revert_targets:
                n_best_preds = [
                    tgt_sequence_join_character.join(reversed(pred))
                    for pred in trans.pred_sents[:opt.n_best]]
            else:
                n_best_preds = [
                    tgt_sequence_join_character.join(pred)
                    for pred in trans.pred_sents[:opt.n_best]]

            # provide src
            if opt.verbose:
                out_file.write(
                    "input:\t" +
                    src_sequence_join_character.join(trans.src_raw) + "\n")
                n_best_preds = ["hyp %d:\t%s" % (i, pred)
                                for i, pred in enumerate(n_best_preds)]

            n_best_preds = [pred.strip().replace("\n", " ")
                            for pred in n_best_preds]

            num_variations = num_variations_for_input[input_counter]
            # Fill up if fewer hypotheses were returned by beam search than
            # the required number of variations.
            if len(n_best_preds) < num_variations:
                num_preds_to_add = num_variations - len(n_best_preds)
                n_best_preds.extend(n_best_preds[i % len(n_best_preds)]
                                    for i in range(0, num_preds_to_add))
                n_best_pred_scores.extend(
                    n_best_pred_scores[i % len(n_best_preds)]
                    for i in range(0, num_preds_to_add))

            if opt.stochastic:
                preds = np.random.choice(n_best_preds, num_variations,
                                         replace=False)
            else:
                preds = n_best_preds[:num_variations]

            if opt.report_individual_scores:
                preds_and_scores = []
                for (pred, pred_score) in zip(preds, n_best_pred_scores):
                    preds_and_scores.append(
                        "\t".join([pred.strip(), str(pred_score)]))
                preds = preds_and_scores

            out_file.write('\n'.join(preds) + "\n")
            if opt.n_best > 1:
                out_file.write('\n')

            if opt.verbose:
                sent_number = next(counter)
                output = trans.log(sent_number)
                os.write(1, output.encode('utf-8'))

            input_counter += 1

    _report_score('PRED', pred_score_total, pred_words_total)
    if opt.tgt:
        _report_score('GOLD', gold_score_total, gold_words_total)
        if opt.report_bleu:
            _report_bleu()
        if opt.report_rouge:
            _report_rouge()

    if opt.dump_beam:
        import json
        json.dump(translator.beam_accum,
                  codecs.open(opt.dump_beam, 'w', 'utf-8'))
    output = []
    for rule in app.url_map.iter_rules():
        options = {}
        for arg in rule.arguments:
            options[arg] = "[{0}]".format(arg)
        methods = ','.join(rule.methods)
        url = url_for(rule.endpoint, **options)
        print(url)
        output.append(url)
    return jsonify(' '.join(output))

CORS(app)
http_server = WSGIServer((host, port), app)
logger.info("Model loaded, serving deepsegment on port %d" % port)
http_server.serve_forever()

if __name__ == '__main__':
    opt = opts.model_opts()
    # safe_load avoids PyYAML's unsafe default loader.
    config = yaml.safe_load(open(opt.config, "r"))
    config = Namespace(**config, **vars(opt))
    device, devices_id = misc_utils.set_cuda(config)
    config.device = device
    # stdout_handler = prepare_global_logging(args.serialization_dir,
    #                                         args.file_friendly_logging)
    start(config, url_root=config.url_root, host=config.ip, port=config.port)
    # cleanup_global_logging(stdout_handler)
def main(): dummy_parser = argparse.ArgumentParser(description='train.py') opts.model_opts(dummy_parser) dummy_opt = dummy_parser.parse_known_args([])[0] opt.cuda = opt.gpu > -1 if opt.cuda: torch.cuda.set_device(opt.gpu) # Load the model. fields, model, model_opt = \ onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__) # File to write sentences to. out_file = codecs.open(opt.output, 'w', 'utf-8') # Test data data = onmt.io.build_dataset(fields, opt.data_type, opt.src, opt.tgt, src_dir=opt.src_dir, sample_rate=opt.sample_rate, window_size=opt.window_size, window_stride=opt.window_stride, window=opt.window, use_filter_pred=False) test_data = onmt.io.OrderedIterator( dataset=data, device=opt.gpu, batch_size=opt.batch_size, train=False, sort=False, shuffle=False) # Translator scorer = onmt.translate.GNMTGlobalScorer(opt.alpha, opt.beta) translator = onmt.translate.Translator(model, fields, beam_size=opt.beam_size, n_best=opt.n_best, global_scorer=scorer, max_length=opt.max_sent_length, copy_attn=model_opt.copy_attn, cuda=opt.cuda, beam_trace=opt.dump_beam != "") builder = onmt.translate.TranslationBuilder( data, translator.fields, opt.n_best, opt.replace_unk, opt.tgt) # Statistics counter = count(1) pred_score_total, pred_words_total = 0, 0 gold_score_total, gold_words_total = 0, 0 for batch in test_data: batch_data = translator.translate_batch(batch, data) translations = builder.from_batch(batch_data) for trans in translations: pred_score_total += trans.pred_scores[0] pred_words_total += len(trans.pred_sents[0]) if opt.tgt: gold_score_total += trans.gold_score gold_words_total += len(trans.gold_sent) n_best_preds = [" ".join(pred) for pred in trans.pred_sents[:opt.n_best]] out_file.write('\n'.join(n_best_preds)) out_file.write('\n') out_file.flush() if opt.verbose: sent_number = next(counter) output = trans.log(sent_number) os.write(1, output.encode('utf-8')) def report_score(name, score_total, words_total): print("%s AVG SCORE: %.4f, %s PPL: %.4f" % ( name, score_total / words_total, name, math.exp(-score_total/words_total))) report_score('PRED', pred_score_total, pred_words_total) if opt.tgt: report_score('GOLD', gold_score_total, gold_words_total) if opt.dump_beam: import json json.dump(translator.beam_accum, codecs.open(opt.dump_beam, 'w', 'utf-8'))
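# The report_score() arithmetic above, spelled out with hypothetical totals:
# the average per-word model score and the perplexity exp(-score / words).
import math

score_total, words_total = -120.0, 100          # hypothetical totals
avg_score = score_total / words_total           # -1.2 per word
ppl = math.exp(-score_total / words_total)      # exp(1.2) ~= 3.32
print("AVG SCORE: %.4f, PPL: %.4f" % (avg_score, ppl))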
def translate(src, model, output): parser = argparse.ArgumentParser( description='translate.py', formatter_class=argparse.ArgumentDefaultsHelpFormatter) opts.translate_opts(parser) opt = parser.parse_known_args([])[0] if opt.batch_size != 1: print("WARNING: -batch_size isn't supported currently, " "we set it to 1 for now!") opt.batch_size = 1 opt.src = src opt.model = model opt.output = output dummy_parser = argparse.ArgumentParser(description='train.py') opts.model_opts(dummy_parser) dummy_opt = dummy_parser.parse_known_args([])[0] opt.cuda = opt.gpu > -1 if opt.cuda: torch.cuda.set_device(opt.gpu) translator = onmt.Translator(opt, dummy_opt.__dict__) out_file = codecs.open(opt.output, 'w', 'utf-8') gold_out_file = codecs.open("gold_" + opt.output, 'w', 'utf-8') #print "TRANSLATOR SOURCE VOCAB" #for i in range(len(translator.fields["src"].vocab.itos)): # print i, translator.fields["src"].vocab.itos[i] #print data = onmt.IO.ONMTDataset(opt.src, opt.tgt, translator.fields, use_filter_pred=False) test_data = onmt.IO.OrderedIterator(dataset=data, device=opt.gpu, batch_size=opt.batch_size, train=False, sort=False, shuffle=False) counter = count(1) for batch in test_data: pred_batch, gold_batch, pred_scores, gold_scores, attn, src \ = translator.translate(batch, data) # z_batch: an iterator over the predictions, their scores, # the gold sentence, its score, and the source sentence for each # sentence in the batch. It has to be zip_longest instead of # plain-old zip because the gold_batch has length 0 if the target # is not included. z_batch = zip_longest(pred_batch, gold_batch, pred_scores, gold_scores, (sent.squeeze(1) for sent in src.split(1, dim=1))) for pred_sents, gold_sent, pred_score, gold_score, src_sent in z_batch: n_best_preds = [" ".join(pred) for pred in pred_sents[:opt.n_best]] out_file.write('\n'.join(n_best_preds)) out_file.write('\n') out_file.flush() words = get_src_words(src_sent, translator.fields["src"].vocab.itos) #print words gold_out_file.write(words) gold_out_file.write('\n') gold_out_file.flush()
def main(training=False, fields=None, model=None, opt=None, writer=None, step=0, corpus_type="dev", multi_process=False): time = Time() if training: assert fields is not None assert model is not None assert opt is not None model.eval() model.generator.eval() opt.cuda = opt.gpu > -1 if opt.cuda: torch.cuda.set_device(opt.gpu) out_file = codecs.open( "{}_{}_pred_{}.txt".format(opt.save_model, corpus_type.replace("/", "_"), str(step)), "w", "utf-8") print("Output file: ", out_file.name) copy_attn = opt.copy_attn model_opt = opt else: # Load the model. parser = argparse.ArgumentParser( description='translate.py', formatter_class=argparse.ArgumentDefaultsHelpFormatter) opts.add_md_help_argument(parser) opts.translate_opts(parser) opt = parser.parse_args() dummy_parser = argparse.ArgumentParser(description='train.py') opts.model_opts(dummy_parser) dummy_opt = dummy_parser.parse_known_args([])[0] opt.cuda = opt.gpu > -1 if opt.cuda: torch.cuda.set_device(opt.gpu) fields, model, model_opt = \ onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__) out_file = codecs.open(opt.output, 'w', 'utf-8') assert opt.tgt is None data = onmt.io.build_dataset(fields, opt.src, opt.tgt, use_filter_pred=False, ngram=model_opt.ngram) # Sort batch by decreasing lengths of sentence required by pytorch. # sort=False means "Use dataset's sortkey instead of iterator's". data_iter = onmt.io.OrderedIterator(dataset=data, device=opt.gpu, batch_size=opt.translate_batch_size, train=False, sort=False, sort_within_batch=True, shuffle=False) output, pred_score_total, pred_words_total = \ translate_single_process(opt, model, fields, data, data_iter, f=out_file) outfile_name = out_file.name if opt.bpe: import subprocess subprocess.check_output("sed 's/\@\@ //g' < {} > {}".format( outfile_name, outfile_name + ".nonbpe"), shell=True) outfile_name = outfile_name + ".nonbpe" if opt.new_bpe: generate_nonbpe(outfile_name) outfile_name = outfile_name + ".nonbpe" # if writer is not None: # ratio_stats.log_tensorboard(writer, step) # _report_score('PRED', pred_score_total, pred_words_total, writer, step, corpus_type) metric = 0 if opt.tgt: # _report_score('GOLD', gold_score_total, gold_words_total, writer, step, corpus_type) if opt.report_single_bleu: metric = _report_single_source_bleu(opt, outfile_name, writer, step, corpus_type) if opt.report_multi_bleu: metric = _report_multi_source_bleu(outfile_name, writer, step, corpus_type) if opt.report_rouge: metric = _report_rouge(opt) # if opt.dump_beam: # import json # json.dump(translator.beam_accum, # codecs.open(opt.dump_beam, 'w', 'utf-8')) time.timeit(task="Translation Testing") return metric