def processLines(self, path_to_file, opt, dummy_opt):
    """Encode every sentence of *path_to_file* and collect training samples.

    Each line of the file is expected to start with two integers: the
    position of the word whose encoder output should be extracted, and the
    (semantic) label for that word.  An ONMT dataset is then built over the
    same file and iterated in file order (sort/shuffle disabled), and one
    ``{'x': encoding, 'y': label}`` dict per line is appended to
    ``self.dataset``.
    """
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    translator = onmt.Translator(opt, dummy_opt.__dict__)

    words_idx = []
    sems_idx = []
    with codecs.open(path_to_file, "r", "utf-8") as corpus_file:
        for line in corpus_file:
            # Split each line once instead of twice (the original called
            # line.split() for each of the two leading fields).
            fields = line.split()
            words_idx.append(int(fields[0]))
            sems_idx.append(int(fields[1]))

    data = onmt.IO.ONMTDataset(path_to_file, None, translator.fields, None)
    # sort=False / shuffle=False keep batches aligned with file order, so
    # batch i corresponds to words_idx[i].
    # NOTE(review): indexing words_idx by *batch* index assumes
    # opt.batch_size == 1 — confirm against the caller.
    train_data = onmt.IO.OrderedIterator(dataset=data, device=opt.gpu,
                                         batch_size=opt.batch_size,
                                         train=False, sort=False,
                                         shuffle=False)
    # (removed unused local `word_encodings` from the original)
    for i, batch in enumerate(train_data):
        # print batch.__dict__['src']
        word_idx = words_idx[i]
        encodings = translator.encode(batch, data)[word_idx]
        sample = {'x': encodings.data, 'y': sems_idx[i]}
        self.dataset.append(sample)
def __init__(self, model_path, gpu_id=1):
    """Load an ONMT translator for *model_path* on GPU *gpu_id*.

    A throw-away argparse parser mirroring translate.py's options is built
    only so that a synthetic command line can be parsed into the ``opt``
    namespace that onmt.Translator expects.
    """
    parser = argparse.ArgumentParser(description='translate.py')
    parser.add_argument('-model', required=True,
                        help='Path to model .pt file')
    parser.add_argument(
        '-src', required=True,
        help='Source sequence to decode (one line per sequence)')
    parser.add_argument('-src_img_dir', default="",
                        help='Source image directory')
    parser.add_argument('-tgt', help='True target sequence (optional)')
    parser.add_argument('-output', default='pred.txt',
                        help="""Path to output the predictions (each line will be the decoded sequence""")
    parser.add_argument('-beam_size', type=int, default=5,
                        help='Beam size')
    parser.add_argument('-batch_size', type=int, default=30,
                        help='Batch size')
    parser.add_argument('-max_sent_length', type=int, default=100,
                        help='Maximum sentence length.')
    parser.add_argument('-replace_unk', action="store_true",
                        help="""Replace the generated UNK tokens with the source token that had highest attention weight. If phrase_table is provided, it will lookup the identified source token and give the corresponding target token. If it is not provided (or the identified source token does not exist in the table) then it will copy the source token""")
    parser.add_argument(
        '-verbose', action="store_true",
        help='Print scores and predictions for each sentence')
    parser.add_argument('-attn_debug', action="store_true",
                        help='Print best attn for each word')
    parser.add_argument('-dump_beam', type=str, default="",
                        help='File to dump beam information to.')
    parser.add_argument('-n_best', type=int, default=1,
                        help="""If verbose is set, will output the n_best decoded sentences""")
    parser.add_argument('-gpu', type=int, default=-1,
                        help="Device to run on")
    # options most relevant to summarization
    parser.add_argument('-dynamic_dict', action='store_true',
                        help="Create dynamic dictionaries")
    parser.add_argument('-share_vocab', action='store_true',
                        help="Share source and target vocabulary")
    # Alpha and Beta values for Google Length + Coverage penalty
    # Described here: https://arxiv.org/pdf/1609.08144.pdf, Section 7
    parser.add_argument('-alpha', type=float, default=0.0,
                        help="""Google NMT length penalty parameter (higher = longer generation)""")
    parser.add_argument('-beta', type=float, default=0.0,
                        help="""Coverage penalty parameter""")
    # The /tmp/a, /tmp/b, /tmp/c paths are placeholders: this instance is
    # used programmatically, not for file-to-file translation.
    opt = parser.parse_args((
        '-model %s -src /tmp/a -tgt /tmp/b -output /tmp/c -gpu %d -verbose -beam_size 5 -batch_size 1 -n_best 5 -replace_unk' % (model_path, gpu_id)).split())  # noqa
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    self.translator = onmt.Translator(opt)
def __init__(self, opt):
    """Translation front-end wrapping BPE preprocessing, an ONMT
    translator, and a BosonNLP client."""
    self.opt = opt
    # Separator token plus a trailing space, used when rejoining BPE pieces.
    # (The 'seprator' spelling comes from the option name itself.)
    self.sep = opt.seprator + " "
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    self.bpe = BPE(codecs.open(self.opt.bpe_codes, 'r', encoding="UTF-8"),
                   self.opt.seprator, None, None)
    self.translator = onmt.Translator(opt)
    # SECURITY NOTE(review): hard-coded BosonNLP API token committed to
    # source — move it to configuration or an environment variable.
    self.nlp = BosonNLP("NGhNiav2.16134.DvyEDmGzYd2S")
def __init__(self, opt):
    """Set up BPE segmentation, Moses (de)tokenization, and the ONMT
    translator described by *opt*."""
    self.opt = opt
    self.sep = opt.seprator + " "
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    # Build the BPE segmenter from the codes file named in the options.
    codes_file = codecs.open(opt.bpe_codes, 'r', encoding="UTF-8")
    self.bpe = BPE(codes_file, opt.seprator, None, None)
    self.tokenizer = MosesTokenizer()
    self.detokenizer = MosesDetokenizer()
    self.translator = onmt.Translator(opt)
def translate(model, src, tgt, src_dict, tgt_dict, beam_size=10):
    """Translate file *src*, write 1-best hypotheses to a temp file, and
    score them against *tgt* with multi-bleu.perl.

    Relies on a module-level ``opt`` namespace for batch_size/save_model.
    Returns the raw multi-bleu output line as a string and leaves *model*
    back in training mode with the decoder attention mask cleared.
    """
    opt.beam_size = beam_size
    opt.n_best = 1
    opt.replace_unk = True

    def addone(f):
        # Yield a trailing None so the main loop can flush the final,
        # possibly partial, batch.
        for line in f:
            yield line
        yield None

    translator = onmt.Translator(opt, model, src_dict, tgt_dict)
    srcBatch, tgtBatch = [], []
    tgtF = codecs.open(tgt, 'r', 'utf-8')
    pred_list = []
    out_name = 'tmp/' + opt.save_model.split('/')[-1] + '.tmp'
    out = codecs.open(out_name, 'w', 'utf-8')
    for line in addone(codecs.open(src, 'r', 'utf-8')):
        if line is not None:
            srcTokens = line.split()
            srcBatch += [srcTokens]
            tgtTokens = tgtF.readline().split()
            tgtBatch += [tgtTokens]
            if len(srcBatch) < opt.batch_size:
                continue
        else:
            # at the end of file, check last batch
            if len(srcBatch) == 0:
                break
        predBatch, _, _ = translator.translate(srcBatch, tgtBatch)
        pred_list += predBatch
        srcBatch, tgtBatch = [], []
    for pred in pred_list:
        out.write(' '.join(pred[0]) + '\n')
    tgtF.close()
    out.close()
    # NOTE(review): shell=True with string concatenation — acceptable for
    # trusted paths but unsafe if *tgt* can contain shell metacharacters.
    bleu_results = subprocess.Popen('perl -X multi-bleu.perl ' + tgt +
                                    ' < ' + out_name,
                                    stdout=subprocess.PIPE,
                                    shell=True).stdout.readline()
    model.train()
    model.decoder.attn.applyMask(None)
    return str(bleu_results)
def translate():
    """Batch-translate opt.src into opt.output, logging progress every
    1024 sentences."""
    logging.info('Translating ...')
    opt = parser.parse_args()
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    translator = onmt.Translator(opt)
    outF = codecs.open(opt.output, 'w', encoding='utf-8')
    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0
    srcBatch, tgtBatch = [], []
    count = 0
    if opt.dump_beam != "":
        import json
        translator.initBeamAccum()
    for line in addone(codecs.open(opt.src, encoding='utf-8')):
        if line is not None:
            # NOTE(review): count is incremented both per input line here
            # and per prediction below, so the logged count double-counts.
            count += 1
            srcTokens = line.split()
            srcBatch += [srcTokens]
            if len(srcBatch) < opt.batch_size:
                continue
        else:
            # at the end of file, check last batch
            if len(srcBatch) == 0:
                break
        predBatch, predScore, goldScore = translator.translate(
            srcBatch, tgtBatch)
        predScoreTotal += sum(score[0] for score in predScore)
        predWordsTotal += sum(len(x[0]) for x in predBatch)
        for b in range(len(predBatch)):
            count += 1
            outF.write(" ".join(predBatch[b][0]) + '\n')
            outF.flush()
            if count % 1024 == 0:
                logging.info('Translated {} sentences'.format(count))
        srcBatch, tgtBatch = [], []
def __init__(self, generator, tgt_vocab, smoothing_epsilon, aux_checkpoint):
    """Loss mixing standard NLL with a divergence term against an
    auxiliary translator loaded from *aux_checkpoint*.

    smoothing_epsilon controls the ratio between the normal cross-entropy
    and the auxiliary-model cross-entropy.
    """
    super(NMTKLDivNMTLossCompute, self).__init__(generator, tgt_vocab)
    self.copy_attn = False
    weight = torch.ones(len(tgt_vocab))
    weight[self.padding_idx] = 0  # never penalise padding positions
    # standard NLL loss term:
    self.criterion0 = nn.NLLLoss(weight, size_average=False)
    # ratio between normal cross entropy and LM cross entropy
    self.smoothing_epsilon = smoothing_epsilon
    # initial the aux model
    # The first argument of onmt.Translator just needs *.model and *.gpu
    OptModel = namedtuple('OptModel', ['model', 'gpu'])
    opt_model = OptModel(model=aux_checkpoint, gpu=0)
    # TODO how do we know it is gpu 0 ???????????
    self.translator = onmt.Translator(opt_model, dict())
    #
    # ONLY USED FOR DEBUG
    self.debugLangModelNLL = nn.NLLLoss(weight, size_average=False)
def main():
    """Sample opt.num_pts sequences from the model (no source input) and
    write the 1-best of each to opt.output."""
    opt = parser.parse_args()
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    translator = onmt.Translator(opt)
    outF = open(opt.output, 'w')
    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0
    count = 0
    # Sampling mode: generate from the model directly.
    predBatch, predScore = translator.sample(opt.num_pts)
    #predScoreTotal += sum(score[0] for score in predScore)
    predWordsTotal += sum(len(x[0]) for x in predBatch)
    #if tgtF is not None:
    #    goldScoreTotal += sum(goldScore)
    #    goldWordsTotal += sum(len(x) for x in tgtBatch)
    for b in range(len(predBatch)):
        count += 1
        outF.write(" ".join(predBatch[b][0]) + '\n')
        outF.flush()
        if opt.verbose:
            #srcSent = ' '.join(srcBatch[b])
            #if translator.tgt_dict.lower:
            #    srcSent = srcSent.lower()
            #print('SENT %d: %s' % (count, srcSent))
            print('PRED %d: %s' % (count, " ".join(predBatch[b][0])))
            print("PRED SCORE: %.4f" % predScore[b][0])
            if opt.n_best > 1:
                print('\nBEST HYP:')
                for n in range(opt.n_best):
                    print("[%.4f] %s" % (predScore[b][n],
                                         " ".join(predBatch[b][n])))
                print('')
def main():
    """Entry point for the responder HTTP service: load the model into a
    module-global translator, then hand the app to Gunicorn."""
    global translator
    global args
    args = parse_arguments()
    print("Loading responder model.")
    print(args.model)
    if os.path.isfile(args.model):
        print("model file exists")
    else:
        # NOTE(review): only warns — execution continues and
        # onmt.Translator will fail later on the missing file.
        print("OH NO MODEL FILE DOESN'T EXIST")
    args.cuda = args.gpu > -1
    if args.cuda:
        torch.cuda.set_device(args.gpu)
    translator = onmt.Translator(args)
    print("Starting service.")
    hostport = args.listen_host.split(':')
    if len(hostport) == 2:
        port = int(hostport[1])
    else:
        port = 5000  # default port when listen_host has no ':port' part
    options = {
        'bind': '{}:{}'.format(hostport[0], port),
        'threads': args.threads,
        'workers': args.workers
    }
    if args.statsd_host:
        options['statsd_host'] = args.statsd_host
    if args.prefix_statsd:
        options['statsd_prefix'] = args.prefix_statsd
    if args.debug:
        options['log_level'] = 'DEBUG'
    GunicornApplication(app, options).run()
def ask(self):
    """Interactive REPL: read a sentence, UNK-encode it, translate it, and
    print the n-best decoded hypotheses; an empty line terminates."""
    opt = parser.parse_args()
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    translator = onmt.Translator(opt)
    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0
    srcBatch, tgtBatch = [], []
    if opt.dump_beam != "":
        translator.initBeamAccum()
    while True:
        src = input('\n>')
        src_unk = self.encode(src)
        srcTokens = src_unk.split()
        srcBatch += [srcTokens]
        # at the end of file, check last batch
        if not src:
            break
        # import pudb; pudb.set_trace()
        predBatch, predScore, goldScore = translator.translate(srcBatch,
                                                               tgtBatch)
        if opt.dump_beam:
            # pprint.pprint(translator.beam_accum)
            # Reset the accumulator between interactive queries.
            translator.initBeamAccum()
        pred_unks = [" ".join(predBatch[0][n]) for n in range(opt.n_best)]
        preds = [self.decode(src, src_unk, pred_unks[n])
                 for n in range(opt.n_best)]
        print('\nUNK: {}'.format(src_unk))
        for n in range(opt.n_best):
            print('BEST {}: \n {} \n {}'.format(n+1, pred_unks[n], preds[n]))
        srcBatch, tgtBatch = [], []
def init_translate_model(opt, dummy_opt):
    """Build and return an onmt.Translator from *opt*, passing the dummy
    option namespace through as a plain dict."""
    translator = onmt.Translator(opt, vars(dummy_opt))
    return translator
def main():
    """Translate opt.src into opt.output, optionally scoring against
    opt.tgt; handles py2/py3 output encoding and attention debugging."""
    opt = parser.parse_args()
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    translator = onmt.Translator(opt)
    outF = codecs.open(opt.output, 'w', 'utf-8')
    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0
    srcBatch, tgtBatch = [], []
    count = 0
    tgtF = codecs.open(opt.tgt, 'r', 'utf-8') if opt.tgt else None
    if opt.dump_beam != "":
        import json
        translator.initBeamAccum()
    for line in addone(codecs.open(opt.src, 'r', 'utf-8')):
        if line is not None:
            srcTokens = line.split()
            srcBatch += [srcTokens]
            if tgtF:
                tgtTokens = tgtF.readline().split() if tgtF else None
                tgtBatch += [tgtTokens]
            if len(srcBatch) < opt.batch_size:
                continue
        else:
            # at the end of file, check last batch
            if len(srcBatch) == 0:
                break
        predBatch, predScore, goldScore, attn, src \
            = translator.translate(srcBatch, tgtBatch)
        predScoreTotal += sum(score[0] for score in predScore)
        predWordsTotal += sum(len(x[0]) for x in predBatch)
        if tgtF is not None:
            goldScoreTotal += sum(goldScore)
            goldWordsTotal += sum(len(x) for x in tgtBatch)
        for b in range(len(predBatch)):
            count += 1
            try:
                # python2
                outF.write(
                    " ".join([i.decode('utf-8')
                              for i in predBatch[b][0]]) + '\n')
            except AttributeError:
                # python3: can't do .decode on a str object
                outF.write(" ".join(predBatch[b][0]) + '\n')
            outF.flush()
            if opt.verbose:
                srcSent = ' '.join(srcBatch[b])
                if translator.tgt_dict.lower:
                    srcSent = srcSent.lower()
                # os.write with explicit UTF-8 bytes avoids stdout
                # encoding issues across py2/py3.
                os.write(1, bytes('SENT %d: %s\n' % (count, srcSent),
                                  'UTF-8'))
                os.write(
                    1,
                    bytes('PRED %d: %s\n' %
                          (count, " ".join(predBatch[b][0])), 'UTF-8'))
                print("PRED SCORE: %.4f" % predScore[b][0])
                if tgtF is not None:
                    tgtSent = ' '.join(tgtBatch[b])
                    if translator.tgt_dict.lower:
                        tgtSent = tgtSent.lower()
                    os.write(
                        1, bytes('GOLD %d: %s\n' % (count, tgtSent),
                                 'UTF-8'))
                    print("GOLD SCORE: %.4f" % goldScore[b])
                if opt.n_best > 1:
                    print('\nBEST HYP:')
                    for n in range(opt.n_best):
                        os.write(
                            1,
                            bytes(
                                "[%.4f] %s\n" %
                                (predScore[b][n],
                                 " ".join(predBatch[b][n])), 'UTF-8'))
            if opt.attn_debug:
                # Show, for each predicted word, the 5 source positions
                # with the highest attention weight.
                print('')
                for i, w in enumerate(predBatch[b][0]):
                    print(w)
                    _, ids = attn[b][0][i].sort(0, descending=True)
                    for j in ids[:5].tolist():
                        print("\t%s\t%d\t%3f" % (srcTokens[j], j,
                                                 attn[b][0][i][j]))
        srcBatch, tgtBatch = [], []
    reportScore('PRED', predScoreTotal, predWordsTotal)
    if tgtF:
        reportScore('GOLD', goldScoreTotal, goldWordsTotal)
    if tgtF:
        tgtF.close()
    if opt.dump_beam:
        json.dump(translator.beam_accum,
                  codecs.open(opt.dump_beam, 'w', 'utf-8'))
def main():
    """Translate opt.src with an extra per-sentence context file
    (opt.cxt) alongside the usual src/tgt files."""
    opt = parser.parse_args()
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    translator = onmt.Translator(opt)
    outF = codecs.open(opt.output, 'w', 'utf-8')
    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0
    srcBatch, cxtBatch, tgtBatch = [], [], []
    count = 0
    cxtF = codecs.open(opt.cxt, "r", "utf-8") if opt.cxt else None
    tgtF = codecs.open(opt.tgt, "r", "utf-8") if opt.tgt else None
    for line in addone(codecs.open(opt.src, "r", "utf-8")):
        if line is not None:
            srcTokens = line.split()
            if cxtF:
                cline = cxtF.readline()
                if cline == '':
                    # Context file ran out of lines: skip this source line.
                    continue
                else:
                    cxtTokens = cline.split()
            if tgtF:
                tgtTokens = tgtF.readline().split()
                tgtBatch += [tgtTokens]
            # NOTE(review): cxtTokens is unbound here if opt.cxt was not
            # given — the code appears to assume a context file is always
            # provided; confirm.
            cxtBatch += [cxtTokens]
            srcBatch += [srcTokens]
            if len(srcBatch) < opt.batch_size:
                continue
        else:
            # at the end of file, check last batch
            if len(srcBatch) == 0:
                break
        predBatch, predScore, goldScore = translator.translate(
            srcBatch, cxtBatch, tgtBatch)
        predScoreTotal += sum(score[0] for score in predScore)
        predWordsTotal += sum(len(x[0]) for x in predBatch)
        if tgtF is not None:
            goldScoreTotal += sum(goldScore)
            goldWordsTotal += sum(len(x) for x in tgtBatch)
        for b in range(len(predBatch)):
            count += 1
            outF.write(" ".join(predBatch[b][0]) + '\n')
            outF.flush()
            if opt.verbose:
                srcSent = ' '.join(srcBatch[b])
                if translator.tgt_dict.lower:
                    srcSent = srcSent.lower()
                print('SENT %d: %s' % (count, srcSent))
                print('PRED %d: %s' % (count, " ".join(predBatch[b][0])))
                print("PRED SCORE: %.4f" % predScore[b][0])
                if tgtF is not None:
                    tgtSent = ' '.join(tgtBatch[b])
                    if translator.tgt_dict.lower:
                        tgtSent = tgtSent.lower()
                    print('GOLD %d: %s ' % (count, tgtSent))
                    print("GOLD SCORE: %.4f" % goldScore[b])
                if opt.n_best > 1:
                    print('\nBEST HYP:')
                    for n in range(opt.n_best):
                        print("[%.4f] %s" % (predScore[b][n],
                                             " ".join(predBatch[b][n])))
                    print('')
        srcBatch, cxtBatch, tgtBatch = [], [], []
    reportScore('PRED', predScoreTotal, predWordsTotal)
    if tgtF:
        reportScore('GOLD', goldScoreTotal, goldWordsTotal)
    if cxtF:
        cxtF.close()
    if tgtF:
        tgtF.close()
def __init__(self):
    """Load the content-transfer ONMT model and its SentencePiece
    tokenizer, configured by OptionContentTransfer."""
    # Deferred import keeps sentencepiece optional until this class is used.
    import sentencepiece as spm
    options = OptionContentTransfer()
    self.model = onmt.Translator(options)
    self.tokenizer = spm.SentencePieceProcessor()
    self.tokenizer.load(options.path_tokenizer)
def main():
    """Translate opt.src, or — with opt.dump_input_encoding — dump each
    source sentence's encoder representation instead."""
    opt = parser.parse_args()
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    translator = onmt.Translator(opt)
    outF = open(opt.output, 'w')
    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0
    srcBatch, tgtBatch = [], []
    count = 0
    tgtF = open(opt.tgt) if opt.tgt else None
    for line in addone(open(opt.src)):
        if line is not None:
            srcTokens = line.split()
            srcBatch += [srcTokens]
            if tgtF:
                tgtTokens = tgtF.readline().split() if tgtF else None
                tgtBatch += [tgtTokens]
            if len(srcBatch) < opt.batch_size:
                continue
        else:
            # at the end of file, check last batch
            if len(srcBatch) == 0:
                break
        if opt.dump_input_encoding:
            # predBatch is a batch_size x rnn_size torch FloatTensors
            predBatch = translator.dump_input_encoding(srcBatch, tgtBatch)
            for b in range(len(predBatch)):
                count += 1
                outF.write(
                    '%d ' % count +
                    " ".join([str(fl)
                              for fl in predBatch[b].data.tolist()]) + '\n')
                outF.flush()
        else:
            predBatch, predScore, goldScore = translator.translate(
                srcBatch, tgtBatch)
            predScoreTotal += sum(score[0] for score in predScore)
            predWordsTotal += sum(len(x[0]) for x in predBatch)
            if tgtF is not None:
                goldScoreTotal += sum(goldScore)
                goldWordsTotal += sum(len(x) for x in tgtBatch)
            for b in range(len(predBatch)):
                count += 1
                outF.write(" ".join(predBatch[b][0]) + '\n')
                outF.flush()
                if opt.verbose:
                    srcSent = ' '.join(srcBatch[b])
                    if translator.tgt_dict.lower:
                        srcSent = srcSent.lower()
                    print('SENT %d: %s' % (count, srcSent))
                    print('PRED %d: %s' % (count,
                                           " ".join(predBatch[b][0])))
                    print("PRED SCORE: %.4f" % predScore[b][0])
                    if tgtF is not None:
                        tgtSent = ' '.join(tgtBatch[b])
                        if translator.tgt_dict.lower:
                            tgtSent = tgtSent.lower()
                        print('GOLD %d: %s ' % (count, tgtSent))
                        print("GOLD SCORE: %.4f" % goldScore[b])
                    if opt.n_best > 1:
                        print('\nBEST HYP:')
                        for n in range(opt.n_best):
                            print("[%.4f] %s" % (predScore[b][n],
                                                 " ".join(predBatch[b][n])))
                        print('')
            # NOTE(review): this runs once per translated batch, so the
            # PRED score is re-reported repeatedly; it was presumably
            # intended to run once after the loop — confirm.
            reportScore('PRED', predScoreTotal, predWordsTotal)
        srcBatch, tgtBatch = [], []
    if tgtF:
        reportScore('GOLD', goldScoreTotal, goldWordsTotal)
    if tgtF:
        tgtF.close()
# NOTE(review): the statements below appear to be the tail of an
# interactive decode loop whose enclosing `def` is outside this view;
# indentation is reconstructed on that assumption — confirm against the
# original file.  raw_input/.decode('utf8') make this Python-2-only code.
    srcTokens = []
    org_input = list(raw_input("Input sentence:").decode('utf8'))
    clean_input = replace_wide_chars(org_input)
    srcTokens.append(clean_input)
    # Keep prompting until the user enters an empty string.
    while len(srcTokens[0]) > 0:
        predBatch, _, _ = translator.translate(srcTokens, [])
        predicted_words = predBatch[0][0]
        print(''.join(predicted_words))
        srcTokens = []
        org_input = list(raw_input("Input sentence:").decode('utf8'))
        clean_input = replace_wide_chars(org_input)
        srcTokens.append(clean_input)


if __name__ == "__main__":
    # Script entry point: load the model once, then decode either a file
    # (opt.src given) or an interactive prompt.
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    sys.stdout.write("Loading model file... ")
    sys.stdout.flush()
    translator = onmt.Translator(opt)
    print("Done.")
    if opt.src:
        decode_file(translator)
    else:
        print("Decode from prompt, input empty string to terminate. ")
        decode_stream(translator)
def main():
    """Translate opt.src (optionally scoring against opt.tgt) and write
    1-best hypotheses to opt.output.

    NOTE: there is no end-of-file sentinel, so a final batch smaller than
    opt.batch_size is silently dropped — kept as-is to preserve the
    original control flow.
    """
    opt = parser.parse_args()
    opt.cuda = opt.gpu > -1
    # Fix: only select a CUDA device when one was requested; the original
    # called torch.cuda.set_device(opt.gpu) unconditionally, which is
    # invalid for CPU-only runs (gpu == -1).
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    translator = onmt.Translator(opt)
    outF = open(opt.output, 'w')
    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0
    srcBatch, tgtBatch = [], []
    count = 0
    tgtF = open(opt.tgt) if opt.tgt else None
    for line in open(opt.src):
        srcTokens = line.split()
        srcBatch += [srcTokens]
        if tgtF:
            tgtTokens = tgtF.readline().split() if tgtF else None
            tgtBatch += [tgtTokens]
        if len(srcBatch) < opt.batch_size:
            continue
        predBatch, predScore, goldScore = translator.translate(
            srcBatch, tgtBatch)
        predScoreTotal += sum(score[0] for score in predScore)
        # Fix: count the words of the 1-best hypothesis (x[0]); len(x) is
        # the number of n-best hypotheses, not a word count (matches the
        # sibling translate scripts in this file).
        predWordsTotal += sum(len(x[0]) for x in predBatch)
        if tgtF is not None:
            goldScoreTotal += sum(goldScore)
            goldWordsTotal += sum(len(x) for x in tgtBatch)
        for b in range(len(predBatch)):
            count += 1
            outF.write(" ".join(predBatch[b][0]) + '\n')
            if opt.verbose:
                print('SENT %d: %s' % (count, " ".join(srcBatch[b])))
                print('PRED %d: %s' % (count, " ".join(predBatch[b][0])))
                print("PRED SCORE: %.4f" % predScore[b][0])
                if tgtF is not None:
                    print('GOLD %d: %s ' % (count, " ".join(tgtBatch[b])))
                    print("GOLD SCORE: %.4f" % goldScore[b])
                if opt.n_best > 1:
                    print('\nBEST HYP:')
                    for n in range(opt.n_best):
                        # Fix: print the n-th hypothesis; the original
                        # printed predBatch[b][0] repeatedly.
                        print("[%.4f] %s" % (predScore[b][n],
                                             " ".join(predBatch[b][n])))
                    print('')
        srcBatch, tgtBatch = [], []
    reportScore('PRED', predScoreTotal, predWordsTotal)
    if tgtF:
        reportScore('GOLD', goldScoreTotal, goldWordsTotal)
    if tgtF:
        tgtF.close()
def main():
    """Translate from file or stdin with the v1.0 ensemble or v2.0
    translator; optionally length-normalises and/or reorders the n-best
    list by output length.  n_best is always forced to beam_size."""
    opt = parser.parse_args()
    print(opt)
    opt.cuda = opt.gpu > -1
    onmt.Constants.cudaActivated = opt.cuda
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # Always pick n_best
    opt.n_best = opt.beam_size

    if opt.output == "stdout":
        outF = sys.stdout
    else:
        outF = open(opt.output, 'w')

    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0
    srcBatch, tgtBatch = [], []
    count = 0
    tgtF = open(opt.tgt) if opt.tgt else None

    # here we are trying to
    inFile = None
    if(opt.src == "stdin"):
        inFile = sys.stdin
        opt.batch_size = 1
    else:
        inFile = open(opt.src)

    if opt.version == 1.0:
        translator = onmt.EnsembleTranslator(opt)
    elif opt.version == 2.0:
        translator = onmt.Translator(opt)

    # Fix: the beam accumulator lives on the translator, so it can only be
    # initialised after the translator exists.  The original called
    # translator.initBeamAccum() before `translator` was bound, raising
    # NameError whenever -dump_beam was set.
    if opt.dump_beam != "":
        import json
        translator.initBeamAccum()

    for line in addone(inFile):
        if line is not None:
            if opt.input_type == 'word':
                srcTokens = line.split()
                srcBatch += [srcTokens]
                if tgtF:
                    tgtTokens = tgtF.readline().split() if tgtF else None
                    tgtBatch += [tgtTokens]
            elif opt.input_type == 'char':
                srcTokens = list(line.strip())
                srcBatch += [srcTokens]
                if tgtF:
                    #~ tgtTokens = tgtF.readline().split() if tgtF else None
                    tgtTokens = list(tgtF.readline().strip()) if tgtF else None
                    tgtBatch += [tgtTokens]
            else:
                raise NotImplementedError("Input type unknown")
            #if len(srcBatch) < opt.batch_size:
            #    print('srcBatch < opt.batch_size')
            #    continue
        else:
            # at the end of file, check last batch
            if len(srcBatch) == 0:
                break

        predBatch, predScore, predLength, goldScore, numGoldWords = \
            translator.translate(srcBatch, tgtBatch)

        if opt.normalize and opt.version == 1.0:
            # Re-rank each n-best list by length-penalised score.
            predBatch_ = []
            predScore_ = []
            for bb, ss, ll in zip(predBatch, predScore, predLength):
                #~ ss_ = [s_/numpy.maximum(1.,len(b_)) for b_,s_,l_ in zip(bb,ss,ll)]
                ss_ = [lenPenalty(s_, l_, opt.alpha)
                       for b_, s_, l_ in zip(bb, ss, ll)]
                ss_origin = [(s_, len(b_)) for b_, s_, l_ in zip(bb, ss, ll)]
                sidx = numpy.argsort(ss_)[::-1]
                #~ print(ss_, sidx, ss_origin)
                predBatch_.append([bb[s] for s in sidx])
                predScore_.append([ss_[s] for s in sidx])
            predBatch = predBatch_
            predScore = predScore_

        if opt.preferLongestOutputs:
            # Reorder the (single-entry) batch's n-best list by
            # decreasing hypothesis length.
            sortedPredictions = []
            for index, prediction in enumerate(predBatch[0]):
                sortedPredictions.append((index, len(prediction)))
            sortedPredictions.sort(key=lambda x: x[1], reverse=True)

            # Fix: take real snapshots before overwriting in place.  The
            # original aliased the lists (predBatchCopy = predBatch), so
            # reads during the rewrite could see entries that had already
            # been overwritten.
            predBatchCopy = list(predBatch[0])
            predScoreCopy = list(predScore[0])
            for index, sortedPrediction in enumerate(sortedPredictions):
                predBatch[0][index] = predBatchCopy[sortedPrediction[0]]
                predScore[0][index] = predScoreCopy[sortedPrediction[0]]

        predScoreTotal += sum(score[0] for score in predScore)
        predWordsTotal += sum(len(x[0]) for x in predBatch)
        if tgtF is not None:
            goldScoreTotal += sum(goldScore).item()
            goldWordsTotal += numGoldWords

        for b in range(len(predBatch)):
            count += 1
            bestHyp = getSentenceFromTokens(predBatch[b][0], opt.input_type)
            if not opt.print_nbest:
                #~ print(predBatch[b][0])
                outF.write(bestHyp + '\n')
                outF.flush()

            if opt.verbose:
                srcSent = getSentenceFromTokens(srcBatch[b], opt.input_type)
                if translator.tgt_dict.lower:
                    srcSent = srcSent.lower()
                print('SENT %d: %s' % (count, srcSent))
                print('PRED %d: %s' % (count, bestHyp))
                print("PRED SCORE: %.4f" % predScore[b][0])
                if tgtF is not None:
                    #~ if opt.input_type == 'word':
                    #~     tgtSent = ' '.join(tgtBatch[b])
                    #~ elif opt.input_type == 'char':
                    #~     tgtSent = ''.join(tgtBatch[b])
                    tgtSent = getSentenceFromTokens(tgtBatch[b],
                                                    opt.input_type)
                    if translator.tgt_dict.lower:
                        tgtSent = tgtSent.lower()
                    print('GOLD %d: %s ' % (count, tgtSent))
                    print("GOLD SCORE: %.4f" % goldScore[b])

            if opt.print_nbest:
                print('\nBEST HYP:')
                for n in range(opt.n_best):
                    idx = n
                    sent = getSentenceFromTokens(predBatch[b][idx],
                                                 opt.input_type)
                    print("[%.4f] %s" % (predScore[b][idx], sent))
                print('')

        srcBatch, tgtBatch = [], []

    if opt.verbose:
        reportScore('PRED', predScoreTotal, predWordsTotal)
        if tgtF:
            reportScore('GOLD', goldScoreTotal, goldWordsTotal)

    if tgtF:
        tgtF.close()

    if opt.dump_beam:
        json.dump(translator.beam_accum, open(opt.dump_beam, 'w'))
def main():
    """Translate opt.src and report throughput (samples/second) measured
    over the translation calls alone."""
    opt = parser.parse_args()
    opt.cuda = opt.gpu > -1
    print(opt)
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    translator = onmt.Translator(opt)
    outF = codecs.open(opt.output, 'w', 'utf-8')
    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0
    srcBatch, tgtBatch = [], []
    count = 0
    total_time = 0.0
    nsamples = 0.0
    tgtF = open(opt.tgt) if opt.tgt else None
    for line in addone(codecs.open(opt.src, "r", "utf-8")):
        if line is not None:
            srcTokens = line.split()
            srcBatch += [srcTokens]
            if tgtF:
                tgtTokens = tgtF.readline().split() if tgtF else None
                tgtBatch += [tgtTokens]
            if len(srcBatch) < opt.batch_size:
                continue
        else:
            # at the end of file, check last batch
            if len(srcBatch) == 0:
                break
        # Time only the translate() call itself.
        start_time = time.time()
        predBatch, predScore, goldScore = translator.translate(
            srcBatch, tgtBatch)
        total_time += (time.time() - start_time)
        nsamples += len(predBatch)
        predScoreTotal += sum(score[0] for score in predScore)
        predWordsTotal += sum(len(x[0]) for x in predBatch)
        if tgtF is not None:
            goldScoreTotal += sum(goldScore)
            goldWordsTotal += sum(len(x) for x in tgtBatch)
        for b in range(len(predBatch)):
            count += 1
            outF.write(" ".join(predBatch[b][0]) + '\n')
            outF.flush()
            if opt.verbose:
                srcSent = ' '.join(srcBatch[b])
                if translator.tgt_dict.lower:
                    srcSent = srcSent.lower()
                print('SENT %d: %s' % (count, srcSent))
                print('PRED %d: %s' % (count, " ".join(predBatch[b][0])))
                print("PRED SCORE: %.4f" % predScore[b][0])
                if tgtF is not None:
                    tgtSent = ' '.join(tgtBatch[b])
                    if translator.tgt_dict.lower:
                        tgtSent = tgtSent.lower()
                    print('GOLD %d: %s ' % (count, tgtSent))
                    print("GOLD SCORE: %.4f" % goldScore[b])
                if opt.n_best > 1:
                    print('\nBEST HYP:')
                    for n in range(opt.n_best):
                        print("[%.4f] %s" % (predScore[b][n],
                                             " ".join(predBatch[b][n])))
                    print('')
        srcBatch, tgtBatch = [], []
    reportScore('PRED', predScoreTotal, predWordsTotal)
    if tgtF:
        reportScore('GOLD', goldScoreTotal, goldWordsTotal)
    if tgtF:
        tgtF.close()
    # NOTE(review): divides by total_time / nsamples — raises
    # ZeroDivisionError if nothing was translated.
    samples_per_sec = nsamples / total_time
    print("Average samples per second: %f, %f, %f" %
          (nsamples, total_time, samples_per_sec))
    print("Time per sample %f" % (total_time / nsamples))
def main():
    """Translate opt.src with n_best forced to beam_size; optionally
    length-normalises the n-best list and dumps it in Moses-style
    `id ||| hyp ||| score` format."""
    opt = parser.parse_args()
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    # Always pick n_best
    opt.n_best = opt.beam_size
    translator = onmt.Translator(opt)
    outF = open(opt.output, 'w')
    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0
    srcBatch, tgtBatch = [], []
    count = 0
    tgtF = open(opt.tgt) if opt.tgt else None
    if opt.dump_beam != "":
        import json
        translator.initBeamAccum()
    for line in addone(open(opt.src)):
        if line is not None:
            srcTokens = line.split()
            srcBatch += [srcTokens]
            if tgtF:
                tgtTokens = tgtF.readline().split() if tgtF else None
                tgtBatch += [tgtTokens]
            if len(srcBatch) < opt.batch_size:
                continue
        else:
            # at the end of file, check last batch
            if len(srcBatch) == 0:
                break
        predBatch, predScore, goldScore = translator.translate(
            srcBatch, tgtBatch)
        if opt.normalize:
            # Length-normalise the scores and re-sort each n-best list.
            predBatch_ = []
            predScore_ = []
            for bb, ss in zip(predBatch, predScore):
                ss_ = [
                    s_ / numpy.maximum(1., len(b_))
                    for b_, s_ in zip(bb, ss)
                ]
                sidx = numpy.argsort(ss_)[::-1]
                predBatch_.append([bb[s] for s in sidx])
                predScore_.append([ss_[s] for s in sidx])
            predBatch = predBatch_
            predScore = predScore_
        predScoreTotal += sum(score[0] for score in predScore)
        predWordsTotal += sum(len(x[0]) for x in predBatch)
        if tgtF is not None:
            goldScoreTotal += sum(goldScore)
            goldWordsTotal += sum(len(x) for x in tgtBatch)
        for b in range(len(predBatch)):
            # Pred Batch always have n-best outputs
            #~ scores = torch.Tensor(len(predBatch[b]))
            #~ for n in range(opt.n_best):
            #~     scores[n] = predScore[b][n]
            #~     if opt.normalize:
            #~         scores[n] = scores[n] / ( len(predBatch[b][n]) + 1)
            #~
            #~ sorted_scores, sorted_index = torch.sort(scores, 0, True)
            #~ bestSent = predBatch[b][sorted_index[0]]
            #~ bestIndex = sorted_index[0]
            count += 1
            # Best sentence = having highest log prob
            if not opt.print_nbest:
                outF.write(" ".join(predBatch[b][0]) + '\n')
                outF.flush()
            else:
                for n in range(opt.n_best):
                    idx = n
                    #~ if opt.verbose:
                    print("%d ||| %s ||| %.6f" % (count - 1, " ".join(
                        predBatch[b][idx]), predScore[b][idx]))
                    outF.write("%d ||| %s ||| %.6f\n" % (count - 1, " ".join(
                        predBatch[b][idx]), predScore[b][idx]))
                    outF.flush()
            if opt.verbose:
                srcSent = ' '.join(srcBatch[b])
                if translator.tgt_dict.lower:
                    srcSent = srcSent.lower()
                print('SENT %d: %s' % (count, srcSent))
                print('PRED %d: %s' % (count, " ".join(predBatch[b][0])))
                print("PRED SCORE: %.4f" % predScore[b][0])
                if tgtF is not None:
                    tgtSent = ' '.join(tgtBatch[b])
                    if translator.tgt_dict.lower:
                        tgtSent = tgtSent.lower()
                    print('GOLD %d: %s ' % (count, tgtSent))
                    print("GOLD SCORE: %.4f" % goldScore[b])
                print('')
        srcBatch, tgtBatch = [], []
    reportScore('PRED', predScoreTotal, predWordsTotal)
    if tgtF:
        reportScore('GOLD', goldScoreTotal, goldWordsTotal)
    if tgtF:
        tgtF.close()
    if opt.dump_beam:
        json.dump(translator.beam_accum, open(opt.dump_beam, 'w'))
# NOTE(review): `return predictions` below is the tail of a function whose
# definition is outside this view; indentation reconstructed accordingly.
    return predictions


@app.route('/translate', methods=['POST'])
def config():
    """Flask endpoint: translate each sentence in the posted JSON list and
    return the flattened results as JSON."""
    req = request.get_json()
    res = []
    for s in req:
        res.append(translate(s))
    # sum(res, []) flattens the per-sentence result lists into one list.
    return jsonify(sum(res, []))


if __name__ == '__main__':
    # Load one translator per model path and serve them via Flask.
    opt = parser.parse_args()
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    for model in opt.model:
        print("Loading model... " + model)
        modelopt = copy.copy(opt)
        modelopt.model = model
        # NOTE(review): `hash` shadows the builtin of the same name.
        hash = hash_byname(model)
        translators[hash] = onmt.Translator(modelopt, dummy_opt.__dict__)
    app.run(debug=False, host='0.0.0.0', port=8092)
def main(arg_list=None):
    """Evaluate a model on the task-specific test split.

    Resolves src/tgt file paths from opt.task/opt.data, translates the
    whole test set (the last partial batch included), writes 1-best
    hypotheses (BPE-joined if opt.bpe) to opt.output, reports PRED/GOLD
    scores, and optionally saves encoder representations.
    """
    opt = parse_arg(arg_list)
    if opt.task == "simp":
        opt.src = "../../data_%s/%s/test/test.normal" % (opt.task, opt.data)
        opt.tgt = "../../data_%s/%s/test/test.simple.0" % (opt.task, opt.data)
    elif opt.task == "MT":
        #opt.src = "../../data_%s/%s/test.de-en.de" % (opt.task, opt.data)
        #opt.tgt = "../../data_%s/%s/test.de-en.en" % (opt.task, opt.data)
        opt.src = "../../data_%s/%s/test.en-zh.en" % (opt.task, opt.data)
        opt.tgt = "../../data_%s/%s/test.en-zh.zh" % (opt.task, opt.data)
    elif opt.task == "Multi-MT":
        line = opt.language_pair.split("-")
        S_lang = line[0]
        T_lang = line[1]
        opt.src = "../../data_%s/%s/%s.%s.%s" % (
            opt.task, opt.data, opt.test_set, opt.language_pair, S_lang)
        opt.tgt = "../../data_%s/%s/%s.%s.%s" % (
            opt.task, opt.data, opt.test_set, opt.language_pair, T_lang)
    else:
        assert False
    if opt.output is None:
        opt.output = os.path.dirname(opt.model) + "/" + "test.txt"
    opt.gpu = opt.gpus
    opt.cuda = opt.gpu > -1
    # Fix: guard device selection — the original called
    # torch.cuda.set_device(opt.gpu) unconditionally, which is invalid on
    # CPU-only runs (gpu == -1) even though opt.cuda was computed.
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    translator = onmt.Translator(opt)
    outF = open(opt.output, 'w')
    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0
    srcBatch, tgtBatch = [], []
    count = 0
    tgtF = open(opt.tgt) if opt.tgt else None
    src_contents = open(opt.src).readlines()
    representations = []
    src_rb_list = []
    for line_num, line in enumerate(src_contents):
        srcTokens = line.split()
        srcBatch += [srcTokens]
        if tgtF:
            tgtTokens = tgtF.readline().split() if tgtF else None
            tgtBatch += [tgtTokens]
        # Flush on the last line even if the batch is not full.
        if line_num < len(src_contents) - 1 and \
                len(srcBatch) < opt.batch_size:
            continue
        predBatch, predScore, goldScore, rep, src_rb = translator.translate(
            srcBatch, tgtBatch)
        representations.append(rep)
        src_rb_list.append(src_rb)
        '''predScoreTotal += sum(score[0] for score in predScore)
        predWordsTotal += sum(len(x[0]) for x in predBatch)
        if tgtF is not None:
            goldScoreTotal += sum(goldScore)
            goldWordsTotal += sum(len(x) for x in tgtBatch)'''
        for b in range(len(predBatch)):
            count += 1
            pred_sent = " ".join(predBatch[b][0])
            if opt.bpe:
                pred_sent = pred_sent.replace("@@ ", "")
            outF.write(pred_sent + '\n')
            if opt.verbose:
                print('SENT %d: %s' % (count, " ".join(srcBatch[b])))
                print('PRED %d: %s' % (count, " ".join(predBatch[b][0])))
                if opt.bpe:
                    print('PRED CON %d: %s' % (count, pred_sent))
                print("PRED SCORE: %.4f" % predScore[b][0])
                if tgtF is not None:
                    print('GOLD %d: %s ' % (count, " ".join(tgtBatch[b])))
                    print("GOLD SCORE: %.4f" % goldScore[b])
                if opt.n_best > 1:
                    print('\nBEST HYP:')
                    for n in range(opt.n_best):
                        # Fix: print the n-th hypothesis; the original
                        # printed predBatch[b][0] for every n.
                        print("[%.4f] %s" % (predScore[b][n],
                                             " ".join(predBatch[b][n])))
                    print('')
            predScoreTotal += predScore[b][0]
            predWordsTotal += len(predBatch[b][0])
            if tgtF is not None:
                goldScoreTotal += goldScore[b]
                goldWordsTotal += len(tgtBatch[b])
        '''reportScore('PRED', predScoreTotal, predWordsTotal)
        if tgtF:
            reportScore('GOLD', goldScoreTotal, goldWordsTotal)'''
        srcBatch, tgtBatch = [], []
    reportScore('PRED', predScoreTotal, predWordsTotal)
    if tgtF:
        reportScore('GOLD', goldScoreTotal, goldWordsTotal)
    if tgtF:
        tgtF.close()
    outF.close()
    if opt.output_representation:
        save_data = {
            "representations": torch.cat(representations),
            "src_rb": torch.cat(src_rb_list)
        }
        torch.save(save_data, opt.output_representation)
def main():
    """Batch-translate ``opt.src`` and write 1-best hypotheses to ``opt.output``.

    Sentences are accumulated into batches of ``opt.batch_size``; the input is
    wrapped with ``addone`` so a trailing ``None`` signals end-of-file and the
    final partial batch is still translated.  PRED/GOLD totals are reported at
    the end via ``reportScore``.
    """
    opt = parser.parse_args()
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    translator = onmt.Translator(opt)
    outF = codecs.open(opt.output, 'w', 'utf-8')
    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0
    srcBatch, tgtBatch = [], []
    count = 0
    tgtF = codecs.open(opt.tgt, 'r', 'utf-8') if opt.tgt else None
    if opt.dump_beam != "":
        import json
        translator.initBeamAccum()
    # `line is None` marks the sentinel appended by addone (end of input).
    for line in addone(codecs.open(opt.src, 'r', 'utf-8')):
        if line is not None:
            srcTokens = line.split()
            srcBatch += [srcTokens]
            if tgtF:
                tgtTokens = tgtF.readline().split() if tgtF else None
                tgtBatch += [tgtTokens]
            # Keep accumulating until a full batch is available.
            if len(srcBatch) < opt.batch_size:
                continue
        else:
            # at the end of file, check last batch
            if len(srcBatch) == 0:
                break
        print(srcBatch)
        print(tgtBatch)
        predBatch, predScore, goldScore, encStates = translator.translate(
            srcBatch, tgtBatch)
        # predBatch[b][n] is the n-th hypothesis (token list) for sentence b.
        print(predBatch[0][0][1:-1])
        # print(encStates[-1][0])
        predScoreTotal += sum(score[0] for score in predScore)
        predWordsTotal += sum(len(x[0]) for x in predBatch)
        if tgtF is not None:
            goldScoreTotal += sum(goldScore)
            goldWordsTotal += sum(len(x) for x in tgtBatch)
        for b in range(len(predBatch)):
            count += 1
            outF.write(" ".join(predBatch[b][0]) + '\n')
            outF.flush()
            if opt.verbose:
                srcSent = ' '.join(srcBatch[b])
                if translator.tgt_dict.lower:
                    srcSent = srcSent.lower()
                # os.write of explicit UTF-8 bytes sidesteps stdout-encoding
                # errors for non-ASCII text.
                os.write(1, bytes('SENT %d: %s\n' % (count, srcSent), 'UTF-8'))
                os.write(
                    1,
                    bytes('PRED %d: %s\n' %
                          (count, " ".join(predBatch[b][0])), 'UTF-8'))
                print("PRED SCORE: %.4f" % predScore[b][0])
                if tgtF is not None:
                    tgtSent = ' '.join(tgtBatch[b])
                    if translator.tgt_dict.lower:
                        tgtSent = tgtSent.lower()
                    os.write(
                        1, bytes('GOLD %d: %s\n' % (count, tgtSent), 'UTF-8'))
                    print("GOLD SCORE: %.4f" % goldScore[b])
                if opt.n_best > 1:
                    print('\nBEST HYP:')
                    for n in range(opt.n_best):
                        os.write(
                            1,
                            bytes(
                                "[%.4f] %s\n" %
                                (predScore[b][n],
                                 " ".join(predBatch[b][n])), 'UTF-8'))
                    print('')
        srcBatch, tgtBatch = [], []
    reportScore('PRED', predScoreTotal, predWordsTotal)
    if tgtF:
        reportScore('GOLD', goldScoreTotal, goldWordsTotal)
    if tgtF:
        tgtF.close()
    if opt.dump_beam:
        json.dump(translator.beam_accum,
                  codecs.open(opt.dump_beam, 'w', 'utf-8'))
def main():
    """Translate a test set (text or audio features) with an ONMT model.

    Dispatches on ``opt.encoder_type``: "audio" sources are read from
    h5/scp feature files (with optional frame striding, frame concatenation
    and previous-context stitching); everything else is read as text via
    ``addone``, whose trailing ``None`` flushes the last partial batch.
    Per-batch decoding output and score bookkeeping is delegated to
    ``translateBatch``.
    """
    opt = parser.parse_args()
    # by me
    # Update: just keep the parser's original default here.
    # opt.bos_token = onmt.Constants.BERT_CLS_WORD
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    # Always pick n_best
    opt.n_best = opt.beam_size
    if opt.output == "stdout":
        outF = sys.stdout
    else:
        outF = open(opt.output, 'w')
    pred_score_total, pred_words_total, gold_score_total, gold_words_total = 0, 0, 0, 0
    src_batch, tgt_batch = [], []
    count = 0
    tgtF = open(opt.tgt) if opt.tgt else None
    #
    # if opt.dump_beam != "":
    #     import json
    #     translator.initBeamAccum()
    # Select the input source: stdin, h5 feature file, Kaldi scp, or text.
    in_file = None
    if opt.src == "stdin":
        in_file = sys.stdin
        opt.batch_size = 1
    elif opt.encoder_type == "audio" and opt.asr_format == "h5":
        in_file = h5.File(opt.src, 'r')
    elif opt.encoder_type == "audio" and opt.asr_format == "scp":
        import kaldiio
        from kaldiio import ReadHelper
        audio_data = iter(ReadHelper('scp:' + opt.src))
    else:
        in_file = open(opt.src)
    if not opt.fast_translate:
        translator = onmt.Translator(opt)
    else:
        from onmt.inference.FastTranslator import FastTranslator
        translator = FastTranslator(opt)
    # Audio processing for the source batch
    if opt.encoder_type == "audio":
        s_prev_context = []
        t_prev_context = []
        i = 0  # utterance index into the h5 file
        while True:
            # Fetch the next utterance's feature matrix.
            if opt.asr_format == "h5":
                if i == len(in_file):
                    break
                line = np.array(in_file[str(i)])
                i += 1
            elif opt.asr_format == "scp":
                try:
                    _, line = next(audio_data)
                except StopIteration:
                    break
            if opt.stride != 1:
                # Subsample frames.
                line = line[0::opt.stride]
            line = torch.from_numpy(line)
            if opt.concat != 1:
                # Zero-pad so the frame count divides opt.concat, then stack
                # opt.concat consecutive frames into one wider frame.
                add = (opt.concat - line.size()[0] % opt.concat) % opt.concat
                z = torch.FloatTensor(add, line.size()[1]).zero_()
                line = torch.cat((line, z), 0)
                line = line.reshape((line.size()[0] // opt.concat,
                                     line.size()[1] * opt.concat))
            if opt.previous_context > 0:
                # Prepend up to opt.previous_context earlier utterances,
                # separated by a zero frame.
                # NOTE(review): this inner loop reuses `i`, clobbering the h5
                # utterance index above when previous_context > 0 — confirm.
                s_prev_context.append(line)
                for i in range(1, opt.previous_context + 1):
                    if i < len(s_prev_context):
                        line = torch.cat((torch.cat(
                            (s_prev_context[-i - 1],
                             torch.zeros(1, line.size()[1]))), line))
                if len(s_prev_context) > opt.previous_context:
                    s_prev_context = s_prev_context[-1 * opt.previous_context:]
            src_batch += [line]
            if tgtF:
                # ~ tgt_tokens = tgtF.readline().split() if tgtF else None
                tline = tgtF.readline().strip()
                if opt.previous_context > 0:
                    # Mirror the source-side context stitching on the target
                    # side, joining sentences with " # ".
                    t_prev_context.append(tline)
                    for i in range(1, opt.previous_context + 1):
                        if i < len(s_prev_context):
                            tline = t_prev_context[-i - 1] + " # " + tline
                    if len(t_prev_context) > opt.previous_context:
                        t_prev_context = t_prev_context[-1 * opt.previous_context:]
                if opt.input_type == 'word':
                    tgt_tokens = tline.split() if tgtF else None
                elif opt.input_type == 'char':
                    tgt_tokens = list(tline.strip()) if tgtF else None
                else:
                    raise NotImplementedError("Input type unknown")
                tgt_batch += [tgt_tokens]
            if len(src_batch) < opt.batch_size:
                continue
            print("Batch size:", len(src_batch), len(tgt_batch))
            pred_batch, pred_score, pred_length, gold_score, num_gold_words, all_gold_scores = translator.translate(
                src_batch, tgt_batch, type='asr')
            print("Result:", len(pred_batch))
            count, pred_score, pred_words, gold_score, goldWords = translateBatch(
                opt, tgtF, count, outF, translator, src_batch, tgt_batch,
                pred_batch, pred_score, pred_length, gold_score,
                num_gold_words, all_gold_scores, opt.input_type)
            pred_score_total += pred_score
            pred_words_total += pred_words
            gold_score_total += gold_score
            gold_words_total += goldWords
            src_batch, tgt_batch = [], []
        # catch the last batch
        if len(src_batch) != 0:
            print("Batch size:", len(src_batch), len(tgt_batch))
            pred_batch, pred_score, pred_length, gold_score, num_gold_words, all_gold_scores = translator.translate(
                src_batch, tgt_batch, type='asr')
            print("Result:", len(pred_batch))
            count, pred_score, pred_words, gold_score, goldWords = translateBatch(
                opt, tgtF, count, outF, translator, src_batch, tgt_batch,
                pred_batch, pred_score, pred_length, gold_score,
                num_gold_words, all_gold_scores, opt.input_type)
            pred_score_total += pred_score
            pred_words_total += pred_words
            gold_score_total += gold_score
            gold_words_total += goldWords
            src_batch, tgt_batch = [], []
    # Text processing
    else:
        # addone makes the input iterable here since we batch by
        # opt.batch_size (its trailing None flushes the final partial batch).
        for line in addone(in_file):
            if line is not None:
                if opt.input_type == 'word':
                    src_tokens = line.split()
                elif opt.input_type == 'char':
                    src_tokens = list(line.strip())
                else:
                    raise NotImplementedError("Input type unknown")
                src_batch += [src_tokens]
                # tgtF: None
                if tgtF:
                    # ~ tgt_tokens = tgtF.readline().split() if tgtF else None
                    if opt.input_type == 'word':
                        tgt_tokens = tgtF.readline().split() if tgtF else None
                    elif opt.input_type == 'char':
                        tgt_tokens = list(
                            tgtF.readline().strip()) if tgtF else None
                    else:
                        raise NotImplementedError("Input type unknown")
                    tgt_batch += [tgt_tokens]
                if len(src_batch) < opt.batch_size:
                    continue
            else:
                # at the end of file, check last batch
                # src_batch is a list; normally its len is non-zero here
                if len(src_batch) == 0:
                    break
            # actually done beam search from the model
            pred_batch, pred_score, pred_length, gold_score, num_gold_words, all_gold_scores = translator.translate(
                src_batch, tgt_batch)
            # convert output tensor to words
            count, pred_score, pred_words, gold_score, goldWords = translateBatch(
                opt, tgtF, count, outF, translator, src_batch, tgt_batch,
                pred_batch, pred_score, pred_length, gold_score,
                num_gold_words, all_gold_scores, opt.input_type)
            pred_score_total += pred_score
            pred_words_total += pred_words
            gold_score_total += gold_score
            gold_words_total += goldWords
            src_batch, tgt_batch = [], []
    if opt.verbose:
        reportScore('PRED', pred_score_total, pred_words_total)
        if tgtF:
            reportScore('GOLD', gold_score_total, gold_words_total)
    if tgtF:
        tgtF.close()
    if opt.dump_beam:
        json.dump(translator.beam_accum, open(opt.dump_beam, 'w'))
def main():
    """Translate ``opt.src`` and, in verbose mode, print an experimental
    BLEU comparison between each source sentence and the previous one.

    NOTE(review): much of the body is exploratory, commented-out code about
    feeding sentences back through the encoder; it is preserved as-is.
    """
    previous_words = None  # source sentence text from the previous iteration
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]
    print('dummy_opt: ', dummy_opt)
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    translator = onmt.Translator(opt, dummy_opt.__dict__)
    out_file = codecs.open(opt.output, 'w', 'utf-8')
    pred_score_total, pred_words_total = 0, 0
    gold_score_total, gold_words_total = 0, 0
    if opt.dump_beam != "":
        import json
        translator.initBeamAccum()
    data = onmt.IO.ONMTDataset(opt.src, opt.tgt, translator.fields,
                               use_filter_pred=False)
    test_data = onmt.IO.OrderedIterator(dataset=data, device=opt.gpu,
                                        batch_size=opt.batch_size,
                                        train=False, sort=False,
                                        shuffle=False)
    counter = count(1)  # running sentence number for verbose output
    for batch in test_data:
        pred_batch, gold_batch, pred_scores, gold_scores, attn, src \
            = translator.translate(batch, data)
        pred_score_total += sum(score[0] for score in pred_scores)
        pred_words_total += sum(len(x[0]) for x in pred_batch)
        if opt.tgt:
            gold_score_total += sum(gold_scores)
            gold_words_total += sum(len(x) for x in batch.tgt[1:])
        # davidstap
        # _, src_lengths = batch.src
        # encStates, context = translator.model.encoder(src, src_lengths)
        # z_batch: an iterator over the predictions, their scores,
        # the gold sentence, its score, and the source sentence for each
        # sentence in the batch. It has to be zip_longest instead of
        # plain-old zip because the gold_batch has length 0 if the target
        # is not included.
        z_batch = zip_longest(
            pred_batch, gold_batch, pred_scores, gold_scores,
            (sent.squeeze(1) for sent in src.split(1, dim=1)))
        for pred_sents, gold_sent, pred_score, gold_score, src_sent in z_batch:
            # src_sent is torch.LongTensor
            # print('type src_sent:', type(src_sent))
            n_best_preds = [" ".join(pred) for pred in pred_sents[:opt.n_best]]
            out_file.write('\n'.join(n_best_preds))
            out_file.write('\n')
            out_file.flush()
            if opt.verbose:
                sent_number = next(counter)
                words = get_src_words(src_sent,
                                      translator.fields["src"].vocab.itos)
                if previous_words is not None:
                    # BLEU of the previous source sentence (hypothesis)
                    # against the current one (reference) — experimental.
                    print('BLEU: ', sentence_bleu([words], previous_words))
                    print()
                    print('S1:', words)
                    print('S2:', previous_words)
                # os.write(1, bytes('\nSENT %d: %s\n' %
                #                   (sent_number, words), 'UTF-8'))
                previous_words = words
                best_pred = n_best_preds[0]
                # TODO: calculate BLEU score reference (best_pred) and hypothesis (words)
                # TODO: calculate cosine_similarity (best_pred) and hypothesis (words)
                # bleu_score = sentence_bleu(best_pred, words)
                # print('BLEU: ', bleu_score)
                best_score = pred_score[0]
                # os.write(1, bytes('PRED %d: %s\n' %
                #                   (sent_number, best_pred), 'UTF-8'))
                # print("PRED SCORE: %.4f" % best_score)
                # 'words' = input sentence
                # 'best_pred' = prediction
                # put source sentence in translator.model.encoder to find context
                # maybe change data type src? torchtext datatype?
                # model = NMTModel(encoder, decoder) (see ModelConstructor)
                src_lengths = len(words.split())
                # src(FloatTensor): a sequence of source tensors with
                # optional feature tensors of size (len x batch).
                # tgt(FloatTensor): a sequence of target tensors with
                # optional feature tensors of size (len x batch).
                # lengths([int]): an array of the src length.
                # dec_state: A decoder state object
                # hidden, context = translator.model.encoder(src_sent, src_lengths)
                # euc_dist(context_r, context_pred)
                if opt.tgt:
                    tgt_sent = ' '.join(gold_sent)
                    os.write(
                        1,
                        bytes('GOLD %d: %s\n' % (sent_number, tgt_sent),
                              'UTF-8'))
                    print("GOLD SCORE: %.4f" % gold_score)
                if len(n_best_preds) > 1:
                    print('\nBEST HYP:')
                    for score, sent in zip(pred_score, n_best_preds):
                        os.write(1, bytes("[%.4f] %s\n" % (score, sent),
                                          'UTF-8'))
    report_score('PRED', pred_score_total, pred_words_total)
    if opt.tgt:
        report_score('GOLD', gold_score_total, gold_words_total)
    if opt.dump_beam:
        json.dump(translator.beam_accum,
                  codecs.open(opt.dump_beam, 'w', 'utf-8'))
def main():
    """Translate ``opt.src`` batch-by-batch, writing 1-best hypotheses to
    ``opt.output``, with optional verbose and attention-debug printing.

    NOTE(review): ``srcBatch``/``tgtBatch`` are initialized but never filled
    (batches come from the ONMTDataset iterator instead), so the GOLD word
    count is always 0, ``tgtBatch[b]`` would IndexError when both ``opt.tgt``
    and ``opt.verbose`` are set, and ``srcBatch[b][j]`` would IndexError
    under ``-attn_debug`` — confirm against the intended bookkeeping.
    """
    opt = parser.parse_args()
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    translator = onmt.Translator(opt, dummy_opt.__dict__)
    outF = codecs.open(opt.output, 'w', 'utf-8')
    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0
    srcBatch, tgtBatch = [], []
    count = 0
    if opt.dump_beam != "":
        import json
        translator.initBeamAccum()
    data = onmt.IO.ONMTDataset(opt.src, opt.tgt, translator.fields, None)
    testData = onmt.IO.OrderedIterator(
        dataset=data, device=opt.gpu,
        batch_size=opt.batch_size, train=False, sort=False,
        shuffle=False)
    index = 0
    for batch in testData:
        predBatch, predScore, goldScore, attn, src \
            = translator.translate(batch, data)
        predScoreTotal += sum(score[0] for score in predScore)
        predWordsTotal += sum(len(x[0]) for x in predBatch)
        if opt.tgt:
            goldScoreTotal += sum(goldScore)
            # NOTE(review): tgtBatch is always empty here (see docstring).
            goldWordsTotal += sum(len(x) for x in tgtBatch)
        for b in range(len(predBatch)):
            count += 1
            try:
                # python2
                outF.write(" ".join([i.decode('utf-8')
                                     for i in predBatch[b][0]]) + '\n')
            except AttributeError:
                # python3: can't do .decode on a str object
                outF.write(" ".join(predBatch[b][0]) + '\n')
            outF.flush()
            if opt.verbose:
                # Rebuild the source sentence up to the first pad token.
                words = []
                for f in src[:, b]:
                    word = translator.fields["src"].vocab.itos[f]
                    if word == onmt.IO.PAD_WORD:
                        break
                    words.append(word)
                os.write(1, bytes('SENT %d: %s\n' %
                                  (count, " ".join(words)), 'UTF-8'))
                index += 1
                print(len(predBatch[b][0]))
                os.write(1, bytes('\n PRED %d: %s\n' %
                                  (count, " ".join(predBatch[b][0])),
                                  'UTF-8'))
                print("PRED SCORE: %.4f" % predScore[b][0])
                if opt.tgt:
                    tgtSent = ' '.join(tgtBatch[b])
                    os.write(1, bytes('GOLD %d: %s\n' %
                                      (count, tgtSent), 'UTF-8'))
                    print("GOLD SCORE: %.4f" % goldScore[b])
                if opt.n_best > 1:
                    print('\nBEST HYP:')
                    for n in range(opt.n_best):
                        os.write(1, bytes("[%.4f] %s\n" %
                                          (predScore[b][n],
                                           " ".join(predBatch[b][n])),
                                          'UTF-8'))
                if opt.attn_debug:
                    print('')
                    # For each predicted word, show the 5 source positions
                    # with the highest attention weight.
                    for i, w in enumerate(predBatch[b][0]):
                        print(w)
                        _, ids = attn[b][0][i].sort(0, descending=True)
                        for j in ids[:5].tolist():
                            print("\t%s\t%d\t%3f" %
                                  (srcBatch[b][j], j, attn[b][0][i][j]))
        srcBatch, tgtBatch = [], []
    reportScore('PRED', predScoreTotal, predWordsTotal)
    if opt.tgt:
        reportScore('GOLD', goldScoreTotal, goldWordsTotal)
    if opt.dump_beam:
        json.dump(translator.beam_accum,
                  codecs.open(opt.dump_beam, 'w', 'utf-8'))
def main():
    """Translate ``opt.src`` with a loaded ONMT model, writing the n-best
    hypotheses per sentence to ``opt.output`` and reporting PRED/GOLD totals.

    Uses module-level ``opt``; ``dummy_opt`` only supplies the model-side
    option defaults that ``onmt.Translator`` expects.
    """
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    translator = onmt.Translator(opt, dummy_opt.__dict__)
    out_file = codecs.open(opt.output, 'w', 'utf-8')
    pred_score_total, pred_words_total = 0, 0
    gold_score_total, gold_words_total = 0, 0
    if opt.dump_beam != "":
        import json
        translator.initBeamAccum()
    data = onmt.IO.ONMTDataset(opt.src, opt.tgt, translator.fields,
                               use_filter_pred=False)
    test_data = onmt.IO.OrderedIterator(dataset=data, device=opt.gpu,
                                        batch_size=opt.batch_size,
                                        train=False, sort=False,
                                        shuffle=False)
    counter = count(1)  # running sentence number for verbose output
    for batch in test_data:
        pred_batch, gold_batch, pred_scores, gold_scores, attn, src \
            = translator.translate(batch, data)
        pred_score_total += sum(score[0] for score in pred_scores)
        pred_words_total += sum(len(x[0]) for x in pred_batch)
        if opt.tgt:
            gold_score_total += sum(gold_scores)
            # batch.tgt[1:] skips the leading BOS row when counting words.
            gold_words_total += sum(len(x) for x in batch.tgt[1:])
        # z_batch: an iterator over the predictions, their scores,
        # the gold sentence, its score, and the source sentence for each
        # sentence in the batch. It has to be zip_longest instead of
        # plain-old zip because the gold_batch has length 0 if the target
        # is not included.
        z_batch = zip_longest(
            pred_batch, gold_batch, pred_scores, gold_scores,
            (sent.squeeze(1) for sent in src.split(1, dim=1)))
        for pred_sents, gold_sent, pred_score, gold_score, src_sent in z_batch:
            n_best_preds = [" ".join(pred) for pred in pred_sents[:opt.n_best]]
            out_file.write('\n'.join(n_best_preds))
            out_file.write('\n')
            out_file.flush()
            if opt.verbose:
                sent_number = next(counter)
                words = get_src_words(src_sent,
                                      translator.fields["src"].vocab.itos)
                # os.write of explicit UTF-8 bytes sidesteps stdout-encoding
                # errors for non-ASCII text.
                os.write(
                    1, bytes('\nSENT %d: %s\n' %
                             (sent_number, words), 'UTF-8'))
                best_pred = n_best_preds[0]
                best_score = pred_score[0]
                os.write(
                    1, bytes('PRED %d: %s\n' %
                             (sent_number, best_pred), 'UTF-8'))
                print("PRED SCORE: %.4f" % best_score)
                if opt.tgt:
                    tgt_sent = ' '.join(gold_sent)
                    os.write(
                        1,
                        bytes('GOLD %d: %s\n' % (sent_number, tgt_sent),
                              'UTF-8'))
                    print("GOLD SCORE: %.4f" % gold_score)
                if len(n_best_preds) > 1:
                    print('\nBEST HYP:')
                    for score, sent in zip(pred_score, n_best_preds):
                        os.write(1, bytes("[%.4f] %s\n" % (score, sent),
                                          'UTF-8'))
    report_score('PRED', pred_score_total, pred_words_total)
    if opt.tgt:
        report_score('GOLD', gold_score_total, gold_words_total)
    if opt.dump_beam:
        json.dump(translator.beam_accum,
                  codecs.open(opt.dump_beam, 'w', 'utf-8'))
def __init__(self, model):
    """Wrap an ``onmt.Translator`` built from the stored parameters of *model*."""
    self.translator = onmt.Translator(TranslatorParameter(model))
def translate(src, model, output):
    """Translate file *src* with *model*, writing hypotheses to *output*.

    The source sentences, detokenized through the model's source vocabulary,
    are written alongside to ``gold_<output>``.  Batch size is forced to 1
    (larger values are not supported by this path).

    Args:
        src: path to the source file to translate.
        model: path to the model checkpoint.
        output: path for the hypothesis output file.
    """
    parser = argparse.ArgumentParser(
        description='translate.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    opts.translate_opts(parser)
    opt = parser.parse_known_args([])[0]
    if opt.batch_size != 1:
        print("WARNING: -batch_size isn't supported currently, "
              "we set it to 1 for now!")
        opt.batch_size = 1
    opt.src = src
    opt.model = model
    opt.output = output
    # Model-side option defaults expected by onmt.Translator.
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    translator = onmt.Translator(opt, dummy_opt.__dict__)
    out_file = codecs.open(opt.output, 'w', 'utf-8')
    gold_out_file = codecs.open("gold_" + opt.output, 'w', 'utf-8')
    # print "TRANSLATOR SOURCE VOCAB"
    # for i in range(len(translator.fields["src"].vocab.itos)):
    #     print i, translator.fields["src"].vocab.itos[i]
    try:
        data = onmt.IO.ONMTDataset(opt.src, opt.tgt, translator.fields,
                                   use_filter_pred=False)
        test_data = onmt.IO.OrderedIterator(dataset=data, device=opt.gpu,
                                            batch_size=opt.batch_size,
                                            train=False, sort=False,
                                            shuffle=False)
        for batch in test_data:
            # FIX: bind the returned source tensor to src_tensor instead of
            # shadowing the `src` parameter as the original did.
            pred_batch, gold_batch, pred_scores, gold_scores, attn, src_tensor \
                = translator.translate(batch, data)
            # z_batch: an iterator over the predictions, their scores,
            # the gold sentence, its score, and the source sentence for each
            # sentence in the batch. It has to be zip_longest instead of
            # plain-old zip because the gold_batch has length 0 if the target
            # is not included.
            z_batch = zip_longest(
                pred_batch, gold_batch, pred_scores, gold_scores,
                (sent.squeeze(1) for sent in src_tensor.split(1, dim=1)))
            for pred_sents, gold_sent, pred_score, gold_score, src_sent in z_batch:
                n_best_preds = [" ".join(pred)
                                for pred in pred_sents[:opt.n_best]]
                out_file.write('\n'.join(n_best_preds))
                out_file.write('\n')
                out_file.flush()
                words = get_src_words(src_sent,
                                      translator.fields["src"].vocab.itos)
                # print words
                gold_out_file.write(words)
                gold_out_file.write('\n')
                gold_out_file.flush()
    finally:
        # FIX: the original leaked both output handles.
        out_file.close()
        gold_out_file.close()
def main():
    """Translate ``opt.src``, optionally applying interaction vectors.

    When ``opt.inter`` is given, the dataset is built with the interaction
    file and the translator is told to use it.  Writes ``opt.n_best``
    hypotheses per sentence to ``opt.output``; can pickle the first
    sentence's attention matrix when ``opt.save_attention`` is set.
    """
    opt = parser.parse_args()
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    # Creates the translator!!!
    translator = onmt.Translator(opt, dummy_opt.__dict__)
    outF = codecs.open(opt.output, 'w', 'utf-8')
    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0
    count = 0
    if opt.dump_beam != "":
        import json
        translator.initBeamAccum()
    # Process the data for the test
    data = onmt.IO.ONMTDataset(opt.src, opt.tgt, translator.fields, None,
                               opt.inter)
    testData = onmt.IO.OrderedIterator(
        dataset=data, device=opt.gpu,
        batch_size=opt.batch_size, train=False, sort=False,
        shuffle=False)
    # FIX: idiomatic None test (`is not None`) instead of `!= None`.
    inter_act = opt.inter is not None
    index = 0
    for batch in testData:
        predBatch, goldBatch, predScore, goldScore, attn, src \
            = translator.translate(batch, data, inter_act)
        # print((attn[0][0]))
        # print(predBatch)
        if opt.save_attention:
            # Dump the first sentence's attention matrix, transposed.
            attn_numpy = attn[0][0].numpy()
            # print(attn_numpy.T.shape)
            pickle.dump(attn_numpy.T, open('attention_matrix.pkl', 'wb'))
        predScoreTotal += sum(score[0] for score in predScore)
        predWordsTotal += sum(len(x[0]) for x in predBatch)
        if opt.tgt:
            goldScoreTotal += sum(goldScore)
            goldWordsTotal += sum(len(x) for x in batch.tgt[1:])
        for b in range(len(predBatch)):
            count += 1
            # FIX: the original wrapped these writes in try/except
            # AttributeError whose two branches were behaviorally identical
            # (no .decode was ever called), so the handler was dead code.
            for n in range(opt.n_best):
                outF.write(" ".join(predBatch[b][n]) + '\n')
            outF.flush()
            if opt.verbose:
                # Rebuild the source sentence up to the first pad token.
                words = []
                for f in src[:, b]:
                    word = translator.fields["src"].vocab.itos[f]
                    if word == onmt.IO.PAD_WORD:
                        break
                    words.append(word)
                os.write(1, bytes('\nSENT %d: %s\n' %
                                  (count, " ".join(words)), 'UTF-8'))
                index += 1
                os.write(1, bytes('PRED %d: %s\n' %
                                  (count, " ".join(predBatch[b][0])),
                                  'UTF-8'))
                print("PRED SCORE: %.4f" % predScore[b][0])
                if opt.tgt:
                    tgtSent = ' '.join(goldBatch[b])
                    os.write(1, bytes('GOLD %d: %s\n' % (count, tgtSent),
                                      'UTF-8'))
                    print("GOLD SCORE: %.4f" % goldScore[b])
                if opt.n_best > 1:
                    print('\nBEST HYP:')
                    for n in range(opt.n_best):
                        os.write(1, bytes("[%.4f] %s\n" %
                                          (predScore[b][n],
                                           " ".join(predBatch[b][n])),
                                          'UTF-8'))
    # FIX: the output handle was never closed in the original.
    outF.close()
    reportScore('PRED', predScoreTotal, predWordsTotal)
    if opt.tgt:
        reportScore('GOLD', goldScoreTotal, goldWordsTotal)
    if opt.dump_beam:
        json.dump(translator.beam_accum,
                  codecs.open(opt.dump_beam, 'w', 'utf-8'))