def test_build_vocab_idx():
    """Build a vocabulary from the hard-coded training file and print it.

    Manual smoke test: instances are read from disk, falsy (empty)
    instances are discarded, and the resulting word-to-index mapping is
    printed for inspection.
    """
    corpus_path = '/home/peng/Workspace/data/multi30k/train.en'
    min_word_count = 2

    sentences = read_instances_from_file(corpus_path, 30, False)
    # Drop empty instances before counting words.
    non_empty = list(filter(None, sentences))

    print(build_vocab_idx(non_empty, min_word_count))
def main():
    '''Translate a source file with a trained model and save the output.

    All inputs come from the command line: the model checkpoint, the
    source text, the preprocessed vocabulary file, the output path and
    the decoding options. One line is written per decoded hypothesis.
    '''
    arg_parser = argparse.ArgumentParser(description='translate.py')
    add = arg_parser.add_argument
    add('-model', required=True, help='Path to model .pt file')
    add('-src', required=True,
        help='Source sequence to decode (one line per sequence)')
    add('-vocab', required=True,
        help='Source sequence to decode (one line per sequence)')
    add('-output', default='pred.txt',
        help="""Path to output the predictions (each line will be the decoded sequence""")
    add('-beam_size', type=int, default=5, help='Beam size')
    add('-batch_size', type=int, default=30, help='Batch size')
    add('-n_best', type=int, default=1,
        help="""If verbose is set, will output the n_best decoded sentences""")
    add('-no_cuda', action='store_true')

    opt = arg_parser.parse_args()
    opt.cuda = not opt.no_cuda

    # The preprocessed bundle carries both vocabularies and the settings
    # that were used when the training data was prepared.
    bundle = torch.load(opt.vocab)
    settings = bundle['settings']

    src_words = read_instances_from_file(
        opt.src, settings.max_word_seq_len, settings.keep_case)
    src_indices = convert_instance_to_idx_seq(
        src_words, bundle['dict']['src'])

    test_data = DataLoader(
        bundle['dict']['src'],
        bundle['dict']['tgt'],
        src_insts=src_indices,
        cuda=opt.cuda,
        shuffle=False,
        batch_size=opt.batch_size)

    translator = Translator(opt)
    translator.model.eval()

    with open(opt.output, 'w') as out_file:
        progress = tqdm(test_data, mininterval=2, desc=' - (Test)',
                        leave=False)
        for batch in progress:
            hypotheses, _scores = translator.translate_batch(batch)
            for beams in hypotheses:
                for token_ids in beams:
                    words = [test_data.tgt_idx2word[i] for i in token_ids]
                    out_file.write(' '.join(words) + '\n')
    print('[Info] Finished.')
def test_read_instances_from_file():
    """Read the hard-coded training file and print each instance.

    Manual smoke test for ``read_instances_from_file`` using a maximum
    sentence length of 40 with case preserved.
    """
    corpus_path = '/home/peng/Workspace/data/multi30k/train.en'
    sentences = read_instances_from_file(corpus_path, 40, True)
    for sentence in sentences:
        print(sentence)
def main():
    """Decode a source file with a trained model and write the hypotheses.

    Options are read from the command line; every decoded hypothesis is
    written on its own line of the output file.
    """
    parser = argparse.ArgumentParser()

    # (flag, keyword arguments) pairs, registered in order below.
    cli_options = [
        ('-model', dict(required=True, help='Path to model .pt file')),
        ('-src', dict(
            required=True,
            help='Source sequence to decode (one line per sequence)')),
        ('-vocab', dict(
            required=True,
            help='Source sequence to decode (one line per sequence)')),
        ('-output', dict(
            default='pred.txt',
            help="""Path to output the predictions (each line will be the decoded sequence""")),
        ('-beam_size', dict(type=int, default=5, help='Beam size')),
        ('-batch_size', dict(type=int, default=30, help='Batch size')),
        ('-n_best', dict(
            type=int, default=1,
            help="""If verbose is set, will output the n_best decoded sentences""")),
        # store_true: the attribute becomes True when the flag is given.
        ('-no_cuda', dict(action='store_true')),
    ]
    for flag, kwargs in cli_options:
        parser.add_argument(flag, **kwargs)

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Load vocabularies and preprocessing settings.
    preprocessed = torch.load(opt.vocab)
    settings = preprocessed['settings']

    word_insts = read_instances_from_file(
        opt.src, settings.max_word_seq_len, settings.keep_case)
    idx_insts = convert_instance_to_idx_seq(
        word_insts, preprocessed['dict']['src'])

    dataset = TranslationDataset(
        src_word2idx=preprocessed['dict']['src'],
        tgt_word2idx=preprocessed['dict']['tgt'],
        src_insts=idx_insts)
    test_loader = torch.utils.data.DataLoader(
        dataset,
        num_workers=2,
        batch_size=opt.batch_size,
        collate_fn=collate_fn)

    translator = Translator(opt)

    with open(opt.output, 'w') as out_file:
        batches = tqdm(test_loader, mininterval=2, desc=' - (Test)',
                       leave=False)
        for batch in batches:
            hypotheses, _scores = translator.translate_batch(*batch)
            for stream in hypotheses:
                for hyp in stream:
                    tokens = [test_loader.dataset.tgt_idx2word[idx]
                              for idx in hyp]
                    out_file.write(' '.join(tokens) + '\n')
    print('[Info] Finished')
def main():
    '''Run batch translation from the command line.

    Loads the vocabulary bundle, converts the source file to index
    sequences, decodes every batch with ``Translator`` and writes one
    line per hypothesis to the output file.
    '''
    cli = argparse.ArgumentParser(description='translate.py')
    cli.add_argument('-model', required=True,
                     help='Path to model .pt file')
    cli.add_argument('-src', required=True,
                     help='Source sequence to decode (one line per sequence)')
    cli.add_argument('-vocab', required=True,
                     help='Source sequence to decode (one line per sequence)')
    cli.add_argument('-output', default='pred.txt',
                     help="""Path to output the predictions (each line will be the decoded sequence""")
    cli.add_argument('-beam_size', type=int, default=5, help='Beam size')
    cli.add_argument('-batch_size', type=int, default=30, help='Batch size')
    cli.add_argument('-n_best', type=int, default=1,
                     help="""If verbose is set, will output the n_best decoded sentences""")
    cli.add_argument('-no_cuda', action='store_true')
    opt = cli.parse_args()
    opt.cuda = not opt.no_cuda

    # Vocabulary and preprocessing settings saved at preprocessing time.
    saved = torch.load(opt.vocab)
    saved_settings = saved['settings']

    raw_sentences = read_instances_from_file(
        opt.src,
        saved_settings.max_word_seq_len,
        saved_settings.keep_case)
    encoded_sentences = convert_instance_to_idx_seq(
        raw_sentences, saved['dict']['src'])

    test_data = DataLoader(
        saved['dict']['src'],
        saved['dict']['tgt'],
        src_insts=encoded_sentences,
        cuda=opt.cuda,
        shuffle=False,
        batch_size=opt.batch_size)

    translator = Translator(opt)
    translator.model.eval()

    with open(opt.output, 'w') as sink:
        for batch in tqdm(test_data, mininterval=2, desc=' - (Test)',
                          leave=False):
            all_hyp, _all_scores = translator.translate_batch(batch)
            # Flatten (sentence -> hypotheses -> ids) into output lines.
            sink.writelines(
                ' '.join([test_data.tgt_idx2word[idx] for idx in ids]) + '\n'
                for candidates in all_hyp
                for ids in candidates)
    print('[Info] Finished.')
def main():
    """Translate a paired test set, dump the decodings and log word accuracy.

    For every batch the best hypothesis of each sentence is padded or
    truncated to the gold length and scored with ``cal_performance``.
    Source, target and predicted sentences are written to ``-output``;
    the overall accuracy is appended to ``-log``.

    Fixes relative to the previous version:
      * the flattened prediction tensor no longer assumes every batch
        holds exactly ``opt.batch_size`` sentences (the last batch is
        usually smaller);
      * hypotheses are padded on a copy, so padding ids no longer leak
        into the ``[pred]`` lines of the output file;
      * an empty test set no longer raises ``ZeroDivisionError``.
    """
    parser = argparse.ArgumentParser(description='translate.py')
    parser.add_argument('-model', required=True,
                        help='Path to model .pt file')
    parser.add_argument('-src', required=True,
                        help='Source sequence to decode '
                             '(one line per sequence)')
    parser.add_argument('-tgt', required=True,
                        help='Target sequence to decode '
                             '(one line per sequence)')
    parser.add_argument('-vocab', required=True,
                        help='Source sequence to decode '
                             '(one line per sequence)')
    parser.add_argument('-log', default='translate_log.txt',
                        help="""Path to log the translation(test_inference) loss""")
    parser.add_argument('-output', default='pred.txt',
                        help="""Path to output the predictions (each line will be the decoded sequence""")
    parser.add_argument('-beam_size', type=int, default=5, help='Beam size')
    parser.add_argument('-batch_size', type=int, default=2, help='Batch size')
    parser.add_argument('-n_best', type=int, default=1,
                        help="""If verbose is set, will output the n_best decoded sentences""")
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Prepare DataLoader
    preprocess_data = torch.load(opt.vocab)
    preprocess_settings = preprocess_data['settings']
    test_src_word_insts = read_instances_from_file(
        opt.src, preprocess_settings.max_word_seq_len,
        preprocess_settings.keep_case)
    test_tgt_word_insts = read_instances_from_file(
        opt.tgt, preprocess_settings.max_word_seq_len,
        preprocess_settings.keep_case)
    test_src_insts = convert_instance_to_idx_seq(
        test_src_word_insts, preprocess_data['dict']['src'])
    test_tgt_insts = convert_instance_to_idx_seq(
        test_tgt_word_insts, preprocess_data['dict']['tgt'])

    test_loader = torch.utils.data.DataLoader(
        TranslationDataset(
            src_word2idx=preprocess_data['dict']['src'],
            tgt_word2idx=preprocess_data['dict']['tgt'],
            src_insts=test_src_insts,
            tgt_insts=test_tgt_insts),
        num_workers=2,
        batch_size=opt.batch_size,
        collate_fn=paired_collate_fn)

    translator = Translator(opt)

    n_word_total = 0
    n_word_correct = 0
    separator = "\n ---------------------------------------------------------------------------------------------------------------------------------------------- \n"

    with open(opt.output, 'w') as f:
        for batch in tqdm(test_loader, mininterval=2, desc=' - (Test)',
                          leave=False):
            # batch[0] and batch[1] feed the translator; batch[2] holds
            # the gold target sequences.
            all_hyp, all_scores = translator.translate_batch(
                batch[0], batch[1])

            src_seqs = batch[0]
            tgt_seqs = batch[2]
            # Drop the first target position before comparing.
            gold = tgt_seqs[:, 1:]
            max_len = gold.shape[1]

            # Fit the best hypothesis of each sentence to max_len. Work
            # on a copy so all_hyp stays untouched for the text dump.
            pred_rows = []
            for hyp_group in all_hyp:
                row = list(hyp_group[0][:max_len])
                # Pad with 0 (assumed to match Constants.PAD - TODO confirm).
                row.extend([0] * (max_len - len(row)))
                pred_rows.append(row)

            flat_pred = torch.LongTensor(np.array(pred_rows))
            # view(-1) instead of batch_size * max_len: the last batch
            # may contain fewer than opt.batch_size sentences.
            flat_pred = flat_pred.view(-1)

            n_correct = cal_performance(flat_pred, gold)
            n_word_total += gold.ne(Constants.PAD).sum().item()
            n_word_correct += n_correct

            for src_seq, tgt_seq, hyp_group in zip(src_seqs, tgt_seqs,
                                                   all_hyp):
                # Source and target text are the same for every
                # hypothesis of a sentence, so build them once.
                src_line = ' '.join([
                    test_loader.dataset.src_idx2word[idx]
                    for idx in src_seq.data.cpu().numpy()
                ])
                tgt_line = ' '.join([
                    test_loader.dataset.tgt_idx2word[idx]
                    for idx in tgt_seq.data.cpu().numpy()
                ])
                for hyp in hyp_group:
                    pred_line = ' '.join([
                        test_loader.dataset.tgt_idx2word[idx] for idx in hyp
                    ])
                    f.write(separator)
                    f.write("\n [src] " + src_line + '\n')
                    f.write("\n [tgt] " + tgt_line + '\n')
                    f.write("\n [pred] " + pred_line + '\n')

    # Guard against an empty test set.
    accuracy = n_word_correct / n_word_total if n_word_total else 0.0
    accr_log = "accuracy: {} |".format(accuracy)
    with open(opt.log, 'a') as log_tf:
        log_tf.write(accr_log + '\n')
    print('[Info] Finished.')
def main():
    """Command-line entry point: decode a source file into target text.

    Reads the options, builds a ``TranslationDataset`` loader over the
    index-encoded source sentences and writes every hypothesis returned
    by ``Translator.translate_batch`` as one output line.
    """
    parser = argparse.ArgumentParser(description="translate.py")
    register = parser.add_argument
    register("-model", required=True, help="Path to model .pt file")
    register("-src", required=True,
             help="Source sequence to decode (one line per sequence)")
    register("-vocab", required=True,
             help="Source sequence to decode (one line per sequence)")
    register("-output", default="pred.txt",
             help="""Path to output the predictions (each line will be the decoded sequence""")
    register("-beam_size", type=int, default=5, help="Beam size")
    register("-batch_size", type=int, default=30, help="Batch size")
    register("-n_best", type=int, default=1,
             help="""If verbose is set, will output the n_best decoded sentences""")
    register("-no_cuda", action="store_true")

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Vocabulary bundle produced by the preprocessing step.
    vocab_bundle = torch.load(opt.vocab)
    prep_settings = vocab_bundle["settings"]

    source_words = read_instances_from_file(
        opt.src, prep_settings.max_word_seq_len, prep_settings.keep_case)
    source_ids = convert_instance_to_idx_seq(
        source_words, vocab_bundle["dict"]["src"])

    test_loader = torch.utils.data.DataLoader(
        TranslationDataset(
            src_word2idx=vocab_bundle["dict"]["src"],
            tgt_word2idx=vocab_bundle["dict"]["tgt"],
            src_insts=source_ids),
        num_workers=2,
        batch_size=opt.batch_size,
        collate_fn=collate_fn)

    translator = Translator(opt)

    def to_line(token_ids):
        # Map target indices back to words and join them with spaces.
        return " ".join(
            [test_loader.dataset.tgt_idx2word[idx] for idx in token_ids])

    with open(opt.output, "w") as out:
        for batch in tqdm(test_loader, mininterval=2, desc=" - (Test)",
                          leave=False):
            hyps, _scores = translator.translate_batch(*batch)
            for candidates in hyps:
                for token_ids in candidates:
                    out.write(to_line(token_ids) + "\n")
    print("[Info] Finished.")
def main():
    '''Translate a source file with ``Translator_idbs`` and save the result.

    Besides the usual model/source/vocabulary/output options, three
    diversity factors (``-lambda_1`` .. ``-lambda_3``) are accepted and
    passed to the translator through ``opt``. One line is written per
    sequence returned by ``translate_batch``.

    The output file is now managed by a ``with`` block so it is closed
    even when translation raises part-way through.
    '''
    parser = argparse.ArgumentParser(description='translate.py')
    parser.add_argument('-model', required=True,
                        help='Path to model .pt file')
    parser.add_argument(
        '-src', required=True,
        help='Source sequence to decode (one line per sequence)')
    parser.add_argument('-vocab', required=True,
                        help='preprocess file to provide vocabulary')
    parser.add_argument('-output', default='pred.txt',
                        help="""Path to output the predictions (each line will be the decoded sequence""")
    parser.add_argument('-beam_size', type=int, default=5, help='Beam size')
    parser.add_argument('-batch_size', type=int, default=30,
                        help='Batch size')
    parser.add_argument('-lambda_1', type=float, default=2 / 3,
                        help='diversity factor for hamming diversity')
    parser.add_argument('-lambda_2', type=float, default=2 / 3,
                        help='diversity factor for bi-gram diversity')
    parser.add_argument('-lambda_3', type=float, default=2 / 3,
                        help='diversity factor for tri-gram diversity')
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Prepare DataLoader
    preprocess_data = torch.load(opt.vocab)
    preprocess_settings = preprocess_data['settings']
    test_src_word_insts = read_instances_from_file(
        opt.src, preprocess_settings.max_word_seq_len,
        preprocess_settings.keep_case)
    test_src_insts = convert_instance_to_idx_seq(
        test_src_word_insts, preprocess_data['dict']['src'])

    test_data = DataLoader(preprocess_data['dict']['src'],
                           preprocess_data['dict']['tgt'],
                           src_insts=test_src_insts,
                           cuda=opt.cuda,
                           shuffle=False,
                           batch_size=opt.batch_size)

    translator = Translator_idbs(opt)
    translator.model.eval()

    print('[Info] Start translating...')
    with open(opt.output, 'w') as f:
        for batch in tqdm(test_data, mininterval=2, desc=' - (Test)',
                          leave=False):
            all_hyp = translator.translate_batch(batch)
            for idx_seq in all_hyp:
                # Convert the indices to words and join them.
                pred_line = ' '.join(
                    [test_data.tgt_idx2word[idx] for idx in idx_seq])
                f.write(pred_line + '\n')
                # Flush so partial results are visible during a long run.
                f.flush()
    print('[Info] Finished.')
def main():
    '''Translate a source file, save the text and print a BLEU score.

    Each hypothesis is turned into text (cut at the first "</s>", with
    the "▁" marker replaced by a space) and written to the output file.
    The token lists are then scored against the reference file given
    by ``-target`` using ``BLEUEvaluator``.
    '''
    parser = argparse.ArgumentParser(description='translate.py')
    add = parser.add_argument
    add('-model', required=True, help='Path to model .pt file')
    add('-src', required=True,
        help='Source sequence to decode (one line per sequence)')
    add('-target', required=True,
        help='Target sequence to decode (one line per sequence)')
    add('-vocab', required=True,
        help='Source sequence to decode (one line per sequence)')
    add('-output', default='pred.txt',
        help="""Path to output the predictions (each line will be the decoded sequence""")
    add('-beam_size', type=int, default=5, help='Beam size')
    add('-batch_size', type=int, default=30, help='Batch size')
    add('-n_best', type=int, default=1,
        help="""If verbose is set, will output the n_best decoded sentences""")
    add('-no_cuda', action='store_true')
    add('-prune', action='store_true')
    add('-prune_alpha', type=float, default=0.1)
    add('-load_mask', type=str, default=None)

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Vocabulary bundle and the settings used during preprocessing.
    bundle = torch.load(opt.vocab)
    settings = bundle['settings']

    refs = read_instances_from_file(
        opt.target, settings.max_word_seq_len, settings.keep_case)
    src_words = read_instances_from_file(
        opt.src, settings.max_word_seq_len, settings.keep_case)
    src_ids = convert_instance_to_idx_seq(src_words, bundle['dict']['src'])

    test_loader = torch.utils.data.DataLoader(
        TranslationDataset(
            src_word2idx=bundle['dict']['src'],
            tgt_word2idx=bundle['dict']['tgt'],
            src_insts=src_ids,
        ),
        num_workers=2,
        batch_size=opt.batch_size,
        collate_fn=collate_fn)

    translator = Translator(opt)

    preds, preds_text = [], []
    for batch in tqdm(test_loader, mininterval=2, desc=' - (Test)',
                      leave=False):
        hypotheses, _scores = translator.translate_batch(*batch)
        for candidates in hypotheses:
            for token_ids in candidates:
                text = ' '.join(
                    [test_loader.dataset.tgt_idx2word[i] for i in token_ids])
                # Keep only what precedes the first end-of-sentence tag.
                text = text.split("</s>")[0].strip()
                text = text.replace("▁", " ")
                preds_text.append(text.strip())
                preds.append(
                    [test_loader.dataset.tgt_idx2word[i] for i in token_ids])

    with open(opt.output, 'w') as out_file:
        out_file.write('\n'.join(preds_text))

    from evaluator import BLEUEvaluator

    # Score only as many pairs as both sides provide.
    n_pairs = min(len(preds), len(refs))
    print(BLEUEvaluator().evaluate(refs[:n_pairs], preds[:n_pairs]))
def main():
    '''Decode a source file with a trained RNN ``Seq2seq`` model.

    The encoder/decoder are rebuilt from the settings stored in the
    checkpoint, the saved weights are loaded, and one predicted line is
    written to the output file per source sequence.
    '''
    parser = argparse.ArgumentParser(description='sum_file.py')
    add = parser.add_argument
    add('-model', required=True, help='Path to model .pt file')
    add('-src', required=True,
        help='Source sequence to decode (one line per sequence)')
    add('-vocab', required=True,
        help='Source sequence to decode (one line per sequence)')
    add('-output', default='pred.txt',
        help="""Path to output the predictions (each line will be the decoded sequence""")
    add('-beam_size', type=int, default=5, help='Beam size')
    add('-batch_size', type=int, default=30, help='Batch size')
    add('-n_best', type=int, default=1,
        help="""If verbose is set, will output the n_best decoded sentences""")
    add('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Source text -> index sequences, using the saved vocabulary.
    bundle = torch.load(opt.vocab)
    settings = bundle['settings']
    src_words = read_instances_from_file(
        opt.src,
        settings.max_word_seq_len,
        settings.keep_case,
        settings.mode)
    src_ids = convert_instance_to_idx_seq(src_words, bundle['dict']['src'])

    # Rebuild the network from the hyper-parameters in the checkpoint.
    if opt.cuda:
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    checkpoint = torch.load(opt.model)
    model_opt = checkpoint['settings']
    model_opt.bidirectional = True

    encoder = EncoderRNN(
        model_opt.src_vocab_size,
        model_opt.max_token_seq_len,
        model_opt.d_model,
        bidirectional=model_opt.bidirectional,
        variable_lengths=True)

    # The decoder width doubles when the encoder is bidirectional.
    if model_opt.bidirectional:
        decoder_width = model_opt.d_model * 2
    else:
        decoder_width = model_opt.d_model
    # NOTE(review): eos_id receives BOS and sos_id receives EOS, which
    # looks swapped - confirm against the training script before changing.
    decoder = DecoderRNN(
        model_opt.tgt_vocab_size,
        model_opt.max_token_seq_len,
        decoder_width,
        n_layers=model_opt.n_layer,
        dropout_p=model_opt.dropout,
        use_attention=True,
        bidirectional=model_opt.bidirectional,
        eos_id=Constants.BOS,
        sos_id=Constants.EOS)

    model = Seq2seq(encoder, decoder).to(device)
    # Wrapped in DataParallel because training used it.
    model = nn.DataParallel(model)
    model.load_state_dict(checkpoint['model'])
    print('[Info] Trained model state loaded.')

    predictor = Predictor(model, bundle['dict']['tgt'])

    with open(opt.output, 'w') as out_file:
        for src_seq in tqdm(src_ids, mininterval=2, desc=' - (Test)',
                            leave=False):
            out_file.write(' '.join(predictor.predict(src_seq)) + '\n')
    print('[Info] Finished.')
def main():
    '''Translate the file given by ``-src`` and write the decoded lines.

    The vocabulary bundle given by ``-vocab`` supplies both word-index
    mappings and the preprocessing settings; every hypothesis produced
    by ``Translator.translate_batch`` becomes one line of ``-output``.
    '''
    parser = argparse.ArgumentParser(description='translate.py')
    parser.add_argument(
        '-model', required=True, help='Path to model .pt file')
    parser.add_argument(
        '-src', required=True,
        help='Source sequence to decode (one line per sequence)')
    parser.add_argument(
        '-vocab', required=True,
        help='Source sequence to decode (one line per sequence)')
    parser.add_argument(
        '-output', default='pred.txt',
        help="""Path to output the predictions (each line will be the decoded sequence""")
    parser.add_argument(
        '-beam_size', type=int, default=5, help='Beam size')
    parser.add_argument(
        '-batch_size', type=int, default=30, help='Batch size')
    parser.add_argument(
        '-n_best', type=int, default=1,
        help="""If verbose is set, will output the n_best decoded sentences""")
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Prepare the test loader from the preprocessed vocabulary.
    saved = torch.load(opt.vocab)
    saved_settings = saved['settings']
    words = read_instances_from_file(
        opt.src, saved_settings.max_word_seq_len, saved_settings.keep_case)
    indices = convert_instance_to_idx_seq(words, saved['dict']['src'])

    dataset = TranslationDataset(
        src_word2idx=saved['dict']['src'],
        tgt_word2idx=saved['dict']['tgt'],
        src_insts=indices)
    test_loader = torch.utils.data.DataLoader(
        dataset,
        num_workers=2,
        batch_size=opt.batch_size,
        collate_fn=collate_fn)

    translator = Translator(opt)

    with open(opt.output, 'w') as out_file:
        for batch in tqdm(test_loader, mininterval=2, desc=' - (Test)',
                          leave=False):
            hyps, _scores = translator.translate_batch(*batch)
            for candidates in hyps:
                for ids in candidates:
                    line = ' '.join(
                        [test_loader.dataset.tgt_idx2word[idx] for idx in ids])
                    print(line, file=out_file)
    print('[Info] Finished.')
def _read_parallel_instances(src_path, tgt_path, max_word_seq_len,
                             keep_case, label):
    """Read an aligned source/target file pair and drop unusable pairs.

    Args:
        src_path: Path of the source-side text file.
        tgt_path: Path of the target-side text file.
        max_word_seq_len: Forwarded to ``prepro.read_instances_from_file``.
        keep_case: Forwarded to ``prepro.read_instances_from_file``.
        label: Word used in the mismatch warning ('training'/'validation').

    Returns:
        A ``(src_insts, tgt_insts)`` pair of equally long tuples in which
        no instance is empty; two empty tuples when nothing survives.
    """
    src_insts = prepro.read_instances_from_file(
        src_path, max_word_seq_len, keep_case)
    tgt_insts = prepro.read_instances_from_file(
        tgt_path, max_word_seq_len, keep_case)

    if len(src_insts) != len(tgt_insts):
        print('[Warning] The {} instance count is not equal.'.format(label))
        min_inst_count = min(len(src_insts), len(tgt_insts))
        src_insts = src_insts[:min_inst_count]
        tgt_insts = tgt_insts[:min_inst_count]

    # Remove pairs where either side is empty.
    pairs = [(s, t) for s, t in zip(src_insts, tgt_insts) if s and t]
    if not pairs:
        # zip(*[]) yields nothing to unpack; return empty sides instead.
        return (), ()
    kept_src, kept_tgt = zip(*pairs)
    return kept_src, kept_tgt


def prep(train_src, train_tgt, valid_src, valid_tgt, save_data,
         max_word_seq_len=50, min_word_count=5, keep_case=True,
         share_vocab=True, vocab=None):
    """Preprocess parallel training/validation text and save it with torch.

    Fixes relative to the previous version, which could not run: the
    training instances were read into ``src_word_insts``/``tgt_word_insts``
    but used as ``train_src_word_insts``/``train_tgt_word_insts``, and the
    validation files were never read although ``valid_*_word_insts`` were
    used further down.

    Args:
        train_src: Path of the training source file.
        train_tgt: Path of the training target file.
        valid_src: Path of the validation source file.
        valid_tgt: Path of the validation target file.
        save_data: Path the processed data is written to.
        max_word_seq_len: Maximum number of words kept per sentence.
        min_word_count: Forwarded to ``prepro.build_vocab_idx``.
        keep_case: Forwarded to ``prepro.read_instances_from_file``.
        share_vocab: Build one vocabulary shared by source and target.
        vocab: Optional path of a saved file whose ``'dict'`` entry is
            used instead of building new vocabularies.
    """
    max_token_seq_len = max_word_seq_len + 2  # include the <s> and </s>

    # Training set
    train_src_word_insts, train_tgt_word_insts = _read_parallel_instances(
        train_src, train_tgt, max_word_seq_len, keep_case, 'training')

    # Validation set
    valid_src_word_insts, valid_tgt_word_insts = _read_parallel_instances(
        valid_src, valid_tgt, max_word_seq_len, keep_case, 'validation')

    # Build vocabulary
    if vocab:
        predefined_data = torch.load(vocab)
        assert 'dict' in predefined_data

        print('[Info] Pre-defined vocabulary found.')
        src_word2idx = predefined_data['dict']['src']
        tgt_word2idx = predefined_data['dict']['tgt']
    elif share_vocab:
        print('[Info] Build shared vocabulary for source and target.')
        word2idx = prepro.build_vocab_idx(
            train_src_word_insts + train_tgt_word_insts, min_word_count)
        src_word2idx = tgt_word2idx = word2idx
    else:
        print('[Info] Build vocabulary for source.')
        src_word2idx = prepro.build_vocab_idx(
            train_src_word_insts, min_word_count)
        print('[Info] Build vocabulary for target.')
        tgt_word2idx = prepro.build_vocab_idx(
            train_tgt_word_insts, min_word_count)

    # word to index
    print('[Info] Convert source word instances into sequences of word index.')
    train_src_insts = prepro.convert_instance_to_idx_seq(
        train_src_word_insts, src_word2idx)
    valid_src_insts = prepro.convert_instance_to_idx_seq(
        valid_src_word_insts, src_word2idx)

    print('[Info] Convert target word instances into sequences of word index.')
    train_tgt_insts = prepro.convert_instance_to_idx_seq(
        train_tgt_word_insts, tgt_word2idx)
    valid_tgt_insts = prepro.convert_instance_to_idx_seq(
        valid_tgt_word_insts, tgt_word2idx)

    # NOTE(review): 'settings' holds only the max token length here, not
    # a settings object - confirm that consumers of the file expect this.
    data = {
        'settings': max_token_seq_len,
        'dict': {
            'src': src_word2idx,
            'tgt': tgt_word2idx
        },
        'train': {
            'src': train_src_insts,
            'tgt': train_tgt_insts
        },
        'valid': {
            'src': valid_src_insts,
            'tgt': valid_tgt_insts
        }
    }

    print('[Info] Dumping the processed data to pickle file', save_data)
    torch.save(data, save_data)
    print('[Info] Finish.')
def main():
    '''Debug variant of the translation entry point.

    This model translates from English to German. The command-line
    options are parsed but the device, model, source and vocabulary
    paths are then overridden with hard-coded values. Both vocabularies
    are dumped to the files '55' and '66' for inspection, and every
    decoded index sequence is printed before being written as text.
    '''
    parser = argparse.ArgumentParser(description='translate.py')
    add = parser.add_argument
    add('-model', required=False, help='Path to model .pt file')
    add('-src', required=False,
        help='Source sequence to decode (one line per sequence)')
    add('-vocab', required=False,
        help='Source sequence to decode (one line per sequence)')
    add('-output', default='2',
        help="""Path to output the predictions (each line will be the decoded sequence""")
    add('-beam_size', type=int, default=5, help='Beam size')
    add('-batch_size', type=int, default=30, help='Batch size')
    add('-n_best', type=int, default=1,
        help="""If verbose is set, will output the n_best decoded sentences""")
    add('-no_cuda', action='store_true')
    # Example: -vocab data/multi30k.atok.low.pt

    opt = parser.parse_args()

    # Hard-coded overrides: the values given on the command line for
    # these options are ignored, and CUDA is always disabled.
    opt.cuda = False
    opt.model = 'trained.chkpt'
    opt.src = '1'
    opt.vocab = 'multi30k.atok.low.pt'

    # Load the vocabulary bundle and dump both dictionaries to disk.
    bundle = torch.load(opt.vocab)
    src_vocab = bundle['dict']['src']
    tgt_vocab = bundle['dict']['tgt']
    with open('55', 'w') as dump:
        dump.write(str(src_vocab))
    with open('66', 'w', encoding='utf-8') as dump:
        dump.write(str(tgt_vocab))

    settings = bundle['settings']
    src_words = read_instances_from_file(
        opt.src, settings.max_word_seq_len, settings.keep_case)
    src_ids = convert_instance_to_idx_seq(src_words, bundle['dict']['src'])

    test_loader = torch.utils.data.DataLoader(
        TranslationDataset(
            src_word2idx=bundle['dict']['src'],
            tgt_word2idx=bundle['dict']['tgt'],
            src_insts=src_ids),
        num_workers=2,
        batch_size=opt.batch_size,
        collate_fn=collate_fn)

    translator = Translator(opt)

    with open(opt.output, 'w') as out_file:
        for batch in test_loader:
            hyps, _scores = translator.translate_batch(*batch)
            for candidates in hyps:
                for token_ids in candidates:
                    print(token_ids)
                    # Convert the ids back to text.
                    words = [test_loader.dataset.tgt_idx2word[idx]
                             for idx in token_ids]
                    out_file.write(' '.join(words) + '\n')
    print('[Info] Finished.')
parser.add_argument('-batch_size', type=int, default=30, help='Batch size') parser.add_argument('-n_best', type=int, default=1, help="""If verbose is set, will output the n_best decoded sentences""") parser.add_argument('-no_cuda', action='store_true') opt = parser.parse_args() opt.cuda = not opt.no_cuda # Prepare DataLoader preprocess_data = torch.load(opt.vocab) preprocess_settings = preprocess_data['settings'] test_src_word_insts = read_instances_from_file( opt.src, preprocess_settings.max_word_seq_len, preprocess_settings.keep_case) test_src_insts = convert_instance_to_idx_seq( test_src_word_insts, preprocess_data['dict']['src']) test_loader = torch.utils.data.DataLoader(TranslationDataset( src_word2idx=preprocess_data['dict']['src'], tgt_word2idx=preprocess_data['dict']['tgt'], src_insts=test_src_insts), num_workers=2, batch_size=opt.batch_size, collate_fn=collate_fn) translator = Translator(opt) with open(opt.output, 'w') as f:
def main():
    '''Translate image/source-text pairs and write one decoded line each.

    Images listed by ``-img`` are loaded from ``-image_dir`` through a
    torchvision transform pipeline and decoded together with the source
    sentences, one pair per batch. When the translator returns ``None``
    for a batch, the literal line "None" is written instead.

    The output file is now managed by a ``with`` block so it is closed
    even when decoding raises part-way through.
    '''
    parser = argparse.ArgumentParser(description='translate.py')
    parser.add_argument('-image_dir', required=True, help='image directory')
    parser.add_argument('-model', required=True,
                        help='Path to model .pt file')
    parser.add_argument('-img', required=True,
                        help='Source image to decode (one line per sequence)')
    parser.add_argument(
        '-src', required=True,
        help='Source sequence to decode (one line per sequence)')
    parser.add_argument(
        '-vocab', required=True,
        help='Source sequence to decode (one line per sequence)')
    parser.add_argument('-output', default='pred.txt',
                        help="""Path to output the predictions (each line will be the decoded sequence""")
    parser.add_argument('-beam_size', type=int, default=5, help='Beam size')
    parser.add_argument('-batch_size', type=int, default=30,
                        help='Batch size')
    parser.add_argument('-n_best', type=int, default=1,
                        help="""If verbose is set, will output the n_best decoded sentences""")
    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-crop_size', type=int, default=224,
                        help='size for randomly cropping images')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Prepare DataLoader
    preprocess_data = torch.load(opt.vocab)
    preprocess_settings = preprocess_data['settings']
    test_img_insts = read_instances_from_file(
        opt.img, preprocess_settings.max_word_seq_len)
    test_src_word_insts = read_instances_from_file(
        opt.src, preprocess_settings.max_word_seq_len)
    test_src_insts = convert_instance_to_idx_seq(
        test_src_word_insts, preprocess_data['dict']['src'])

    # Image preprocessing.
    # NOTE(review): the random crop and flip make decoding
    # non-deterministic between runs - confirm this is intended at
    # inference time.
    transform = transforms.Compose([
        transforms.RandomResizedCrop(opt.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # -batch_size is parsed but the loader always uses one sample per batch.
    test_data = DataLoader(transform,
                           opt.image_dir,
                           preprocess_data['dict']['src'],
                           preprocess_data['dict']['tgt'],
                           image_insts=test_img_insts,
                           src_insts=test_src_insts,
                           cuda=opt.cuda,
                           shuffle=False,
                           batch_size=1)

    translator = Translator(opt)
    translator.model.eval()

    # Index -> word lookup for the target vocabulary.
    inv_map = {v: k for k, v in preprocess_data['dict']['tgt'].items()}

    with open(opt.output, "wb") as target:
        for batch in tqdm(test_data, mininterval=2, desc=' - (Test)',
                          leave=False):
            seq = translator.translate_batch(batch)
            if seq is None:
                target.write("None\n".encode("utf-8"))
                continue
            # Drop the first and last index before mapping to words.
            seq = seq[1:-1]
            line = " ".join([inv_map[val] for val in seq]) + "\n"
            target.write(line.encode("utf-8"))
    print('[Info] Finished.')
def main():
    """Translate a source file, optionally with a context file.

    Every option has a default, so the script can run without
    arguments. When ``-ctx`` is given, the context file is read with
    the reader from ``preprocess_ctx`` and passed to the data loader.
    A trailing index 3 (end of sentence) is dropped from each
    hypothesis before it is written as text.

    Fixes relative to the previous version:
      * ``-no_cuda`` used ``action='store_false'``, so CUDA was disabled
        by default and *enabled* by passing ``-no_cuda``; it now uses
        ``store_true`` and behaves as the flag name says;
      * an empty hypothesis no longer raises ``IndexError``;
      * leftover debug prints inside the decoding loop were removed.
    """
    parser = argparse.ArgumentParser(description='translate.py')
    parser.add_argument('-model', default='trained.chkpt',
                        help='Path to model .pt file')
    parser.add_argument(
        '-src', default='data/multi30k/test.en.atok',
        help='Source sequence to decode (one line per sequence)')
    parser.add_argument(
        '-ctx', required=False, default="",
        help='Context sequence to decode (one line per sequence)')
    parser.add_argument('-vocab', default='data/multi30k.atok.low.pt',
                        help='Data that contains the source vocabulary')
    parser.add_argument('-output', default='pred.txt',
                        help="""Path to output the predictions (each line will be the decoded sequence""")
    parser.add_argument('-beam_size', type=int, default=5, help='Beam size')
    parser.add_argument('-batch_size', type=int, default=30,
                        help='Batch size')
    parser.add_argument('-n_best', type=int, default=1,
                        help="""If verbose is set, will output the n_best decoded sentences""")
    # store_true: no_cuda is False unless the flag is passed.
    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-max_token_seq_len', type=int, default=100)

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Prepare DataLoader
    preprocess_data = torch.load(opt.vocab)
    preprocess_settings = preprocess_data['settings']
    test_src_word_insts = read_instances_from_file(
        opt.src, opt.max_token_seq_len, preprocess_settings.keep_case)
    test_src_insts = convert_instance_to_idx_seq(
        test_src_word_insts, preprocess_data['dict']['src'])

    test_ctx_insts = None
    if opt.ctx:
        from preprocess_ctx import read_instances_from_file as read_instances_from_file_ctx
        test_ctx_word_insts = read_instances_from_file_ctx(
            opt.ctx, opt.max_token_seq_len, preprocess_settings.keep_case,
            is_ctx=True)
        # Context is encoded with the source vocabulary.
        test_ctx_insts = convert_instance_to_idx_seq(
            test_ctx_word_insts, preprocess_data['dict']['src'])

    test_data = DataLoader(preprocess_data['dict']['src'],
                           preprocess_data['dict']['tgt'],
                           src_insts=test_src_insts,
                           ctx_insts=test_ctx_insts,
                           cuda=opt.cuda,
                           shuffle=False,
                           batch_size=opt.batch_size,
                           is_train=False)

    translator = Translator(opt)
    translator.model.eval()

    with open(opt.output, 'w') as f:
        for batch in tqdm(test_data, mininterval=2, desc=' - (Test)',
                          leave=False):
            all_hyp, all_scores = translator.translate_batch(*batch)
            for idx_seqs in all_hyp:
                for idx_seq in idx_seqs:
                    # Drop the final token if it is EOS (index 3).
                    if len(idx_seq) > 0 and idx_seq[-1] == 3:
                        idx_seq = idx_seq[:-1]
                    pred_line = ' '.join(
                        [test_data.tgt_idx2word[int(idx)] for idx in idx_seq])
                    f.write(pred_line + '\n')
    print('[Info] Finished.')