def main():
    '''Decode a source file with a trained model and write the predictions.

    Options are parsed from the command line. The file given by ``-vocab`` is
    loaded with ``torch.load`` and must contain a ``'settings'`` entry (with
    ``max_word_seq_len`` and ``keep_case``) and a ``'dict'`` entry holding the
    ``'src'`` and ``'tgt'`` vocabularies. Every hypothesis returned by the
    translator is written to ``-output`` as one space-separated line.
    '''

    parser = argparse.ArgumentParser(description='translate.py')

    parser.add_argument('-model', required=True, help='Path to model .pt file')
    parser.add_argument(
        '-src',
        required=True,
        help='Source sequence to decode (one line per sequence)')
    parser.add_argument(
        '-vocab',
        required=True,
        help='Path to the preprocessed data file providing the vocabulary')
    parser.add_argument('-output',
                        default='pred.txt',
                        help="""Path to output the predictions (each line will
                        be the decoded sequence)""")
    parser.add_argument('-beam_size', type=int, default=5, help='Beam size')
    parser.add_argument('-batch_size', type=int, default=30, help='Batch size')
    parser.add_argument('-n_best',
                        type=int,
                        default=1,
                        help="""If verbose is set, will output the n_best
                        decoded sentences""")
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    # Downstream code expects a positive ``cuda`` flag rather than ``no_cuda``.
    opt.cuda = not opt.no_cuda

    # Prepare DataLoader
    preprocess_data = torch.load(opt.vocab)
    preprocess_settings = preprocess_data['settings']
    src_word2idx = preprocess_data['dict']['src']
    tgt_word2idx = preprocess_data['dict']['tgt']

    test_src_word_insts = read_instances_from_file(
        opt.src, preprocess_settings.max_word_seq_len,
        preprocess_settings.keep_case)
    test_src_insts = convert_instance_to_idx_seq(test_src_word_insts,
                                                 src_word2idx)
    test_data = DataLoader(src_word2idx,
                           tgt_word2idx,
                           src_insts=test_src_insts,
                           cuda=opt.cuda,
                           shuffle=False,
                           batch_size=opt.batch_size)

    translator = Translator(opt)
    translator.model.eval()

    with open(opt.output, 'w') as f:
        for batch in tqdm(test_data,
                          mininterval=2,
                          desc='  - (Test)',
                          leave=False):
            # The scores come back with the hypotheses but are not written.
            all_hyp, _ = translator.translate_batch(batch)
            for idx_seqs in all_hyp:
                for idx_seq in idx_seqs:
                    pred_line = ' '.join(
                        [test_data.tgt_idx2word[idx] for idx in idx_seq])
                    f.write(pred_line + '\n')
    print('[Info] Finished.')
Ejemplo n.º 2
0
def main():
    '''Decode a source file with a trained model and write the predictions.

    Options are parsed from the command line. The file given by ``-vocab`` is
    loaded with ``torch.load`` and supplies the preprocessing settings and
    the ``'src'``/``'tgt'`` vocabularies. Source instances are batched through
    a ``torch.utils.data.DataLoader`` and every returned hypothesis is written
    to ``-output`` as one space-separated line.
    '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-model', required=True, help='Path to model .pt file')
    parser.add_argument(
        '-src',
        required=True,
        help='Source sequence to decode (one line per sequence)')
    parser.add_argument(
        '-vocab',
        required=True,
        help='Path to the preprocessed data file providing the vocabulary')
    parser.add_argument('-output',
                        default='pred.txt',
                        help="""Path to output the predictions (each line will
                            be the decoded sequence)""")
    parser.add_argument('-beam_size', type=int, default=5, help='Beam size')
    parser.add_argument('-batch_size', type=int, default=30, help='Batch size')
    parser.add_argument('-n_best',
                        type=int,
                        default=1,
                        help="""If verbose is set, will output the n_best
                            decoded sentences""")
    # store_true: the value becomes True when the flag is present.
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Prepare DataLoader
    preprocess_data = torch.load(opt.vocab)
    preprocess_settings = preprocess_data['settings']
    test_src_word_insts = read_instances_from_file(
        opt.src, preprocess_settings.max_word_seq_len,
        preprocess_settings.keep_case)
    test_src_insts = convert_instance_to_idx_seq(
        test_src_word_insts, preprocess_data['dict']['src'])

    test_dataset = TranslationDataset(
        src_word2idx=preprocess_data['dict']['src'],
        tgt_word2idx=preprocess_data['dict']['tgt'],
        src_insts=test_src_insts)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              num_workers=2,
                                              batch_size=opt.batch_size,
                                              collate_fn=collate_fn)
    translator = Translator(opt)

    with open(opt.output, 'w') as f:
        for batch in tqdm(test_loader,
                          mininterval=2,
                          desc='  - (Test)',
                          leave=False):
            # The scores come back with the hypotheses but are not written.
            all_hyp, _ = translator.translate_batch(*batch)
            for hyp_stream in all_hyp:
                for hyp in hyp_stream:
                    pred_sent = ' '.join(
                        [test_loader.dataset.tgt_idx2word[idx] for idx in hyp])
                    f.write(pred_sent + '\n')
    print('[Info] Finished')
def main():
    '''Decode a source file with a trained model and write the predictions.

    Options are parsed from the command line. The ``-vocab`` file is loaded
    with ``torch.load``; its ``'settings'`` entry controls how the source file
    is read and its ``'dict'`` entry provides the source and target
    vocabularies. Each hypothesis returned by the translator becomes one
    space-separated line in the ``-output`` file.
    '''

    parser = argparse.ArgumentParser(description='translate.py')

    parser.add_argument('-model', required=True,
                        help='Path to model .pt file')
    parser.add_argument('-src', required=True,
                        help='Source sequence to decode (one line per sequence)')
    parser.add_argument('-vocab', required=True,
                        help='Path to the preprocessed data file providing the vocabulary')
    parser.add_argument('-output', default='pred.txt',
                        help="""Path to output the predictions (each line will
                        be the decoded sequence)""")
    parser.add_argument('-beam_size', type=int, default=5,
                        help='Beam size')
    parser.add_argument('-batch_size', type=int, default=30,
                        help='Batch size')
    parser.add_argument('-n_best', type=int, default=1,
                        help="""If verbose is set, will output the n_best
                        decoded sentences""")
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Prepare DataLoader
    preprocess_data = torch.load(opt.vocab)
    preprocess_settings = preprocess_data['settings']
    vocab = preprocess_data['dict']

    test_src_word_insts = read_instances_from_file(
        opt.src,
        preprocess_settings.max_word_seq_len,
        preprocess_settings.keep_case)
    test_src_insts = convert_instance_to_idx_seq(
        test_src_word_insts, vocab['src'])
    test_data = DataLoader(
        vocab['src'],
        vocab['tgt'],
        src_insts=test_src_insts,
        cuda=opt.cuda,
        shuffle=False,
        batch_size=opt.batch_size)

    translator = Translator(opt)
    translator.model.eval()

    with open(opt.output, 'w') as f:
        for batch in tqdm(test_data, mininterval=2, desc='  - (Test)', leave=False):
            # Only the hypotheses are written; the scores are discarded.
            all_hyp, _ = translator.translate_batch(batch)
            for idx_seqs in all_hyp:
                for idx_seq in idx_seqs:
                    pred_line = ' '.join([test_data.tgt_idx2word[idx] for idx in idx_seq])
                    f.write(pred_line + '\n')
    print('[Info] Finished.')
Ejemplo n.º 4
0
def main():
    '''Decode a source file with ``Translator_idbs`` and write the predictions.

    Options are parsed from the command line, including three ``-lambda_*``
    diversity factors that are passed to the translator through ``opt``. The
    ``-vocab`` file supplies the preprocessing settings and the vocabularies.
    Each sequence returned by ``translate_batch`` is written to ``-output`` as
    one space-separated line and flushed immediately.
    '''

    parser = argparse.ArgumentParser(description='translate.py')

    parser.add_argument('-model', required=True, help='Path to model .pt file')
    parser.add_argument(
        '-src',
        required=True,
        help='Source sequence to decode (one line per sequence)')
    parser.add_argument('-vocab',
                        required=True,
                        help='preprocess file to provide vocabulary')
    parser.add_argument('-output',
                        default='pred.txt',
                        help="""Path to output the predictions (each line will
                        be the decoded sequence""")
    parser.add_argument('-beam_size', type=int, default=5, help='Beam size')
    parser.add_argument('-batch_size', type=int, default=30, help='Batch size')
    parser.add_argument('-lambda_1',
                        type=float,
                        default=2 / 3,
                        help='diversity factor for hamming diversity')
    parser.add_argument('-lambda_2',
                        type=float,
                        default=2 / 3,
                        help='diversity factor for bi-gram diversity')
    parser.add_argument('-lambda_3',
                        type=float,
                        default=2 / 3,
                        help='diversity factor for tri-gram diversity')
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Prepare DataLoader
    preprocess_data = torch.load(opt.vocab)
    preprocess_settings = preprocess_data['settings']

    test_src_word_insts = read_instances_from_file(
        opt.src, preprocess_settings.max_word_seq_len,
        preprocess_settings.keep_case)

    test_src_insts = convert_instance_to_idx_seq(
        test_src_word_insts, preprocess_data['dict']['src'])

    test_data = DataLoader(preprocess_data['dict']['src'],
                           preprocess_data['dict']['tgt'],
                           src_insts=test_src_insts,
                           cuda=opt.cuda,
                           shuffle=False,
                           batch_size=opt.batch_size)

    translator = Translator_idbs(opt)
    translator.model.eval()

    print('[Info] Start translating...')
    # The context manager closes the file even if decoding raises part-way.
    with open(opt.output, 'w') as f:
        for batch in tqdm(test_data,
                          mininterval=2,
                          desc='  - (Test)',
                          leave=False):
            all_hyp = translator.translate_batch(batch)
            for idx_seq in all_hyp:
                # Map the indices back to words and join them into one line.
                pred_line = ' '.join(
                    [test_data.tgt_idx2word[idx] for idx in idx_seq])
                f.write(pred_line + '\n')
                # Flush per line so partial results are visible on disk.
                f.flush()
    print('[Info] Finished.')
def main():
    '''Decode the test split of an Nlp4plp corpus, score it and dump predictions.

    Options are parsed from the command line. The corpus is loaded from
    ``<data_dir>test``, labels are derived according to ``--label-type-dec``,
    and the source side is decoded with ``Translator``. Predictions and gold
    labels are post-processed with ``final_repl``, compared with
    ``accuracy_score``, and written as ``<id>.pl_p`` / ``<id>.pl_t`` files
    into a fresh timestamped directory below ``-dir_out``.

    Raises:
        ValueError: if ``--convert-consts`` is given an unsupported value.
    '''

    parser = argparse.ArgumentParser(description='translate.py')

    parser.add_argument('-model', required=True, help='Path to model .pt file')
    parser.add_argument('-data_dir', required=True)
    parser.add_argument('-debug', action='store_true')
    parser.add_argument('-dir_out', default="/home/suster/Apps/out/")
    parser.add_argument(
        "--convert-consts",
        type=str,
        help="conv | our-map | no-our-map | no. \n/"
        "conv-> txt: -; stats: num_sym+ent_sym.\n/"
        "our-map-> txt: num_sym; stats: num_sym(from map)+ent_sym;\n/"
        "no-our-map-> txt: -; stats: num_sym(from map)+ent_sym;\n/"
        "no-> txt: -; stats: -, only ent_sym;\n/"
        "no-ent-> txt: -; stats: -, no ent_sym;\n/")
    parser.add_argument(
        "--label-type-dec",
        type=str,
        default="full-pl",
        help=
        "predicates | predicates-all | predicates-arguments-all | full-pl | full-pl-no-arg-id | full-pl-split | full-pl-split-plc | full-pl-split-stat-dyn. To use with EncDec."
    )
    parser.add_argument('-vocab', required=True)
    parser.add_argument('-beam_size', type=int, default=5, help='Beam size')
    parser.add_argument('-batch_size', type=int, default=30, help='Batch size')
    parser.add_argument('-n_best',
                        type=int,
                        default=1,
                        help="""If verbose is set, will output the n_best
                        decoded sentences""")
    parser.add_argument('-no_cuda', action='store_true')

    args = parser.parse_args()
    args.cuda = not args.no_cuda

    # Prepare DataLoader
    preprocess_data = torch.load(args.vocab)
    preprocess_settings = preprocess_data['settings']

    # The chosen constant-conversion mode must match the kind of data
    # directory: only "conv" works on data without pre-mapped numbers.
    if args.convert_consts in {"conv"}:
        assert "nums_mapped" not in args.data_dir
    elif args.convert_consts in {"our-map", "no-our-map", "no", "no-ent"}:
        assert "nums_mapped" in args.data_dir
    elif args.convert_consts is not None:
        raise ValueError(
            "unsupported --convert-consts value: {!r}".format(
                args.convert_consts))
    # data_dir is concatenated as-is, so it must end with a path separator.
    test_corp = Nlp4plpCorpus(args.data_dir + "test", args.convert_consts)

    if args.debug:
        # Keep debug runs short.
        test_corp.insts = test_corp.insts[:10]
    test_corp.get_labels(label_type=args.label_type_dec)
    test_corp.remove_none_labels()

    # Test set
    test_src_word_insts, test_src_id_insts = prepare_instances(test_corp.insts)
    test_tgt_word_insts, test_tgt_id_insts = prepare_instances(test_corp.insts,
                                                               label=True)
    assert test_src_id_insts == test_tgt_id_insts
    test_src_insts = convert_instance_to_idx_seq(
        test_src_word_insts, preprocess_data['dict']['src'])

    test_loader = torch.utils.data.DataLoader(TranslationDataset(
        src_word2idx=preprocess_data['dict']['src'],
        tgt_word2idx=preprocess_data['dict']['tgt'],
        src_insts=test_src_insts),
                                              num_workers=0,
                                              batch_size=args.batch_size,
                                              collate_fn=collate_fn)

    translator = Translator(args)

    # Running index into the test instances. It advances once per written
    # hypothesis, so the gold alignment below presumably relies on exactly
    # one hypothesis per source instance (n_best == 1) -- TODO confirm.
    i = 0
    preds = []
    golds = []

    for batch in tqdm(test_loader,
                      mininterval=2,
                      desc='  - (Test)',
                      leave=False):
        all_hyp, _ = translator.translate_batch(*batch)
        for idx_seqs in all_hyp:
            for idx_seq in idx_seqs:
                pred = [
                    test_loader.dataset.tgt_idx2word[idx] for idx in idx_seq
                    if test_loader.dataset.tgt_idx2word[idx] != "</s>"
                ]
                gold = [
                    w for w in test_tgt_word_insts[i]
                    if w not in {"<s>", "</s>"}
                ]
                if args.convert_consts == "no":
                    num2n = None
                else:
                    inst_id = test_src_id_insts[i]
                    assert test_corp.insts[i].id == inst_id
                    num2n = test_corp.insts[i].num2n_map
                pred = final_repl(pred, num2n)
                gold = final_repl(gold, num2n)
                preds.append(pred)
                golds.append(gold)
                i += 1
    acc = accuracy_score(golds, preds)
    print(f"Accuracy: {acc:.3f}")
    print("Saving predictions from the best model:")

    assert len(test_src_id_insts) == len(test_src_word_insts) == len(
        preds) == len(golds)
    # A microsecond-resolution timestamp gives every run its own directory.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    dir_out = f"{args.dir_out}log_w{timestamp}/"
    print(f"Save preds dir: {dir_out}")
    os.makedirs(dir_out, exist_ok=True)
    for (inst_id, gold, pred) in zip(test_src_id_insts, golds, preds):
        # basename() guards against ids that contain path separators.
        f_name_t = os.path.basename(f"{inst_id}.pl_t")
        f_name_p = os.path.basename(f"{inst_id}.pl_p")
        with open(dir_out + f_name_t,
                  "w") as f_out_t, open(dir_out + f_name_p, "w") as f_out_p:
            f_out_t.write(gold)
            f_out_p.write(pred)

    print('[Info] Finished.')
Ejemplo n.º 6
0
def main():
    '''Decode a source file, write the predictions and print a BLEU score.

    Options are parsed from the command line. The ``-vocab`` file supplies
    the preprocessing settings and vocabularies. Every hypothesis is cut at
    the first ``</s>``, has ``▁`` replaced by a space, and is written to
    ``-output`` (newline-separated). The untruncated token lists are scored
    against the references read from ``-target`` with ``BLEUEvaluator``.
    '''

    parser = argparse.ArgumentParser(description='translate.py')

    parser.add_argument('-model', required=True, help='Path to model .pt file')
    parser.add_argument(
        '-src',
        required=True,
        help='Source sequence to decode (one line per sequence)')
    parser.add_argument(
        '-target',
        required=True,
        help='Reference target sequences to score against (one line per sequence)')
    parser.add_argument(
        '-vocab',
        required=True,
        help='Path to the preprocessed data file providing the vocabulary')
    parser.add_argument('-output',
                        default='pred.txt',
                        help="""Path to output the predictions (each line will
                        be the decoded sequence)""")
    parser.add_argument('-beam_size', type=int, default=5, help='Beam size')
    parser.add_argument('-batch_size', type=int, default=30, help='Batch size')
    parser.add_argument('-n_best',
                        type=int,
                        default=1,
                        help="""If verbose is set, will output the n_best
                        decoded sentences""")
    parser.add_argument('-no_cuda', action='store_true')

    parser.add_argument('-prune', action='store_true')
    parser.add_argument('-prune_alpha', type=float, default=0.1)
    parser.add_argument('-load_mask', type=str, default=None)

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Prepare DataLoader
    preprocess_data = torch.load(opt.vocab)
    preprocess_settings = preprocess_data['settings']

    refs = read_instances_from_file(opt.target,
                                    preprocess_settings.max_word_seq_len,
                                    preprocess_settings.keep_case)

    test_src_word_insts = read_instances_from_file(
        opt.src, preprocess_settings.max_word_seq_len,
        preprocess_settings.keep_case)
    test_src_insts = convert_instance_to_idx_seq(
        test_src_word_insts, preprocess_data['dict']['src'])

    test_loader = torch.utils.data.DataLoader(TranslationDataset(
        src_word2idx=preprocess_data['dict']['src'],
        tgt_word2idx=preprocess_data['dict']['tgt'],
        src_insts=test_src_insts,
    ),
                                              num_workers=2,
                                              batch_size=opt.batch_size,
                                              collate_fn=collate_fn)

    translator = Translator(opt)

    preds = []       # token lists, used for scoring
    preds_text = []  # cleaned-up strings, written to the output file

    for batch in tqdm(test_loader,
                      mininterval=2,
                      desc='  - (Test)',
                      leave=False):
        all_hyp, _ = translator.translate_batch(*batch)
        for idx_seqs in all_hyp:
            for idx_seq in idx_seqs:
                # Build the token list once and reuse it for both outputs.
                tokens = [
                    test_loader.dataset.tgt_idx2word[idx] for idx in idx_seq
                ]
                sent = ' '.join(tokens)
                # Keep only the text before the first end-of-sentence marker.
                sent = sent.split("</s>")[0].strip()
                sent = sent.replace("▁", " ")
                preds_text.append(sent.strip())
                preds.append(tokens)
    with open(opt.output, 'w') as f:
        f.write('\n'.join(preds_text))

    from evaluator import BLEUEvaluator
    scorer = BLEUEvaluator()
    # Score only the overlapping prefix when the counts differ.
    length = min(len(preds), len(refs))
    score = scorer.evaluate(refs[:length], preds[:length])
    print(score)
Ejemplo n.º 7
0
def main():
    '''Decode a source file with a trained RNN seq2seq model.

    Options are parsed from the command line. The ``-vocab`` file supplies
    the preprocessing settings and vocabularies. The checkpoint given by
    ``-model`` supplies the model hyper-parameters (``'settings'``) and
    weights (``'model'``). Each source instance is passed to
    ``Predictor.predict`` and the result is written to ``-output`` as one
    space-separated line.
    '''

    parser = argparse.ArgumentParser(description='sum_file.py')

    parser.add_argument('-model', required=True,
                        help='Path to model .pt file')
    parser.add_argument('-src', required=True,
                        help='Source sequence to decode (one line per sequence)')
    parser.add_argument('-vocab', required=True,
                        help='Path to the preprocessed data file providing the vocabulary')
    parser.add_argument('-output', default='pred.txt',
                        help="""Path to output the predictions (each line will
                        be the decoded sequence)""")
    parser.add_argument('-beam_size', type=int, default=5,
                        help='Beam size')
    parser.add_argument('-batch_size', type=int, default=30,
                        help='Batch size')
    parser.add_argument('-n_best', type=int, default=1,
                        help="""If verbose is set, will output the n_best
                        decoded sentences""")
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Prepare DataLoader
    preprocess_data = torch.load(opt.vocab)
    preprocess_settings = preprocess_data['settings']
    test_src_word_insts = read_instances_from_file(
        opt.src,
        preprocess_settings.max_word_seq_len,
        preprocess_settings.keep_case,
        preprocess_settings.mode)
    test_src_insts = convert_instance_to_idx_seq(
        test_src_word_insts, preprocess_data['dict']['src'])

    # prepare model
    device = torch.device('cuda' if opt.cuda else 'cpu')
    # map_location lets a checkpoint saved on a GPU load under -no_cuda.
    checkpoint = torch.load(opt.model, map_location=device)
    model_opt = checkpoint['settings']

    # The encoder is always built bidirectional, whatever the checkpoint says.
    model_opt.bidirectional = True
    # A bidirectional encoder doubles the size the decoder is built with.
    decoder_hidden_size = (model_opt.d_model * 2
                           if model_opt.bidirectional else model_opt.d_model)
    encoder = EncoderRNN(model_opt.src_vocab_size,
                         model_opt.max_token_seq_len,
                         model_opt.d_model,
                         bidirectional=model_opt.bidirectional,
                         variable_lengths=True)
    # NOTE(review): eos_id receives Constants.BOS and sos_id receives
    # Constants.EOS, which looks swapped. It is left unchanged because it
    # must match whatever the training script used -- confirm there.
    decoder = DecoderRNN(model_opt.tgt_vocab_size,
                         model_opt.max_token_seq_len,
                         decoder_hidden_size,
                         n_layers=model_opt.n_layer,
                         dropout_p=model_opt.dropout,
                         use_attention=True,
                         bidirectional=model_opt.bidirectional,
                         eos_id=Constants.BOS,
                         sos_id=Constants.EOS)
    model = Seq2seq(encoder, decoder).to(device)
    # Wrapped in DataParallel because training used it, per the original
    # author; presumably this keeps the state-dict keys matching.
    model = nn.DataParallel(model)

    model.load_state_dict(checkpoint['model'])
    print('[Info] Trained model state loaded.')

    predictor = Predictor(model, preprocess_data['dict']['tgt'])

    with open(opt.output, 'w') as f:
        for src_seq in tqdm(test_src_insts, mininterval=2, desc='  - (Test)', leave=False):
            pred_line = ' '.join(predictor.predict(src_seq))
            f.write(pred_line + '\n')
    print('[Info] Finished.')
Ejemplo n.º 8
0
def main():
    '''Decode image/source pairs with a trained model and write the predictions.

    Options are parsed from the command line. The ``-vocab`` file supplies
    the preprocessing settings and vocabularies. Batches of size 1 are passed
    to ``Translator.translate_batch``. Each returned index sequence has its
    first and last element dropped, is mapped back to target words, and is
    written UTF-8 encoded to ``-output``. The literal line ``None`` is
    written when the translator returns ``None``.
    '''

    parser = argparse.ArgumentParser(description='translate.py')

    parser.add_argument('-image_dir', required=True, help='image directory')
    parser.add_argument('-model', required=True, help='Path to model .pt file')
    parser.add_argument('-img',
                        required=True,
                        help='Source image to decode (one line per sequence)')
    parser.add_argument(
        '-src',
        required=True,
        help='Source sequence to decode (one line per sequence)')
    parser.add_argument(
        '-vocab',
        required=True,
        help='Path to the preprocessed data file providing the vocabulary')
    parser.add_argument('-output',
                        default='pred.txt',
                        help="""Path to output the predictions (each line will
                        be the decoded sequence)""")
    parser.add_argument('-beam_size', type=int, default=5, help='Beam size')
    # NOTE(review): -batch_size is parsed but the loader below is built with
    # batch_size=1.
    parser.add_argument('-batch_size', type=int, default=30, help='Batch size')
    parser.add_argument('-n_best',
                        type=int,
                        default=1,
                        help="""If verbose is set, will output the n_best
                        decoded sentences""")
    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-crop_size',
                        type=int,
                        default=224,
                        help='size for randomly cropping images')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Prepare DataLoader
    preprocess_data = torch.load(opt.vocab)
    preprocess_settings = preprocess_data['settings']
    test_img_insts = read_instances_from_file(
        opt.img, preprocess_settings.max_word_seq_len)
    test_src_word_insts = read_instances_from_file(
        opt.src, preprocess_settings.max_word_seq_len)
    test_src_insts = convert_instance_to_idx_seq(
        test_src_word_insts, preprocess_data['dict']['src'])

    # Image Preprocessing
    # NOTE(review): the random crop and random flip make decoding
    # non-deterministic; confirm this test-time augmentation is intended.
    transform = transforms.Compose([
        transforms.RandomResizedCrop(opt.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    test_data = DataLoader(transform,
                           opt.image_dir,
                           preprocess_data['dict']['src'],
                           preprocess_data['dict']['tgt'],
                           image_insts=test_img_insts,
                           src_insts=test_src_insts,
                           cuda=opt.cuda,
                           shuffle=False,
                           batch_size=1)

    translator = Translator(opt)
    translator.model.eval()

    # Reverse the target vocabulary: index -> word.
    inv_map = {v: k for k, v in preprocess_data['dict']['tgt'].items()}

    # The context manager closes the file even if decoding raises part-way.
    with open(opt.output, "wb") as target:
        for batch in tqdm(test_data,
                          mininterval=2,
                          desc='  - (Test)',
                          leave=False):
            seq = translator.translate_batch(batch)
            if seq is None:
                target.write("None\n".encode("utf-8"))
                continue
            # Drop the first and last index (presumably the start/end markers).
            words = [inv_map[val] for val in seq[1:-1]]
            line = " ".join(words) + "\n"
            target.write(line.encode("utf-8"))
    print('[Info] Finished.')
Ejemplo n.º 9
0
def main():
    '''Decode a source file with a trained model and write the predictions.

    Options are parsed from the command line. The ``-vocab`` file is loaded
    with ``torch.load`` and supplies the preprocessing settings and the
    ``'src'``/``'tgt'`` vocabularies. Source instances are batched through a
    ``torch.utils.data.DataLoader`` and every returned hypothesis is written
    to ``-output`` as one space-separated line.
    '''

    parser = argparse.ArgumentParser(description='translate.py')

    parser.add_argument('-model', required=True, help='Path to model .pt file')
    parser.add_argument(
        '-src',
        required=True,
        help='Source sequence to decode (one line per sequence)')
    parser.add_argument(
        '-vocab',
        required=True,
        help='Path to the preprocessed data file providing the vocabulary')
    parser.add_argument('-output',
                        default='pred.txt',
                        help="""Path to output the predictions (each line will
                        be the decoded sequence)""")
    parser.add_argument('-beam_size', type=int, default=5, help='Beam size')
    parser.add_argument('-batch_size', type=int, default=30, help='Batch size')
    parser.add_argument('-n_best',
                        type=int,
                        default=1,
                        help="""If verbose is set, will output the n_best
                        decoded sentences""")
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Prepare DataLoader
    preprocess_data = torch.load(opt.vocab)
    preprocess_settings = preprocess_data['settings']
    test_src_word_insts = read_instances_from_file(
        opt.src, preprocess_settings.max_word_seq_len,
        preprocess_settings.keep_case)
    test_src_insts = convert_instance_to_idx_seq(
        test_src_word_insts, preprocess_data['dict']['src'])

    # Example of the parsed options at this point (from a debugging session):
    # Namespace(batch_size=30, beam_size=5, cuda=True, model='trained.chkpt',
    #     n_best=1, no_cuda=False, output='pred.txt', src='data/multi30k/test.en.atok',
    #     vocab='data/multi30k.atok.low.pt')

    test_loader = torch.utils.data.DataLoader(TranslationDataset(
        src_word2idx=preprocess_data['dict']['src'],
        tgt_word2idx=preprocess_data['dict']['tgt'],
        src_insts=test_src_insts),
                                              num_workers=2,
                                              batch_size=opt.batch_size,
                                              collate_fn=collate_fn)

    translator = Translator(opt)

    with open(opt.output, 'w') as f:
        for batch in tqdm(test_loader,
                          mininterval=2,
                          desc='  - (Test)',
                          leave=False):
            # The scores come back with the hypotheses but are not written.
            all_hyp, _ = translator.translate_batch(*batch)
            for idx_seqs in all_hyp:
                for idx_seq in idx_seqs:
                    pred_line = ' '.join([
                        test_loader.dataset.tgt_idx2word[idx]
                        for idx in idx_seq
                    ])
                    f.write(pred_line + '\n')
    print('[Info] Finished.')
Ejemplo n.º 10
0
def _read_parallel_insts(src_path, tgt_path, max_word_seq_len, keep_case,
                         split_name):
    '''Read one source/target file pair and return aligned, non-empty instances.

    Both files are read with ``prepro.read_instances_from_file``. If the
    instance counts differ, a warning is printed and both sides are cut to
    the shorter length. Pairs in which either side is empty are dropped.

    Args:
        src_path: path of the source-side file.
        tgt_path: path of the target-side file.
        max_word_seq_len: forwarded to ``read_instances_from_file``.
        keep_case: forwarded to ``read_instances_from_file``.
        split_name: word used in the warning and error messages
            (e.g. ``'training'``).

    Returns:
        A ``(src_insts, tgt_insts)`` pair of equally long tuples.

    Raises:
        ValueError: if no non-empty pair remains.
    '''
    src_insts = prepro.read_instances_from_file(src_path, max_word_seq_len,
                                                keep_case)
    tgt_insts = prepro.read_instances_from_file(tgt_path, max_word_seq_len,
                                                keep_case)

    if len(src_insts) != len(tgt_insts):
        print('[Warning] The %s instance count is not equal.' % split_name)
        min_inst_count = min(len(src_insts), len(tgt_insts))
        src_insts = src_insts[:min_inst_count]
        tgt_insts = tgt_insts[:min_inst_count]

    #- Remove empty instances
    pairs = [(s, t) for s, t in zip(src_insts, tgt_insts) if s and t]
    if not pairs:
        # zip(*[]) yields nothing, so the unpacking below would fail with an
        # unhelpful "not enough values to unpack" ValueError.
        raise ValueError('No non-empty %s instance pairs found.' % split_name)
    src_insts, tgt_insts = list(zip(*pairs))
    return src_insts, tgt_insts


def prep(train_src,
         train_tgt,
         valid_src,
         valid_tgt,
         save_data,
         max_word_seq_len=50,
         min_word_count=5,
         keep_case=True,
         share_vocab=True,
         vocab=None):
    '''Preprocess parallel training/validation text and save it with ``torch.save``.

    Args:
        train_src: path of the training source file.
        train_tgt: path of the training target file.
        valid_src: path of the validation source file.
        valid_tgt: path of the validation target file.
        save_data: path the processed data dictionary is written to.
        max_word_seq_len: forwarded to ``prepro.read_instances_from_file``.
        min_word_count: forwarded to ``prepro.build_vocab_idx``.
        keep_case: forwarded to ``prepro.read_instances_from_file``.
        share_vocab: build one vocabulary from source and target together
            instead of one per side (ignored when ``vocab`` is given).
        vocab: optional path of a previously saved data file whose
            ``'dict'`` entry is reused instead of building a vocabulary.

    Raises:
        ValueError: if the training or the validation set contains no
            non-empty instance pair.
    '''

    max_token_seq_len = max_word_seq_len + 2  # include the <s> and </s>

    # Training set
    train_src_word_insts, train_tgt_word_insts = _read_parallel_insts(
        train_src, train_tgt, max_word_seq_len, keep_case, 'training')

    # Validation set
    valid_src_word_insts, valid_tgt_word_insts = _read_parallel_insts(
        valid_src, valid_tgt, max_word_seq_len, keep_case, 'validation')

    # Build vocabulary
    if vocab:
        predefined_data = torch.load(vocab)
        assert 'dict' in predefined_data

        print('[Info] Pre-defined vocabulary found.')
        src_word2idx = predefined_data['dict']['src']
        tgt_word2idx = predefined_data['dict']['tgt']
    else:
        if share_vocab:
            print('[Info] Build shared vocabulary for source and target.')
            word2idx = prepro.build_vocab_idx(
                train_src_word_insts + train_tgt_word_insts, min_word_count)
            src_word2idx = tgt_word2idx = word2idx
        else:
            print('[Info] Build vocabulary for source.')
            src_word2idx = prepro.build_vocab_idx(train_src_word_insts,
                                                  min_word_count)
            print('[Info] Build vocabulary for target.')
            tgt_word2idx = prepro.build_vocab_idx(train_tgt_word_insts,
                                                  min_word_count)

    # word to index
    print('[Info] Convert source word instances into sequences of word index.')
    train_src_insts = prepro.convert_instance_to_idx_seq(
        train_src_word_insts, src_word2idx)
    valid_src_insts = prepro.convert_instance_to_idx_seq(
        valid_src_word_insts, src_word2idx)

    print('[Info] Convert target word instances into sequences of word index.')
    train_tgt_insts = prepro.convert_instance_to_idx_seq(
        train_tgt_word_insts, tgt_word2idx)
    valid_tgt_insts = prepro.convert_instance_to_idx_seq(
        valid_tgt_word_insts, tgt_word2idx)

    data = {
        # NOTE: only the maximum token sequence length is stored here, not a
        # full settings object.
        'settings': max_token_seq_len,
        'dict': {
            'src': src_word2idx,
            'tgt': tgt_word2idx
        },
        'train': {
            'src': train_src_insts,
            'tgt': train_tgt_insts
        },
        'valid': {
            'src': valid_src_insts,
            'tgt': valid_tgt_insts
        }
    }

    print('[Info] Dumping the processed data to pickle file', save_data)
    torch.save(data, save_data)
    print('[Info] Finish.')
Ejemplo n.º 11
0
def main():
    '''Translate one fixed sample sentence and save attention heat-maps.

    The checkpoint (-model) and the preprocessed vocabulary file (-vocab) are
    taken from the command line.  The hard-coded source line is translated,
    every hypothesis is printed, and attention weights of the first two
    encoder and decoder layers are drawn (four panels per figure) and saved
    as PNG files whose names start with the -output prefix.
    '''

    parser = argparse.ArgumentParser(description='translate.py')

    parser.add_argument('-model', required=True,
                        help='Path to model .pt file')
    parser.add_argument('-vocab', required=True,
                        help='Path to vocabulary file')
    # The value is concatenated with the figure file names below, so it must
    # be a string even when the option is omitted (None + str would raise).
    parser.add_argument('-output', default='',
                        help="""Path to output the predictions""")
    parser.add_argument('-beam_size', type=int, default=5,
                        help='Beam size')
    parser.add_argument('-n_best', type=int, default=1,
                        help="""If verbose is set, will output the n_best
                        decoded sentences""")
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    src_line = "Binary files a / build / linux / jre . tgz and b / build / linux / jre . tgz differ <nl>"

    # Prepare DataLoader
    preprocess_data = torch.load(opt.vocab)
    preprocess_settings = preprocess_data['settings']
    test_src_word_insts = read_instances(
        src_line,
        preprocess_settings.max_word_seq_len,
        preprocess_settings.keep_case)
    test_src_insts = convert_instance_to_idx_seq(
        test_src_word_insts, preprocess_data['dict']['src'])

    test_loader = torch.utils.data.DataLoader(
        TranslationDataset(
            src_word2idx=preprocess_data['dict']['src'],
            tgt_word2idx=preprocess_data['dict']['tgt'],
            src_insts=test_src_insts),
        num_workers=2,
        batch_size=1,
        collate_fn=collate_fn)

    translator = Translator(opt)

    # Keeps the name bound even if the loader yields no hypothesis at all.
    pred_line = ''
    for batch in tqdm(test_loader, mininterval=1, desc='  - (Test)', leave=False):
        all_hyp, all_scores = translator.translate_batch(*batch)
        for idx_seqs in all_hyp:
            for idx_seq in idx_seqs:
                # The last index of every hypothesis is dropped before
                # mapping back to words (presumably the end-of-sequence
                # token -- TODO confirm).
                pred_line = ' '.join(
                    [test_loader.dataset.tgt_idx2word[idx] for idx in idx_seq[:-1]])
                # Print every hypothesis, not only the last one of the beam.
                print(pred_line)

    # The plots below are labelled with the source tokens and with the tokens
    # of the last printed hypothesis.
    sent = src_line.split()
    tgt_sent = pred_line.split()

    for layer in range(0, 2):
        fig, axs = plt.subplots(1, 4, figsize=(20, 10))
        print("Encoder Layer", layer+1)
        attn = translator.model.encoder.layer_stack[layer].slf_attn.attn
        for h in range(4):
            print(attn.data.cpu().size())
            # Only the first panel gets y-axis labels.
            draw(attn[h, :, :].data.cpu(),
                 sent, sent if h == 0 else [], ax=axs[h])
        plt.savefig(opt.output+"Encoder Layer %d.png" % layer)
        # Release the figure; otherwise every figure stays alive until exit.
        plt.close(fig)

    for layer in range(0, 2):
        fig, axs = plt.subplots(1, 4, figsize=(20, 10))
        print("Decoder Self Layer", layer+1)
        attn = translator.model.decoder.layer_stack[layer].slf_attn.attn
        for h in range(4):
            print(attn.data.cpu().size())
            draw(attn[:, :, h].data[:len(tgt_sent), :len(tgt_sent)].cpu(),
                 tgt_sent, tgt_sent if h == 0 else [], ax=axs[h])
        plt.savefig(opt.output+"Decoder Self Layer %d.png" % layer)
        plt.close(fig)

        print("Decoder Src Layer", layer+1)
        fig, axs = plt.subplots(1, 4, figsize=(20, 10))
        # NOTE(review): this figure is titled "Decoder Src" but reads the
        # decoder *self*-attention weights (slf_attn), exactly like the figure
        # above -- confirm whether the encoder-decoder attention module was
        # intended here.  The indexing (attn[:, :, h]) also differs from the
        # encoder plot (attn[h, :, :]); verify against the attention module.
        attn = translator.model.decoder.layer_stack[layer].slf_attn.attn
        for h in range(4):
            draw(attn[:, :, h].data[:len(sent), :len(tgt_sent)].cpu(),
                 tgt_sent, sent if h == 0 else [], ax=axs[h])
        plt.savefig(opt.output+"Decoder Src Layer %d.png" % layer)
        plt.close(fig)

    print('[Info] Finished.')
Ejemplo n.º 12
0
def main():
    '''Translate the -src file with a trained model and write the result.

    This model translates from English to German.

    Besides the translation itself, the source and target vocabularies found
    in the -vocab file are dumped as text to the files '55' and '66' in the
    current directory.
    '''

    parser = argparse.ArgumentParser(description='translate.py')

    # The former hard-coded paths are now the defaults, so running without
    # arguments behaves as before while explicit options are honoured instead
    # of being silently overwritten.
    parser.add_argument('-model', required=False, default='trained.chkpt',
                        help='Path to model .pt file')
    parser.add_argument('-src', required=False, default='1',
                        help='Source sequence to decode (one line per sequence)')
    # Example: -vocab data/multi30k.atok.low.pt
    parser.add_argument('-vocab', required=False, default='multi30k.atok.low.pt',
                        help='Path to the preprocessed data file that holds the vocabularies')
    parser.add_argument('-output', default='2',
                        help="""Path to output the predictions (each line will
                        be the decoded sequence""")
    parser.add_argument('-beam_size', type=int, default=5,
                        help='Beam size')
    parser.add_argument('-batch_size', type=int, default=30,
                        help='Batch size')
    parser.add_argument('-n_best', type=int, default=1,
                        help="""If verbose is set, will output the n_best
                        decoded sentences""")
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    # This script is pinned to the CPU; -no_cuda is still accepted so existing
    # command lines keep working, but it has no effect.
    opt.cuda = False

    # Prepare DataLoader
    preprocess_data = torch.load(opt.vocab)

    # Dump both vocabularies for inspection.  An explicit UTF-8 encoding keeps
    # non-ASCII tokens from failing on platforms with a narrower default.
    with open('55', 'w', encoding='utf-8') as f:
        f.write(str(preprocess_data['dict']['src']))

    with open('66', 'w', encoding='utf-8') as f:
        f.write(str(preprocess_data['dict']['tgt']))

    preprocess_settings = preprocess_data['settings']
    test_src_word_insts = read_instances_from_file(
        opt.src,
        preprocess_settings.max_word_seq_len,
        preprocess_settings.keep_case)
    test_src_insts = convert_instance_to_idx_seq(
        test_src_word_insts, preprocess_data['dict']['src'])

    test_loader = torch.utils.data.DataLoader(
        TranslationDataset(
            src_word2idx=preprocess_data['dict']['src'],
            tgt_word2idx=preprocess_data['dict']['tgt'],
            src_insts=test_src_insts),
        num_workers=2,
        batch_size=opt.batch_size,
        collate_fn=collate_fn)

    translator = Translator(opt)

    with open(opt.output, 'w', encoding='utf-8') as f:
        for batch in test_loader:
            all_hyp, all_scores = translator.translate_batch(*batch)
            for idx_seqs in all_hyp:
                for idx_seq in idx_seqs:
                    print(idx_seq)
                    # Convert the ids back to text.
                    pred_line = ' '.join(
                        [test_loader.dataset.tgt_idx2word[idx] for idx in idx_seq])
                    f.write(pred_line + '\n')
    print('[Info] Finished.')
Ejemplo n.º 13
0
                        type=int,
                        default=1,
                        help="""If verbose is set, will output the n_best
                        decoded sentences""")
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Prepare DataLoader
    preprocess_data = torch.load(opt.vocab)
    preprocess_settings = preprocess_data['settings']
    test_src_word_insts = read_instances_from_file(
        opt.src, preprocess_settings.max_word_seq_len,
        preprocess_settings.keep_case)
    test_src_insts = convert_instance_to_idx_seq(
        test_src_word_insts, preprocess_data['dict']['src'])

    test_loader = torch.utils.data.DataLoader(TranslationDataset(
        src_word2idx=preprocess_data['dict']['src'],
        tgt_word2idx=preprocess_data['dict']['tgt'],
        src_insts=test_src_insts),
                                              num_workers=2,
                                              batch_size=opt.batch_size,
                                              collate_fn=collate_fn)

    translator = Translator(opt)

    with open(opt.output, 'w') as f:
        for batch in tqdm(test_loader,
                          mininterval=2,
                          desc='  - (Test)',
Ejemplo n.º 14
0
def main():
    '''Tag the -sent input with a trained model and write the extractions.

    The data file given by -vocab supplies the settings and the index
    mappings; the instances read from -sent are converted to index
    sequences, tagged batch by batch, and every extraction is written on its
    own line of the -output file.
    '''

    cli = argparse.ArgumentParser(description='openie_extract.py')

    cli.add_argument('-model', help='Path to model .pt file', required=True)
    cli.add_argument('-sent',
                     help='Source sentence to extract from in raw format',
                     required=True)
    cli.add_argument('-vocab',
                     help='training data which contains necessary information',
                     required=True)
    cli.add_argument(
        '-output',
        default='pred.txt',
        help="""Path to output the predictions (each line will
                        be the extraction""")
    cli.add_argument('-beam_size', default=5, type=int, help='Beam size')
    cli.add_argument('-batch_size', default=30, type=int, help='Batch size')
    cli.add_argument(
        '-n_best',
        default=1,
        type=int,
        help="""If verbose is set, will output the n_best
                        decoded sentences""")
    cli.add_argument('-no_cuda', action='store_true')

    opt = cli.parse_args()
    opt.cuda = not opt.no_cuda

    # Prepare DataLoader
    preprocess_data = torch.load(opt.vocab)
    (raw_sents, word_insts, pred_idx_insts, pred_word_insts,
     pred_pos_insts, pos_insts, path_insts) = read_instances_from_raw_sentence(
         opt.sent, preprocess_data['settings'])

    # Map every token-level view of the input to integer indices.
    word_insts = convert_instance_to_idx_seq(
        word_insts, preprocess_data['word2idx'])
    pos_insts = convert_instance_to_idx_seq(
        pos_insts, preprocess_data['pos2idx'])
    pred_word_insts = convert_instance_to_idx_seq(
        pred_word_insts, preprocess_data['word2idx'])
    pred_pos_insts = convert_instance_to_idx_seq(
        pred_pos_insts, preprocess_data['pos2idx'])
    path_insts = convert_path_instance_to_idx_seq(
        path_insts, preprocess_data['path2idx'])
    combined_insts = concat_inp(word_insts, pos_insts, pred_idx_insts,
                                pred_word_insts, pred_pos_insts)

    dataset = OpenIEDataset(
        word2idx=preprocess_data['word2idx'],
        tag2idx=preprocess_data['tag2idx'],
        word_insts=combined_insts,
        path_insts=path_insts)
    test_loader = torch.utils.data.DataLoader(
        dataset,
        num_workers=2,
        batch_size=opt.batch_size,
        collate_fn=openie_collate_fn)

    tagger = Tagger(opt)

    with open(opt.output, 'w') as fout:
        progress = tqdm(test_loader, mininterval=2, desc='  - (Test)', leave=False)
        for batch_no, batch in enumerate(progress):
            # Window of raw sentences / predicate indices covered by this batch.
            lo = batch_no * opt.batch_size
            hi = lo + opt.batch_size

            probs, tags = tagger.tag_batch(*batch, skip_first=2)  # skip PAD and UNK
            # Pair each tag index with its probability along a new last axis.
            paired = torch.cat(
                [tags.unsqueeze(-1).float(), probs.unsqueeze(-1)], dim=-1)
            paired = paired.cpu().numpy()

            tag_prob_list = []
            for seq in paired:
                tag_prob_list.append(
                    [(test_loader.dataset.idx2tag[t], p) for t, p in seq])

            exts = tag2extraction(raw_sents[lo:hi], pred_idx_insts[lo:hi],
                                  tag_prob_list, pred_idx=2)
            fout.writelines('{}\n'.format(ext) for ext in exts)
    print('[Info] Finished.')
Ejemplo n.º 15
0
def main():
    """Translate the -src file with a trained model.

    The preprocessed data file given by -vocab supplies the preprocessing
    settings and the source/target vocabularies.  Every decoded hypothesis is
    written on its own line of the -output file.
    """

    parser = argparse.ArgumentParser(description="translate.py")

    parser.add_argument("-model", required=True, help="Path to model .pt file")
    parser.add_argument(
        "-src",
        required=True,
        help="Source sequence to decode (one line per sequence)")
    # The help text used to be a copy of the -src description.
    parser.add_argument(
        "-vocab",
        required=True,
        help="Path to the preprocessed data file that holds the vocabularies")
    parser.add_argument("-output",
                        default="pred.txt",
                        help="""Path to output the predictions (each line will
                        be the decoded sequence""")
    parser.add_argument("-beam_size", type=int, default=5, help="Beam size")
    parser.add_argument("-batch_size", type=int, default=30, help="Batch size")
    parser.add_argument("-n_best",
                        type=int,
                        default=1,
                        help="""If verbose is set, will output the n_best
                        decoded sentences""")
    parser.add_argument("-no_cuda", action="store_true")

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Prepare DataLoader
    preprocess_data = torch.load(opt.vocab)
    preprocess_settings = preprocess_data["settings"]
    test_src_word_insts = read_instances_from_file(
        opt.src, preprocess_settings.max_word_seq_len,
        preprocess_settings.keep_case)
    test_src_insts = convert_instance_to_idx_seq(
        test_src_word_insts, preprocess_data["dict"]["src"])

    test_loader = torch.utils.data.DataLoader(TranslationDataset(
        src_word2idx=preprocess_data["dict"]["src"],
        tgt_word2idx=preprocess_data["dict"]["tgt"],
        src_insts=test_src_insts),
                                              num_workers=2,
                                              batch_size=opt.batch_size,
                                              collate_fn=collate_fn)

    translator = Translator(opt)

    # Explicit UTF-8 so non-ASCII target tokens do not depend on the
    # platform's default encoding.
    with open(opt.output, "w", encoding="utf-8") as f:
        for batch in tqdm(test_loader,
                          mininterval=2,
                          desc="  - (Test)",
                          leave=False):
            all_hyp, all_scores = translator.translate_batch(*batch)
            for idx_seqs in all_hyp:
                for idx_seq in idx_seqs:
                    # Map the predicted indices back to target words.
                    pred_line = " ".join([
                        test_loader.dataset.tgt_idx2word[idx]
                        for idx in idx_seq
                    ])
                    f.write(pred_line + "\n")
    print("[Info] Finished.")
def main():
    """Translate a parallel test set, report token accuracy and dump results.

    The -src and -tgt files are read with the settings and vocabularies found
    in the -vocab data file.  For every batch the best hypothesis of each
    sentence is padded or truncated to the reference length and scored with
    ``cal_performance``.  Source, reference and predicted lines are written
    to the -output file and the overall accuracy is appended to the -log
    file.
    """

    parser = argparse.ArgumentParser(description='translate.py')

    parser.add_argument('-model', required=True, help='Path to model .pt file')
    parser.add_argument('-src',
                        required=True,
                        help='Source sequence to decode '
                        '(one line per sequence)')
    parser.add_argument('-tgt',
                        required=True,
                        help='Target sequence to decode '
                        '(one line per sequence)')
    # The help text used to be a copy of the -src description.
    parser.add_argument('-vocab',
                        required=True,
                        help='Path to the preprocessed data file that holds '
                        'the vocabularies')
    parser.add_argument('-log',
                        default='translate_log.txt',
                        help="""Path to log the translation(test_inference) 
                        loss""")
    parser.add_argument('-output',
                        default='pred.txt',
                        help="""Path to output the predictions (each line will
                        be the decoded sequence""")
    parser.add_argument('-beam_size', type=int, default=5, help='Beam size')
    parser.add_argument('-batch_size', type=int, default=2, help='Batch size')
    parser.add_argument('-n_best',
                        type=int,
                        default=1,
                        help="""If verbose is set, will output the n_best
                        decoded sentences""")
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Prepare DataLoader
    preprocess_data = torch.load(opt.vocab)
    preprocess_settings = preprocess_data['settings']
    test_src_word_insts = read_instances_from_file(
        opt.src, preprocess_settings.max_word_seq_len,
        preprocess_settings.keep_case)
    test_tgt_word_insts = read_instances_from_file(
        opt.tgt, preprocess_settings.max_word_seq_len,
        preprocess_settings.keep_case)
    test_src_insts = convert_instance_to_idx_seq(
        test_src_word_insts, preprocess_data['dict']['src'])
    test_tgt_insts = convert_instance_to_idx_seq(
        test_tgt_word_insts, preprocess_data['dict']['tgt'])

    test_loader = torch.utils.data.DataLoader(TranslationDataset(
        src_word2idx=preprocess_data['dict']['src'],
        tgt_word2idx=preprocess_data['dict']['tgt'],
        src_insts=test_src_insts,
        tgt_insts=test_tgt_insts),
                                              num_workers=2,
                                              batch_size=opt.batch_size,
                                              collate_fn=paired_collate_fn)

    translator = Translator(opt)

    n_word_total = 0
    n_word_correct = 0

    separator = (
        "\n ----------------------------------------------------------------------------------------------------------------------------------------------  \n"
    )

    # Explicit UTF-8 so non-ASCII tokens do not depend on the platform's
    # default encoding.
    with open(opt.output, 'w', encoding='utf-8') as f:
        for batch in tqdm(test_loader,
                          mininterval=2,
                          desc='  - (Test)',
                          leave=False):
            # Only the first two batch entries are passed to the translator;
            # the third one is used as the reference target batch.
            all_hyp, all_scores = translator.translate_batch(
                batch[0], batch[1])

            src_seqs = batch[0]
            tgt_seqs = batch[2]
            # The first column of the target batch is dropped before scoring
            # (presumably the begin-of-sequence token -- TODO confirm).
            gold = tgt_seqs[:, 1:]
            max_len = gold.shape[1]

            # Bring the best hypothesis of every sentence to the reference
            # length.  A copy is padded so that the hypotheses written to the
            # output file below stay free of padding indices.
            # NOTE(review): 0 is used as padding index here, presumably equal
            # to Constants.PAD -- confirm.
            padded_hyps = []
            for hyps in all_hyp:
                best = list(hyps[0])[:max_len]
                best.extend([0] * (max_len - len(best)))
                padded_hyps.append(best)
            # view(-1) also handles a final batch shorter than batch_size.
            flat_pred = torch.LongTensor(np.array(padded_hyps)).view(-1)

            n_correct = cal_performance(flat_pred, gold)

            n_word_total += gold.ne(Constants.PAD).sum().item()
            n_word_correct += n_correct

            # Pair every sentence with its hypotheses; all n_best hypotheses
            # of a sentence are reported against the same source/target line.
            for src_seq, tgt_seq, hyps in zip(src_seqs, tgt_seqs, all_hyp):
                src_line = ' '.join([
                    test_loader.dataset.src_idx2word[idx]
                    for idx in src_seq.tolist()
                ])
                tgt_line = ' '.join([
                    test_loader.dataset.tgt_idx2word[idx]
                    for idx in tgt_seq.tolist()
                ])
                for hyp in hyps:
                    pred_line = ' '.join([
                        test_loader.dataset.tgt_idx2word[idx] for idx in hyp
                    ])
                    f.write(separator)
                    f.write("\n [src]  " + src_line + '\n')
                    f.write("\n [tgt]  " + tgt_line + '\n')
                    f.write("\n [pred] " + pred_line + '\n')

    # Guard against an empty test set instead of dividing by zero.
    accuracy = n_word_correct / n_word_total if n_word_total else 0.0
    accr_log = "accuracy: {} |".format(accuracy)

    with open(opt.log, 'a') as log_tf:
        log_tf.write(accr_log + '\n')

    print('[Info] Finished.')
Ejemplo n.º 17
0
def main():
    '''Translate the -src file, optionally together with a -ctx context file.

    The data file given by -vocab supplies the keep_case setting and the
    vocabularies; sequences are read with a length limit of
    -max_token_seq_len.  Every decoded hypothesis is written on its own line
    of the -output file, without a trailing end-of-sequence token.
    '''

    parser = argparse.ArgumentParser(description='translate.py')

    parser.add_argument('-model',
                        default='trained.chkpt',
                        help='Path to model .pt file')
    parser.add_argument(
        '-src',
        default='data/multi30k/test.en.atok',
        help='Source sequence to decode (one line per sequence)')
    parser.add_argument(
        '-ctx',
        required=False,
        default="",
        help='Context sequence to decode (one line per sequence)')
    parser.add_argument('-vocab',
                        default='data/multi30k.atok.low.pt',
                        help='Data that contains the source vocabulary')
    parser.add_argument('-output',
                        default='pred.txt',
                        help="""Path to output the predictions (each line will
                        be the decoded sequence""")
    parser.add_argument('-beam_size', type=int, default=5, help='Beam size')
    parser.add_argument('-batch_size', type=int, default=30, help='Batch size')
    parser.add_argument('-n_best',
                        type=int,
                        default=1,
                        help="""If verbose is set, will output the n_best
                        decoded sentences""")
    # store_true: the flag is off by default and passing -no_cuda disables
    # CUDA.  With store_false the meaning was inverted (CUDA was off unless
    # -no_cuda was given).
    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-max_token_seq_len', type=int, default=100)

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Prepare DataLoader
    preprocess_data = torch.load(opt.vocab)
    preprocess_settings = preprocess_data['settings']

    test_src_word_insts = read_instances_from_file(
        opt.src, opt.max_token_seq_len, preprocess_settings.keep_case)
    test_src_insts = convert_instance_to_idx_seq(
        test_src_word_insts, preprocess_data['dict']['src'])

    # The context file is optional; it is indexed with the source vocabulary.
    test_ctx_insts = None
    if opt.ctx:
        from preprocess_ctx import read_instances_from_file as read_instances_from_file_ctx
        test_ctx_word_insts = read_instances_from_file_ctx(
            opt.ctx,
            opt.max_token_seq_len,
            preprocess_settings.keep_case,
            is_ctx=True)
        test_ctx_insts = convert_instance_to_idx_seq(
            test_ctx_word_insts, preprocess_data['dict']['src'])

    test_data = DataLoader(preprocess_data['dict']['src'],
                           preprocess_data['dict']['tgt'],
                           src_insts=test_src_insts,
                           ctx_insts=test_ctx_insts,
                           cuda=opt.cuda,
                           shuffle=False,
                           batch_size=opt.batch_size,
                           is_train=False)

    translator = Translator(opt)
    translator.model.eval()

    with open(opt.output, 'w') as f:
        for batch in tqdm(test_data,
                          mininterval=2,
                          desc='  - (Test)',
                          leave=False):
            all_hyp, all_scores = translator.translate_batch(*batch)
            for idx_seqs in all_hyp:
                for idx_seq in idx_seqs:
                    # Drop a trailing EOS (index 3); the length check keeps an
                    # empty hypothesis from raising IndexError.
                    if len(idx_seq) > 0 and idx_seq[-1] == 3:
                        idx_seq = idx_seq[:-1]
                    pred_line = ' '.join(
                        [test_data.tgt_idx2word[int(idx)] for idx in idx_seq])
                    f.write(pred_line + '\n')

    print('[Info] Finished.')