def test_build_vocab_idx():
    """Smoke-test build_vocab_idx on a local corpus and print the mapping.

    Reads the instances of a hard-coded file, drops the empty ones and
    prints the vocabulary built with a minimum word count of 2.
    """
    corpus_path = '/home/peng/Workspace/data/multi30k/train.en'
    instances = read_instances_from_file(corpus_path, 30, False)
    # filter(None, ...) keeps only the truthy (non-empty) instances.
    non_empty_instances = list(filter(None, instances))
    print(build_vocab_idx(non_empty_instances, 2))
def main():
    '''Translate a source file and write one decoded hypothesis per line.

    Command-line options select the model checkpoint, the source file, the
    preprocessed data file (vocabularies and settings) and the output path.
    '''

    cli = argparse.ArgumentParser(description='translate.py')
    cli.add_argument('-model', required=True, help='Path to model .pt file')
    cli.add_argument('-src', required=True,
                     help='Source sequence to decode (one line per sequence)')
    cli.add_argument('-vocab', required=True,
                     help='Source sequence to decode (one line per sequence)')
    cli.add_argument('-output', default='pred.txt',
                     help="""Path to output the predictions (each line will
                        be the decoded sequence""")
    cli.add_argument('-beam_size', type=int, default=5, help='Beam size')
    cli.add_argument('-batch_size', type=int, default=30, help='Batch size')
    cli.add_argument('-n_best', type=int, default=1,
                     help="""If verbose is set, will output the n_best
                        decoded sentences""")
    cli.add_argument('-no_cuda', action='store_true')

    opt = cli.parse_args()
    # CUDA is enabled unless it was explicitly switched off.
    opt.cuda = not opt.no_cuda

    # The preprocessed data file carries both vocabularies and the settings
    # (max length, case handling) used when the corpus was preprocessed.
    preprocess_data = torch.load(opt.vocab)
    settings = preprocess_data['settings']
    src_word2idx = preprocess_data['dict']['src']
    tgt_word2idx = preprocess_data['dict']['tgt']

    src_word_insts = read_instances_from_file(
        opt.src, settings.max_word_seq_len, settings.keep_case)
    src_idx_insts = convert_instance_to_idx_seq(src_word_insts, src_word2idx)

    test_data = DataLoader(
        src_word2idx,
        tgt_word2idx,
        src_insts=src_idx_insts,
        cuda=opt.cuda,
        shuffle=False,
        batch_size=opt.batch_size)

    translator = Translator(opt)
    translator.model.eval()

    with open(opt.output, 'w') as out_file:
        progress = tqdm(test_data, mininterval=2, desc='  - (Test)',
                        leave=False)
        for batch in progress:
            hyps_per_sentence, _ = translator.translate_batch(batch)
            for hyps in hyps_per_sentence:
                for hyp in hyps:
                    # Map every predicted index back to its target word.
                    words = [test_data.tgt_idx2word[idx] for idx in hyp]
                    out_file.write(' '.join(words) + '\n')
    print('[Info] Finished.')
def test_read_instances_from_file():
    """Smoke-test read_instances_from_file by printing every instance read.

    Uses a hard-coded corpus path, a maximum sentence length of 40 and
    keeps the original casing.
    """
    sentences = read_instances_from_file(
        '/home/peng/Workspace/data/multi30k/train.en', 40, True)
    for sentence in sentences:
        print(sentence)
Beispiel #4
0
def main():
    """Decode a source file with a trained model and save the predictions.

    Parses the CLI options, loads the preprocessed vocabulary data, converts
    the source sentences to index sequences, translates them batch by batch
    and writes every returned hypothesis as one line of the output file.
    """
    arg_parser = argparse.ArgumentParser()

    arg_parser.add_argument('-model', required=True,
                            help='Path to model .pt file')
    arg_parser.add_argument(
        '-src', required=True,
        help='Source sequence to decode (one line per sequence)')
    arg_parser.add_argument(
        '-vocab', required=True,
        help='Source sequence to decode (one line per sequence)')
    arg_parser.add_argument(
        '-output', default='pred.txt',
        help="""Path to output the predictions (each line will
                            be the decoded sequence""")
    arg_parser.add_argument('-beam_size', type=int, default=5,
                            help='Beam size')
    arg_parser.add_argument('-batch_size', type=int, default=30,
                            help='Batch size')
    arg_parser.add_argument(
        '-n_best', type=int, default=1,
        help="""If verbose is set, will output the n_best
                            decoded sentences""")
    # store_true: the option becomes True when the flag is present.
    arg_parser.add_argument('-no_cuda', action='store_true')

    opt = arg_parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Vocabularies and preprocessing settings come from the same file.
    preprocess_data = torch.load(opt.vocab)
    settings = preprocess_data['settings']
    vocab = preprocess_data['dict']

    word_insts = read_instances_from_file(
        opt.src, settings.max_word_seq_len, settings.keep_case)
    idx_insts = convert_instance_to_idx_seq(word_insts, vocab['src'])

    dataset = TranslationDataset(src_word2idx=vocab['src'],
                                 tgt_word2idx=vocab['tgt'],
                                 src_insts=idx_insts)
    test_loader = torch.utils.data.DataLoader(dataset,
                                              num_workers=2,
                                              batch_size=opt.batch_size,
                                              collate_fn=collate_fn)
    translator = Translator(opt)

    with open(opt.output, 'w') as out_file:
        batches = tqdm(test_loader, mininterval=2, desc='  - (Test)',
                       leave=False)
        for batch in batches:
            hyp_streams, _ = translator.translate_batch(*batch)
            for hyp_stream in hyp_streams:
                for hyp in hyp_stream:
                    # Turn predicted indices back into target words.
                    words = [test_loader.dataset.tgt_idx2word[idx]
                             for idx in hyp]
                    out_file.write(' '.join(words) + '\n')
    print('[Info] Finished')
def main():
    '''Run translation from the command line and dump the predictions.

    The source sentences are indexed with the source vocabulary stored in
    the preprocessed data file, translated batch by batch, and each
    returned hypothesis is written as one line of the output file.
    '''

    parser = argparse.ArgumentParser(description='translate.py')

    parser.add_argument('-model', required=True, help='Path to model .pt file')
    parser.add_argument(
        '-src',
        required=True,
        help='Source sequence to decode (one line per sequence)')
    parser.add_argument(
        '-vocab',
        required=True,
        help='Source sequence to decode (one line per sequence)')
    parser.add_argument(
        '-output',
        default='pred.txt',
        help="""Path to output the predictions (each line will
                        be the decoded sequence""")
    parser.add_argument('-beam_size', type=int, default=5, help='Beam size')
    parser.add_argument('-batch_size', type=int, default=30, help='Batch size')
    parser.add_argument(
        '-n_best',
        type=int,
        default=1,
        help="""If verbose is set, will output the n_best
                        decoded sentences""")
    parser.add_argument('-no_cuda', action='store_true')

    args = parser.parse_args()
    # Derive the positive flag consumed by the loader from -no_cuda.
    args.cuda = not args.no_cuda

    # Prepare the data: vocabularies and settings are read from -vocab.
    preprocess_data = torch.load(args.vocab)
    preprocess_settings = preprocess_data['settings']
    dictionaries = preprocess_data['dict']

    source_words = read_instances_from_file(
        args.src,
        preprocess_settings.max_word_seq_len,
        preprocess_settings.keep_case)
    source_indices = convert_instance_to_idx_seq(
        source_words, dictionaries['src'])

    test_data = DataLoader(dictionaries['src'],
                           dictionaries['tgt'],
                           src_insts=source_indices,
                           cuda=args.cuda,
                           shuffle=False,
                           batch_size=args.batch_size)

    translator = Translator(args)
    translator.model.eval()

    with open(args.output, 'w') as out:
        for batch in tqdm(test_data,
                          mininterval=2,
                          desc='  - (Test)',
                          leave=False):
            batch_hyps, _scores = translator.translate_batch(batch)
            for sentence_hyps in batch_hyps:
                for idx_seq in sentence_hyps:
                    # Look each index up in the target index-to-word table.
                    tokens = [test_data.tgt_idx2word[idx] for idx in idx_seq]
                    out.write(' '.join(tokens) + '\n')
    print('[Info] Finished.')
def main():
    """Translate a test set, dump src/tgt/pred triples and log word accuracy.

    The source and reference files are indexed with the vocabularies stored
    in the preprocessed data file. Every batch is translated; the first
    hypothesis of each sentence is padded/truncated to the reference length
    and handed to ``cal_performance`` together with the references. The
    readable source, reference and prediction of every hypothesis are
    written to ``-output`` and the overall accuracy is appended to ``-log``.
    """

    parser = argparse.ArgumentParser(description='translate.py')

    parser.add_argument('-model', required=True, help='Path to model .pt file')
    parser.add_argument('-src',
                        required=True,
                        help='Source sequence to decode '
                        '(one line per sequence)')
    parser.add_argument('-tgt',
                        required=True,
                        help='Target sequence to decode '
                        '(one line per sequence)')
    # The help text used to be a copy of the -src one; this option is the
    # preprocessed data file that is passed to torch.load below.
    parser.add_argument('-vocab',
                        required=True,
                        help='Preprocessed data file providing the '
                        'vocabularies and settings')
    parser.add_argument('-log',
                        default='translate_log.txt',
                        help="""Path to log the translation(test_inference) 
                        loss""")
    parser.add_argument('-output',
                        default='pred.txt',
                        help="""Path to output the predictions (each line will
                        be the decoded sequence""")
    parser.add_argument('-beam_size', type=int, default=5, help='Beam size')
    parser.add_argument('-batch_size', type=int, default=2, help='Batch size')
    parser.add_argument('-n_best',
                        type=int,
                        default=1,
                        help="""If verbose is set, will output the n_best
                        decoded sentences""")
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Prepare DataLoader
    preprocess_data = torch.load(opt.vocab)
    preprocess_settings = preprocess_data['settings']
    src_word2idx = preprocess_data['dict']['src']
    tgt_word2idx = preprocess_data['dict']['tgt']

    test_src_word_insts = read_instances_from_file(
        opt.src, preprocess_settings.max_word_seq_len,
        preprocess_settings.keep_case)
    test_tgt_word_insts = read_instances_from_file(
        opt.tgt, preprocess_settings.max_word_seq_len,
        preprocess_settings.keep_case)
    test_src_insts = convert_instance_to_idx_seq(test_src_word_insts,
                                                 src_word2idx)
    test_tgt_insts = convert_instance_to_idx_seq(test_tgt_word_insts,
                                                 tgt_word2idx)

    test_loader = torch.utils.data.DataLoader(
        TranslationDataset(src_word2idx=src_word2idx,
                           tgt_word2idx=tgt_word2idx,
                           src_insts=test_src_insts,
                           tgt_insts=test_tgt_insts),
        num_workers=2,
        batch_size=opt.batch_size,
        collate_fn=paired_collate_fn)

    translator = Translator(opt)

    n_word_total = 0
    n_word_correct = 0

    src_idx2word = test_loader.dataset.src_idx2word
    tgt_idx2word = test_loader.dataset.tgt_idx2word
    separator = "\n ----------------------------------------------------------------------------------------------------------------------------------------------  \n"

    with open(opt.output, 'w') as f:
        for batch in tqdm(test_loader,
                          mininterval=2,
                          desc='  - (Test)',
                          leave=False):
            # batch[0]/batch[1] are fed to the translator, batch[2] holds
            # the reference sequences (layout produced by paired_collate_fn).
            src_seqs = batch[0]
            tgt_seqs = batch[2]
            all_hyp, all_scores = translator.translate_batch(
                batch[0], batch[1])

            # Drop the first reference token (presumably <s> -- TODO confirm)
            gold = tgt_seqs[:, 1:]
            max_len = gold.shape[1]

            # Fit the first hypothesis of every sentence to max_len. Work on
            # a copy so the padding never leaks into the hypotheses that are
            # written to the output file further down.
            padded_best = []
            for hyps in all_hyp:
                best = list(hyps[0][:max_len])
                best.extend([Constants.PAD] * (max_len - len(best)))
                padded_best.append(best)
            # view(-1) instead of batch_size * max_len: the last batch may
            # hold fewer than opt.batch_size sentences.
            pred_flat = torch.LongTensor(padded_best).view(-1)

            n_correct = cal_performance(pred_flat, gold)

            non_pad_mask = gold.ne(Constants.PAD)
            n_word_total += non_pad_mask.sum().item()
            n_word_correct += n_correct

            # Pair every sentence with its own hypotheses; zip keeps source,
            # reference and prediction aligned even with several hypotheses
            # per sentence.
            for src_seq, tgt_seq, hyps in zip(src_seqs, tgt_seqs, all_hyp):
                src_line = ' '.join(
                    [src_idx2word[idx] for idx in src_seq.tolist()])
                tgt_line = ' '.join(
                    [tgt_idx2word[idx] for idx in tgt_seq.tolist()])
                for hyp in hyps:
                    pred_line = ' '.join([tgt_idx2word[idx] for idx in hyp])
                    f.write(separator)
                    f.write("\n [src]  " + src_line + '\n')
                    f.write("\n [tgt]  " + tgt_line + '\n')
                    f.write("\n [pred] " + pred_line + '\n')

    # Guard against an empty test set (no non-pad reference words).
    accuracy = n_word_correct / n_word_total if n_word_total else 0.0
    accr_log = "accuracy: {} |".format(accuracy)

    with open(opt.log, 'a') as log_tf:
        log_tf.write(accr_log + '\n')

    print('[Info] Finished.')
Beispiel #7
0
def main():
    """Command-line entry point: translate a file and store the hypotheses.

    Every hypothesis returned by the translator is converted back to words
    with the target index-to-word table and written on its own line.
    """

    cli_parser = argparse.ArgumentParser(description="translate.py")

    cli_parser.add_argument("-model", required=True,
                            help="Path to model .pt file")
    cli_parser.add_argument(
        "-src", required=True,
        help="Source sequence to decode (one line per sequence)")
    cli_parser.add_argument(
        "-vocab", required=True,
        help="Source sequence to decode (one line per sequence)")
    cli_parser.add_argument(
        "-output", default="pred.txt",
        help="""Path to output the predictions (each line will
                        be the decoded sequence""")
    cli_parser.add_argument("-beam_size", type=int, default=5,
                            help="Beam size")
    cli_parser.add_argument("-batch_size", type=int, default=30,
                            help="Batch size")
    cli_parser.add_argument(
        "-n_best", type=int, default=1,
        help="""If verbose is set, will output the n_best
                        decoded sentences""")
    cli_parser.add_argument("-no_cuda", action="store_true")

    opt = cli_parser.parse_args()
    # Positive counterpart of the -no_cuda switch.
    opt.cuda = not opt.no_cuda

    # Load vocabularies and preprocessing settings.
    preprocess_data = torch.load(opt.vocab)
    settings = preprocess_data["settings"]
    src_vocab = preprocess_data["dict"]["src"]
    tgt_vocab = preprocess_data["dict"]["tgt"]

    src_sentences = read_instances_from_file(
        opt.src, settings.max_word_seq_len, settings.keep_case)
    src_sequences = convert_instance_to_idx_seq(src_sentences, src_vocab)

    test_loader = torch.utils.data.DataLoader(
        TranslationDataset(src_word2idx=src_vocab,
                           tgt_word2idx=tgt_vocab,
                           src_insts=src_sequences),
        num_workers=2,
        batch_size=opt.batch_size,
        collate_fn=collate_fn)

    translator = Translator(opt)

    with open(opt.output, "w") as sink:
        for batch in tqdm(test_loader, mininterval=2, desc="  - (Test)",
                          leave=False):
            hypotheses, _ = translator.translate_batch(*batch)
            for sentence_hyps in hypotheses:
                for idx_seq in sentence_hyps:
                    tokens = [test_loader.dataset.tgt_idx2word[idx]
                              for idx in idx_seq]
                    sink.write(" ".join(tokens) + "\n")
    print("[Info] Finished.")
Beispiel #8
0
def main():
    '''Translate a source file with Translator_idbs and save the results.

    Besides the usual model/source/vocabulary options, three diversity
    factors (lambda_1..lambda_3) are parsed and passed to the translator
    through the option namespace. Each sequence returned by
    ``translate_batch`` is written as one line of the output file.
    '''

    parser = argparse.ArgumentParser(description='translate.py')

    parser.add_argument('-model', required=True, help='Path to model .pt file')
    parser.add_argument(
        '-src',
        required=True,
        help='Source sequence to decode (one line per sequence)')
    parser.add_argument('-vocab',
                        required=True,
                        help='preprocess file to provide vocabulary')
    parser.add_argument('-output',
                        default='pred.txt',
                        help="""Path to output the predictions (each line will
                        be the decoded sequence""")
    parser.add_argument('-beam_size', type=int, default=5, help='Beam size')
    parser.add_argument('-batch_size', type=int, default=30, help='Batch size')
    parser.add_argument('-lambda_1',
                        type=float,
                        default=2 / 3,
                        help='diversity factor for hamming diversity')
    parser.add_argument('-lambda_2',
                        type=float,
                        default=2 / 3,
                        help='diversity factor for bi-gram diversity')
    parser.add_argument('-lambda_3',
                        type=float,
                        default=2 / 3,
                        help='diversity factor for tri-gram diversity')
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Prepare DataLoader
    preprocess_data = torch.load(opt.vocab)
    preprocess_settings = preprocess_data['settings']

    test_src_word_insts = read_instances_from_file(
        opt.src, preprocess_settings.max_word_seq_len,
        preprocess_settings.keep_case)

    test_src_insts = convert_instance_to_idx_seq(
        test_src_word_insts, preprocess_data['dict']['src'])

    test_data = DataLoader(preprocess_data['dict']['src'],
                           preprocess_data['dict']['tgt'],
                           src_insts=test_src_insts,
                           cuda=opt.cuda,
                           shuffle=False,
                           batch_size=opt.batch_size)

    translator = Translator_idbs(opt)
    translator.model.eval()

    print('[Info] Start translating...')
    # The context manager closes the file even if translation raises.
    with open(opt.output, 'w') as f:
        for batch in tqdm(test_data,
                          mininterval=2,
                          desc='  - (Test)',
                          leave=False):
            all_hyp = translator.translate_batch(batch)
            for idx_seq in all_hyp:
                # Convert the indices to words and join them into a line.
                pred_line = ' '.join(
                    [test_data.tgt_idx2word[idx] for idx in idx_seq])
                f.write(pred_line + '\n')
                # Flushed per line as in the original; presumably so partial
                # output can be inspected while decoding is still running.
                f.flush()
    print('[Info] Finished.')
Beispiel #9
0
def main():
    '''Translate a source file, save the cleaned text and print a BLEU score.

    Every hypothesis is kept twice: as a raw token list (used for scoring
    against the references read from -target) and as cleaned text (cut at
    the first "</s>", with "▁" replaced by a space) written to -output.
    '''

    parser = argparse.ArgumentParser(description='translate.py')

    parser.add_argument('-model', required=True, help='Path to model .pt file')
    parser.add_argument(
        '-src', required=True,
        help='Source sequence to decode (one line per sequence)')
    parser.add_argument(
        '-target', required=True,
        help='Target sequence to decode (one line per sequence)')
    parser.add_argument(
        '-vocab', required=True,
        help='Source sequence to decode (one line per sequence)')
    parser.add_argument(
        '-output', default='pred.txt',
        help="""Path to output the predictions (each line will
                        be the decoded sequence""")
    parser.add_argument('-beam_size', type=int, default=5, help='Beam size')
    parser.add_argument('-batch_size', type=int, default=30, help='Batch size')
    parser.add_argument(
        '-n_best', type=int, default=1,
        help="""If verbose is set, will output the n_best
                        decoded sentences""")
    parser.add_argument('-no_cuda', action='store_true')

    # Options that are only parsed here; they travel to the translator
    # through the namespace.
    parser.add_argument('-prune', action='store_true')
    parser.add_argument('-prune_alpha', type=float, default=0.1)
    parser.add_argument('-load_mask', type=str, default=None)

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Load vocabularies and preprocessing settings.
    preprocess_data = torch.load(opt.vocab)
    settings = preprocess_data['settings']
    max_len = settings.max_word_seq_len
    keep_case = settings.keep_case

    # References are read before the source, as in the original order.
    refs = read_instances_from_file(opt.target, max_len, keep_case)

    src_word_insts = read_instances_from_file(opt.src, max_len, keep_case)
    src_idx_insts = convert_instance_to_idx_seq(
        src_word_insts, preprocess_data['dict']['src'])

    dataset = TranslationDataset(
        src_word2idx=preprocess_data['dict']['src'],
        tgt_word2idx=preprocess_data['dict']['tgt'],
        src_insts=src_idx_insts,
    )
    test_loader = torch.utils.data.DataLoader(dataset,
                                              num_workers=2,
                                              batch_size=opt.batch_size,
                                              collate_fn=collate_fn)

    translator = Translator(opt)

    preds = []
    preds_text = []

    for batch in tqdm(test_loader, mininterval=2, desc='  - (Test)',
                      leave=False):
        all_hyp, _ = translator.translate_batch(*batch)
        for sentence_hyps in all_hyp:
            for idx_seq in sentence_hyps:
                tokens = [test_loader.dataset.tgt_idx2word[idx]
                          for idx in idx_seq]
                # Keep only the text before the first end-of-sentence
                # marker, then replace the "▁" marker with a space.
                text = ' '.join(tokens).split("</s>")[0].strip()
                text = text.replace("▁", " ")
                preds_text.append(text.strip())
                preds.append(tokens)

    with open(opt.output, 'w') as out_file:
        out_file.write('\n'.join(preds_text))

    from evaluator import BLEUEvaluator
    scorer = BLEUEvaluator()
    # Score only as many pairs as both lists provide.
    n_pairs = min(len(preds), len(refs))
    print(scorer.evaluate(refs[:n_pairs], preds[:n_pairs]))
Beispiel #10
0
def main():
    '''Decode a source file with a trained Seq2seq model, one line per input.

    The vocabulary/settings file and the model checkpoint are loaded, an
    encoder/decoder pair is rebuilt from the settings stored in the
    checkpoint, the trained weights are restored, and every source
    sequence is passed to ``Predictor.predict``; the returned tokens are
    joined with spaces and written to the output file.
    '''

    parser = argparse.ArgumentParser(description='sum_file.py')

    parser.add_argument('-model', required=True,
                        help='Path to model .pt file')
    parser.add_argument('-src', required=True,
                        help='Source sequence to decode (one line per sequence)')
    # The help text used to be a copy of the -src one; this option is the
    # preprocessed data file that is passed to torch.load below.
    parser.add_argument('-vocab', required=True,
                        help='Preprocessed data file providing the '
                             'vocabularies and settings')
    parser.add_argument('-output', default='pred.txt',
                        help="""Path to output the predictions (each line will
                        be the decoded sequence""")
    parser.add_argument('-beam_size', type=int, default=5,
                        help='Beam size')
    parser.add_argument('-batch_size', type=int, default=30,
                        help='Batch size')
    parser.add_argument('-n_best', type=int, default=1,
                        help="""If verbose is set, will output the n_best
                        decoded sentences""")
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Prepare the source instances
    preprocess_data = torch.load(opt.vocab)
    preprocess_settings = preprocess_data['settings']
    test_src_word_insts = read_instances_from_file(
        opt.src,
        preprocess_settings.max_word_seq_len,
        preprocess_settings.keep_case,
        preprocess_settings.mode)
    test_src_insts = convert_instance_to_idx_seq(
        test_src_word_insts, preprocess_data['dict']['src'])

    # Prepare the model
    device = torch.device('cuda' if opt.cuda else 'cpu')
    # map_location lets a checkpoint saved on a GPU be restored on the
    # selected device (e.g. when running with -no_cuda on a CPU-only host).
    checkpoint = torch.load(opt.model, map_location=device)
    model_opt = checkpoint['settings']

    # The encoder is always built bidirectional, whatever the checkpoint says.
    model_opt.bidirectional = True
    encoder = EncoderRNN(model_opt.src_vocab_size,
                         model_opt.max_token_seq_len,
                         model_opt.d_model,
                         bidirectional=model_opt.bidirectional,
                         variable_lengths=True)
    # A bidirectional encoder doubles the hidden size seen by the decoder.
    decoder_hidden_size = (model_opt.d_model * 2
                           if model_opt.bidirectional else model_opt.d_model)
    # NOTE(review): eos_id receives Constants.BOS and sos_id receives
    # Constants.EOS, which looks swapped. Left untouched because the
    # checkpoint may have been trained with the same setting -- confirm
    # against the training script before changing it.
    decoder = DecoderRNN(model_opt.tgt_vocab_size,
                         model_opt.max_token_seq_len,
                         decoder_hidden_size,
                         n_layers=model_opt.n_layer,
                         dropout_p=model_opt.dropout,
                         use_attention=True,
                         bidirectional=model_opt.bidirectional,
                         eos_id=Constants.BOS,
                         sos_id=Constants.EOS)
    model = Seq2seq(encoder, decoder).to(device)
    # Wrapped in DataParallel because training did the same; presumably the
    # saved state-dict keys only match the wrapped module.
    model = nn.DataParallel(model)

    model.load_state_dict(checkpoint['model'])
    print('[Info] Trained model state loaded.')

    predictor = Predictor(model, preprocess_data['dict']['tgt'])

    with open(opt.output, 'w') as f:
        for src_seq in tqdm(test_src_insts, mininterval=2,
                            desc='  - (Test)', leave=False):
            pred_line = ' '.join(predictor.predict(src_seq))
            f.write(pred_line + '\n')
    print('[Info] Finished.')
Beispiel #11
0
def main():
    '''Translate the sentences of -src and write the hypotheses to -output.

    The source sentences are converted to index sequences with the source
    vocabulary from the preprocessed data file, batched through a
    torch DataLoader and decoded by the Translator.
    '''

    parser = argparse.ArgumentParser(description='translate.py')

    parser.add_argument('-model', required=True,
                        help='Path to model .pt file')
    parser.add_argument('-src', required=True,
                        help='Source sequence to decode (one line per sequence)')
    parser.add_argument('-vocab', required=True,
                        help='Source sequence to decode (one line per sequence)')
    parser.add_argument('-output', default='pred.txt',
                        help="""Path to output the predictions (each line will
                        be the decoded sequence""")
    parser.add_argument('-beam_size', type=int, default=5,
                        help='Beam size')
    parser.add_argument('-batch_size', type=int, default=30,
                        help='Batch size')
    parser.add_argument('-n_best', type=int, default=1,
                        help="""If verbose is set, will output the n_best
                        decoded sentences""")
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Example of the parsed options, taken from a debugging session:
    #   Namespace(batch_size=30, beam_size=5, cuda=True,
    #       model='trained.chkpt', n_best=1, no_cuda=False,
    #       output='pred.txt', src='data/multi30k/test.en.atok',
    #       vocab='data/multi30k.atok.low.pt')

    # Vocabularies and preprocessing settings share one file.
    preprocess_data = torch.load(opt.vocab)
    settings = preprocess_data['settings']
    word2idx = preprocess_data['dict']

    raw_insts = read_instances_from_file(
        opt.src, settings.max_word_seq_len, settings.keep_case)
    indexed_insts = convert_instance_to_idx_seq(raw_insts, word2idx['src'])

    translation_set = TranslationDataset(src_word2idx=word2idx['src'],
                                         tgt_word2idx=word2idx['tgt'],
                                         src_insts=indexed_insts)
    test_loader = torch.utils.data.DataLoader(translation_set,
                                              num_workers=2,
                                              batch_size=opt.batch_size,
                                              collate_fn=collate_fn)

    translator = Translator(opt)

    with open(opt.output, 'w') as pred_file:
        for batch in tqdm(test_loader, mininterval=2, desc='  - (Test)',
                          leave=False):
            batch_hyps, _ = translator.translate_batch(*batch)
            for sentence_hyps in batch_hyps:
                for idx_seq in sentence_hyps:
                    # Map indices back to target-side words.
                    words = [test_loader.dataset.tgt_idx2word[idx]
                             for idx in idx_seq]
                    pred_file.write(' '.join(words) + '\n')
    print('[Info] Finished.')
Beispiel #12
0
def _pair_nonempty_insts(src_insts, tgt_insts, split_name):
    """Align two instance lists and drop pairs with an empty side.

    A warning is printed when the lists differ in length; the longer one is
    truncated (zip stops at the shorter list). Returns a ``(src, tgt)`` pair
    of tuples. Raises ValueError when no pair survives.
    """
    if len(src_insts) != len(tgt_insts):
        print('[Warning] The {} instance count is not equal.'.format(
            split_name))

    pairs = [(s, t) for s, t in zip(src_insts, tgt_insts) if s and t]
    if not pairs:
        raise ValueError(
            'No non-empty {} instance pairs found.'.format(split_name))

    src_kept, tgt_kept = zip(*pairs)
    return src_kept, tgt_kept


def prep(train_src,
         train_tgt,
         valid_src,
         valid_tgt,
         save_data,
         max_word_seq_len=50,
         min_word_count=5,
         keep_case=True,
         share_vocab=True,
         vocab=None):
    """Preprocess parallel corpora and save them with ``torch.save``.

    Args:
        train_src: Path of the training source file.
        train_tgt: Path of the training target file.
        valid_src: Path of the validation source file.
        valid_tgt: Path of the validation target file.
        save_data: Path the processed data is saved to.
        max_word_seq_len: Maximum sentence length passed to the reader.
        min_word_count: Minimum count passed to the vocabulary builder.
        keep_case: Case flag passed to the reader.
        share_vocab: Build one vocabulary for source and target when True.
        vocab: Optional path of a saved file whose ``'dict'`` entry supplies
            pre-defined vocabularies.

    Raises:
        ValueError: If a split has no non-empty source/target pair.
    """

    max_token_seq_len = max_word_seq_len + 2  # include the <s> and </s>

    # Training set
    train_src_word_insts, train_tgt_word_insts = _pair_nonempty_insts(
        prepro.read_instances_from_file(train_src, max_word_seq_len,
                                        keep_case),
        prepro.read_instances_from_file(train_tgt, max_word_seq_len,
                                        keep_case),
        'training')

    # Validation set (its instances are needed for the index conversion
    # below; previously they were never read).
    valid_src_word_insts, valid_tgt_word_insts = _pair_nonempty_insts(
        prepro.read_instances_from_file(valid_src, max_word_seq_len,
                                        keep_case),
        prepro.read_instances_from_file(valid_tgt, max_word_seq_len,
                                        keep_case),
        'validation')

    # Build vocabulary
    if vocab:
        predefined_data = torch.load(vocab)
        assert 'dict' in predefined_data

        print('[Info] Pre-defined vocabulary found.')
        src_word2idx = predefined_data['dict']['src']
        tgt_word2idx = predefined_data['dict']['tgt']
    else:
        if share_vocab:
            print('[Info] Build shared vocabulary for source and target.')
            word2idx = prepro.build_vocab_idx(
                train_src_word_insts + train_tgt_word_insts, min_word_count)
            src_word2idx = tgt_word2idx = word2idx
        else:
            print('[Info] Build vocabulary for source.')
            src_word2idx = prepro.build_vocab_idx(train_src_word_insts,
                                                  min_word_count)
            print('[Info] Build vocabulary for target.')
            tgt_word2idx = prepro.build_vocab_idx(train_tgt_word_insts,
                                                  min_word_count)

    # word to index
    print('[Info] Convert source word instances into sequences of word index.')
    train_src_insts = prepro.convert_instance_to_idx_seq(
        train_src_word_insts, src_word2idx)
    valid_src_insts = prepro.convert_instance_to_idx_seq(
        valid_src_word_insts, src_word2idx)

    print('[Info] Convert target word instances into sequences of word index.')
    train_tgt_insts = prepro.convert_instance_to_idx_seq(
        train_tgt_word_insts, tgt_word2idx)
    valid_tgt_insts = prepro.convert_instance_to_idx_seq(
        valid_tgt_word_insts, tgt_word2idx)

    data = {
        # NOTE: only the maximum token sequence length is stored under
        # 'settings', not a full options object.
        'settings': max_token_seq_len,
        'dict': {
            'src': src_word2idx,
            'tgt': tgt_word2idx
        },
        'train': {
            'src': train_src_insts,
            'tgt': train_tgt_insts
        },
        'valid': {
            'src': valid_src_insts,
            'tgt': valid_tgt_insts
        }
    }

    print('[Info] Dumping the processed data to pickle file', save_data)
    torch.save(data, save_data)
    print('[Info] Finish.')
Beispiel #13
0
def main():
    '''Translate a source file with a trained model and write the predictions.

    According to the original author's note, this model translates from
    English to German.

    Steps:
      1. Parse the command-line options.
      2. Load the preprocessed data file (``-vocab``) and dump both
         vocabularies to the files ``55`` (source) and ``66`` (target).
      3. Tokenize ``-src`` and map the tokens to vocabulary indices.
      4. Decode every batch with ``Translator`` and write one predicted
         sentence per line to ``-output``.
    '''

    parser = argparse.ArgumentParser(description='translate.py')

    # The defaults below used to be hard-coded assignments performed *after*
    # parse_args(), which silently discarded whatever the user passed.
    # Expressing them as defaults keeps the no-argument behaviour identical
    # while making the options effective again.
    parser.add_argument('-model', required=False, default='trained.chkpt',
                        help='Path to model .pt file')
    parser.add_argument('-src', required=False, default='1',
                        help='Source sequence to decode (one line per sequence)')
    parser.add_argument('-vocab', required=False,
                        default='multi30k.atok.low.pt',
                        help='Preprocessed data file that contains the '
                             'vocabularies and settings')
    parser.add_argument('-output', default='2',
                        help="""Path to output the predictions (each line will
                        be the decoded sequence""")
    parser.add_argument('-beam_size', type=int, default=5,
                        help='Beam size')
    parser.add_argument('-batch_size', type=int, default=30,
                        help='Batch size')
    parser.add_argument('-n_best', type=int, default=1,
                        help="""If verbose is set, will output the n_best
                        decoded sentences""")
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    # This variant always decodes on CPU; -no_cuda is still accepted so that
    # existing command lines keep working, but it has no effect here.
    opt.cuda = False

    # Prepare DataLoader
    preprocess_data = torch.load(opt.vocab)
    src_word2idx = preprocess_data['dict']['src']
    tgt_word2idx = preprocess_data['dict']['tgt']

    # Debug dumps of the two vocabularies. Both files are written as UTF-8 so
    # that non-ASCII tokens cannot fail on platforms whose default encoding is
    # not UTF-8.
    with open('55', 'w', encoding='utf-8') as f:
        f.write(str(src_word2idx))

    with open('66', 'w', encoding='utf-8') as f:
        f.write(str(tgt_word2idx))

    # Reuse the tokenization settings stored at preprocessing time.
    preprocess_settings = preprocess_data['settings']
    test_src_word_insts = read_instances_from_file(
        opt.src,
        preprocess_settings.max_word_seq_len,
        preprocess_settings.keep_case)
    test_src_insts = convert_instance_to_idx_seq(
        test_src_word_insts, src_word2idx)

    test_loader = torch.utils.data.DataLoader(
        TranslationDataset(
            src_word2idx=src_word2idx,
            tgt_word2idx=tgt_word2idx,
            src_insts=test_src_insts),
        num_workers=2,
        batch_size=opt.batch_size,
        collate_fn=collate_fn)

    translator = Translator(opt)

    # Predictions may contain non-ASCII characters, so the encoding is
    # fixed explicitly instead of relying on the platform default.
    with open(opt.output, 'w', encoding='utf-8') as f:
        for batch in test_loader:
            all_hyp, all_scores = translator.translate_batch(*batch)
            for idx_seqs in all_hyp:
                for idx_seq in idx_seqs:
                    print(idx_seq)
                    # Convert the predicted indices back to text.
                    pred_line = ' '.join(
                        [test_loader.dataset.tgt_idx2word[idx]
                         for idx in idx_seq])
                    f.write(pred_line + '\n')
    print('[Info] Finished.')
Beispiel #14
0
    parser.add_argument('-batch_size', type=int, default=30, help='Batch size')
    parser.add_argument('-n_best',
                        type=int,
                        default=1,
                        help="""If verbose is set, will output the n_best
                        decoded sentences""")
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Prepare DataLoader
    preprocess_data = torch.load(opt.vocab)
    preprocess_settings = preprocess_data['settings']
    test_src_word_insts = read_instances_from_file(
        opt.src, preprocess_settings.max_word_seq_len,
        preprocess_settings.keep_case)
    test_src_insts = convert_instance_to_idx_seq(
        test_src_word_insts, preprocess_data['dict']['src'])

    test_loader = torch.utils.data.DataLoader(TranslationDataset(
        src_word2idx=preprocess_data['dict']['src'],
        tgt_word2idx=preprocess_data['dict']['tgt'],
        src_insts=test_src_insts),
                                              num_workers=2,
                                              batch_size=opt.batch_size,
                                              collate_fn=collate_fn)

    translator = Translator(opt)

    with open(opt.output, 'w') as f:
Beispiel #15
0
def main():
    '''Decode image/source pairs with a trained model and write predictions.

    Parses the command-line options, loads the preprocessed data file
    (``-vocab``), reads the image list (``-img``) and the source sentences
    (``-src``), then decodes one example at a time with ``Translator`` and
    writes one UTF-8 encoded line per example to ``-output``. When the
    translator returns ``None`` for an example, the literal line ``None`` is
    written instead.
    '''

    parser = argparse.ArgumentParser(description='translate.py')

    parser.add_argument('-image_dir', required=True, help='image directory')
    parser.add_argument('-model', required=True, help='Path to model .pt file')
    parser.add_argument('-img',
                        required=True,
                        help='Source image to decode (one line per sequence)')
    parser.add_argument(
        '-src',
        required=True,
        help='Source sequence to decode (one line per sequence)')
    parser.add_argument(
        '-vocab',
        required=True,
        help='Preprocessed data file that contains the vocabularies and '
             'settings')
    parser.add_argument('-output',
                        default='pred.txt',
                        help="""Path to output the predictions (each line will
                        be the decoded sequence""")
    parser.add_argument('-beam_size', type=int, default=5, help='Beam size')
    # NOTE(review): -batch_size is parsed but the DataLoader below is built
    # with batch_size=1, so this option is not used in this function.
    parser.add_argument('-batch_size', type=int, default=30, help='Batch size')
    parser.add_argument('-n_best',
                        type=int,
                        default=1,
                        help="""If verbose is set, will output the n_best
                        decoded sentences""")
    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-crop_size',
                        type=int,
                        default=224,
                        help='size for randomly cropping images')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Prepare DataLoader
    preprocess_data = torch.load(opt.vocab)
    preprocess_settings = preprocess_data['settings']
    test_img_insts = read_instances_from_file(
        opt.img, preprocess_settings.max_word_seq_len)
    test_src_word_insts = read_instances_from_file(
        opt.src, preprocess_settings.max_word_seq_len)
    test_src_insts = convert_instance_to_idx_seq(
        test_src_word_insts, preprocess_data['dict']['src'])

    # Image Preprocessing
    # NOTE(review): RandomResizedCrop and RandomHorizontalFlip are stochastic,
    # so the decoded output may differ between runs - confirm that random
    # augmentation is really intended at test time.
    transform = transforms.Compose([
        transforms.RandomResizedCrop(opt.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    test_data = DataLoader(transform,
                           opt.image_dir,
                           preprocess_data['dict']['src'],
                           preprocess_data['dict']['tgt'],
                           image_insts=test_img_insts,
                           src_insts=test_src_insts,
                           cuda=opt.cuda,
                           shuffle=False,
                           batch_size=1)

    translator = Translator(opt)
    translator.model.eval()

    # Inverse target vocabulary: index -> word.
    inv_map = {v: k for k, v in preprocess_data['dict']['tgt'].items()}

    # The context manager guarantees the output file is flushed and closed
    # even if decoding raises part-way through.
    with open(opt.output, "wb") as target:
        for batch in tqdm(test_data, mininterval=2, desc='  - (Test)',
                          leave=False):
            seq = translator.translate_batch(batch)
            if seq is None:
                line = "None\n"
                target.write(line.encode("utf-8"))
                continue
            # Drop the first and last index of the decoded sequence
            # (presumably the begin/end-of-sentence markers - TODO confirm).
            seq = seq[1:-1]
            line = [inv_map[val] for val in seq]
            line = " ".join(line) + "\n"
            target.write(line.encode("utf-8"))
    print('[Info] Finished.')
Beispiel #16
0
def main():
    '''Translate a source file (optionally with context) and write predictions.

    Parses the command-line options, loads the preprocessed data file
    (``-vocab``), converts the source sentences (and, when ``-ctx`` is given,
    the context sentences) to index sequences, decodes every batch with
    ``Translator`` and writes one predicted sentence per line to ``-output``.
    '''

    parser = argparse.ArgumentParser(description='translate.py')

    parser.add_argument('-model',
                        default='trained.chkpt',
                        help='Path to model .pt file')
    parser.add_argument(
        '-src',
        default='data/multi30k/test.en.atok',
        help='Source sequence to decode (one line per sequence)')
    parser.add_argument(
        '-ctx',
        required=False,
        default="",
        help='Context sequence to decode (one line per sequence)')
    parser.add_argument('-vocab',
                        default='data/multi30k.atok.low.pt',
                        help='Data that contains the source vocabulary')
    parser.add_argument('-output',
                        default='pred.txt',
                        help="""Path to output the predictions (each line will
                        be the decoded sequence""")
    parser.add_argument('-beam_size', type=int, default=5, help='Beam size')
    parser.add_argument('-batch_size', type=int, default=30, help='Batch size')
    parser.add_argument('-n_best',
                        type=int,
                        default=1,
                        help="""If verbose is set, will output the n_best
                        decoded sentences""")
    # store_true: passing -no_cuda must *disable* CUDA. With store_false the
    # flag was inverted (CUDA off by default, switched on by -no_cuda).
    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-max_token_seq_len', type=int, default=100)

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # Prepare DataLoader
    preprocess_data = torch.load(opt.vocab)
    preprocess_settings = preprocess_data['settings']
    src_word2idx = preprocess_data['dict']['src']

    test_src_word_insts = read_instances_from_file(
        opt.src, opt.max_token_seq_len, preprocess_settings.keep_case)
    test_src_insts = convert_instance_to_idx_seq(test_src_word_insts,
                                                 src_word2idx)

    # Context sentences are optional; they are indexed with the source
    # vocabulary. Left as None when -ctx is not given.
    test_ctx_insts = None
    if opt.ctx:
        from preprocess_ctx import read_instances_from_file as read_instances_from_file_ctx
        test_ctx_word_insts = read_instances_from_file_ctx(
            opt.ctx,
            opt.max_token_seq_len,
            preprocess_settings.keep_case,
            is_ctx=True)
        test_ctx_insts = convert_instance_to_idx_seq(test_ctx_word_insts,
                                                     src_word2idx)

    test_data = DataLoader(src_word2idx,
                           preprocess_data['dict']['tgt'],
                           src_insts=test_src_insts,
                           ctx_insts=test_ctx_insts,
                           cuda=opt.cuda,
                           shuffle=False,
                           batch_size=opt.batch_size,
                           is_train=False)

    translator = Translator(opt)
    translator.model.eval()

    # Predictions may contain non-ASCII characters, so the encoding is fixed
    # explicitly instead of relying on the platform default.
    with open(opt.output, 'w', encoding='utf-8') as f:
        for batch in tqdm(test_data,
                          mininterval=2,
                          desc='  - (Test)',
                          leave=False):
            all_hyp, all_scores = translator.translate_batch(*batch)
            for idx_seqs in all_hyp:
                for idx_seq in idx_seqs:
                    # Guard against an empty hypothesis before peeking at
                    # its last element.
                    if len(idx_seq) > 0 and idx_seq[-1] == 3:  # if last word is EOS
                        idx_seq = idx_seq[:-1]
                    pred_line = ' '.join(
                        [test_data.tgt_idx2word[int(idx)] for idx in idx_seq])
                    f.write(pred_line + '\n')

    print('[Info] Finished.')