Example #1
def main():
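    # Build a throwaway parser just to collect default option values:
    # parse_known_args([]) parses an empty argv and returns (namespace, rest).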
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    opts.train_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    # engine = DBEngine(opt.db_file)

    with codecs.open(opt.source_file, "r", "utf-8") as corpus_file:
        sql_list = [json.loads(line)['sql'] for line in corpus_file]

    js_list = table.IO.read_anno_json(opt.anno)

    prev_best = (None, None)
    for fn_model in glob.glob(opt.model_path):
        print(fn_model)
        print(opt.anno)
        opt.model = fn_model

        translator = table.Translator(opt, dummy_opt.__dict__)
        data = table.IO.TableDataset(js_list, translator.fields, None, False)
        test_data = table.IO.OrderedIterator(dataset=data,
                                             device=opt.gpu,
                                             batch_size=opt.batch_size,
                                             train=False,
                                             sort=True,
                                             sort_within_batch=False)

        # inference
        r_list = []
        for batch in test_data:
            r_list += translator.translate(batch)
        r_list.sort(key=lambda x: x.idx)
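
The dummy-parser idiom above recurs in most of these examples. Here is a minimal sketch of just that idiom, assuming OpenNMT-py's opts module is importable (nothing below is taken verbatim from any example):

import argparse

import opts  # OpenNMT-py-style option registry (assumed available)


def default_train_opts():
    # Register every model/train option, then parse an empty argv so the
    # returned namespace carries only the registered defaults.
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    opts.train_opts(dummy_parser)
    return dummy_parser.parse_known_args([])[0]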
Example #2
def _get_parser():
    parser = ArgumentParser(description='train.py')
    # Construct config
    opts.config_opts(parser)
    opts.model_opts(parser)
    opts.train_opts(parser)
    return parser
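
A hypothetical invocation of _get_parser() (the flag names assume the standard OpenNMT-py train options; the values are made up):

parser = _get_parser()
# Parsing an explicit argv keeps the example independent of sys.argv.
opt = parser.parse_args(['-data', 'demo', '-save_model', 'demo-model'])
print(opt.save_model)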
Example #3
    def __init__(self, model, lang, gpu=False, wx=False):
        self.lang = lang
        self.is_ip_wx = wx
        parser = argparse.ArgumentParser(
            description='transliterate.py',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        opts.add_md_help_argument(parser)
        opts.translate_opts(parser)

        self.opt = parser.parse_args()
        self.trans_dict = dict()
        self.broken_words = dict()
        file_path = os.path.dirname(os.path.abspath(__file__))

        if self.lang == 'hin':
            self.to_utf = WXC(order='wx2utf', lang='hin')
            self.non_alpha = re.compile(u'([^a-zA-Z]+)')
            self.alpha_letters = set(string.ascii_letters)
            self.com_abbr = {
                'b': ['BI', 'be'],
                'd': ['xI', 'xe'],
                'g': ['jI'],
                'k': ['ke', 'ki', 'kI'],
                'h': ['hE', 'hEM'],
                'ha': ['hE', 'hEM'],
                'n': ['ina', 'ne'],
                'm': ['meM', 'mEM'],
                'p': ['pe'],
                'q': ['kyoM'],
                'r': ['Ora', 'ora'],
                's': ['isa', 'se'],
                'y': ['ye']
            }

        if self.lang == 'eng':
            self.non_alpha = re.compile(u'([^a-z]+)')
            self.alpha_letters = set(string.ascii_letters[:26])
            with open('%s/extras/COMMON_ABBR.eng' % file_path) as fp:
                self.com_abbr = {}
                for line in fp:
                    k, v = line.split()
                    self.com_abbr[k] = v.split('|')

        dummy_parser = argparse.ArgumentParser(description='train.py')
        opts.model_opts(dummy_parser)
        dummy_opt = dummy_parser.parse_known_args([])[0]
        if gpu:
            self.opt.gpu = 0

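        # A gpu id of -1 means CPU; any id >= 0 enables CUDA below.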
        self.opt.cuda = self.opt.gpu > -1
        self.opt.model = model
        self.opt.n_best = 5
        self.opt.lang = lang
        if self.opt.cuda:
            torch.cuda.set_device(self.opt.gpu)

        # Load the model.
        self.fields, self.model, self.model_opt = onmt.ModelConstructor.load_test_model(
            self.opt, dummy_opt.__dict__)
Example #4
def main():
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # Load the model.
    fields, model, model_opt = \
        onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__)

    # File to write scores to.
    out_file = codecs.open(opt.output, 'w', 'utf-8')

    # Test data
    data = onmt.io.build_dataset(
        fields,
        opt.data_type,
        opt.src,
        opt.tgt,
        src_dir=opt.src_dir,
        sample_rate=opt.sample_rate,
        window_size=opt.window_size,
        window_stride=opt.window_stride,
        window=opt.window,
        use_filter_pred=False,
        symbol_representation=opt.symbol_representation,
        revert_targets=opt.revert_targets)

    # Sort batch by decreasing lengths of sentence required by pytorch.
    # sort=False means "Use dataset's sortkey instead of iterator's".
    data_iter = onmt.io.OrderedIterator(dataset=data,
                                        device=opt.gpu,
                                        batch_size=opt.batch_size,
                                        train=False,
                                        sort=False,
                                        sort_within_batch=True,
                                        shuffle=False)

    # Evaluator
    scorer = onmt.translate.GNMTGlobalScorer(opt.alpha, opt.beta)
    evaluator = onmt.translate.Evaluator(model,
                                         fields,
                                         scorer,
                                         copy_attn=model_opt.copy_attn,
                                         cuda=opt.cuda)

    # Statistics
    #counter = count(1)
    #score_total, words_total = 0, 0

    for batch in data_iter:
        scores = evaluator.evaluate_batch(batch, data)
        for score in scores:
            out_file.write(str(score))
            out_file.write('\n')
            out_file.flush()
Example #5
def main():
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    opts.train_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    engine = DBEngine(opt.db_file)

    with codecs.open(opt.source_file, "r", "utf-8") as corpus_file:
        sql_list = [json.loads(line)['sql'] for line in corpus_file]

    js_list = table.IO.read_anno_json(opt.anno)

    prev_best = (None, None)
    for fn_model in glob.glob(opt.model_path):

        opt.model = fn_model

        translator = Translator(opt, dummy_opt.__dict__)
        data = table.IO.TableDataset(js_list, translator.fields, None, False)
        test_data = table.IO.OrderedIterator(dataset=data,
                                             device=opt.gpu,
                                             batch_size=opt.batch_size,
                                             train=False,
                                             sort=True,
                                             sort_within_batch=False)

        # inference
        if opt.beam_search:
            print('Using execution guidance for inference.')
        r_list = []

        for batch in test_data:
            r_list += translator.translate(batch, js_list, sql_list)

        r_list.sort(key=lambda x: x.idx)

        assert len(r_list) == len(
            js_list), 'len(r_list) != len(js_list): {} != {}'.format(
                len(r_list), len(js_list))

        # evaluation
        for pred, gold, sql_gold in zip(r_list, js_list, sql_list):
            pred.eval(gold, sql_gold, engine)
        print('Results:')
        for metric_name in ('all', 'exe'):
            c_correct = sum((x.correct[metric_name] for x in r_list))
            print('{}: {} / {} = {:.2%}'.format(metric_name, c_correct,
                                                len(r_list),
                                                c_correct / len(r_list)))
            if metric_name == 'all' and (prev_best[0] is None
                                         or c_correct > prev_best[1]):
                prev_best = (fn_model, c_correct)

    if (opt.split == 'dev') and (prev_best[0] is not None):
        with codecs.open(os.path.join(opt.data_path, 'dev_best.txt'),
                         'w',
                         encoding='utf-8') as f_out:
            f_out.write('{}\n'.format(prev_best[0]))
Example #6
def main():
    parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(parser)
    opts.train_opts(parser)
    opts.data_opts(parser)
    opts.score_opts(parser)
    add_md_help_argument(parser)
    options = parser.parse_args()
Example #7
File: svr.py Project: marcwww/LL
def load_opt():
    parser = argparse.ArgumentParser(
        description='main.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    opts.model_opts(parser)
    opts.train_opts(parser)
    opt = parser.parse_args()
    return opt
Example #8
def main(anno_file_name, col_headers, raw_args=None, verbose=True):
    parser = argparse.ArgumentParser(description='evaluate.py')
    opts.translate_opts(parser)
    opt = parser.parse_args(raw_args)
    torch.cuda.set_device(opt.gpu)
    opt.db_file = os.path.join(opt.data_path, '{}.db'.format(opt.split))
    opt.pre_word_vecs = os.path.join(opt.data_path, 'embedding')
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    opts.train_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]
    opt.anno = anno_file_name

    engine = DBEngine(opt.db_file)

    js_list = table.IO.read_anno_json(opt.anno)

    prev_best = (None, None)
    sql_query = []
    for fn_model in glob.glob(opt.model_path):

        opt.model = fn_model

        translator = Translator(opt, dummy_opt.__dict__)
        data = table.IO.TableDataset(js_list, translator.fields, None, False)
        test_data = table.IO.OrderedIterator(dataset=data,
                                             device=opt.gpu,
                                             batch_size=opt.batch_size,
                                             train=False,
                                             sort=True,
                                             sort_within_batch=False)

        # inference
        r_list = []
        for batch in test_data:
            r_list += translator.translate(batch)
        r_list.sort(key=lambda x: x.idx)
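        # Results are sorted by example index; the last entry corresponds to
        # the final example in the annotation file.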
        pred = r_list[-1]
        sql_pred = {
            'agg': pred.agg,
            'sel': pred.sel,
            'conds': pred.recover_cond_to_gloss(js_list[-1])
        }
        if verbose:
            print('\n sql_pred: ', sql_pred, '\n')
            print('\n col_headers: ', col_headers, '\n')
        sql_query = Query(sql_pred['sel'], sql_pred['agg'], sql_pred['conds'])
        try:
            ans_pred = engine.execute_query(js_list[-1]['table_id'],
                                            Query.from_dict(sql_pred),
                                            lower=True,
                                            verbose=verbose)
        except Exception as e:
            ans_pred = None
    return sql_query.get_complete_query(col_headers), ans_pred
Example #9
def main():
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    opts.train_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    js_list = table.IO.read_anno_json(opt.anno, opt)

    metric_name_list = ['tgt']
    prev_best = (None, None)
    for fn_model in glob.glob(opt.model_path):
        opt.model = fn_model
        print(fn_model)
        print(opt.anno)

        translator = table.Translator(opt, dummy_opt.__dict__)
        data = table.IO.TableDataset(js_list, translator.fields, 0, None,
                                     False)
        test_data = table.IO.OrderedIterator(dataset=data,
                                             device=opt.gpu,
                                             batch_size=opt.batch_size,
                                             train=False,
                                             sort=True,
                                             sort_within_batch=False)

        # inference
        r_list = []
        for batch in test_data:
            r = translator.translate(batch)
            r_list += r
        r_list.sort(key=lambda x: x.idx)
        assert len(r_list) == len(
            js_list), 'len(r_list) != len(js_list): {} != {}'.format(
                len(r_list), len(js_list))

        # evaluation
        for pred, gold in zip(r_list, js_list):
            pred.eval(gold)
        print('Results:')
        for metric_name in metric_name_list:
            c_correct = sum((x.correct[metric_name] for x in r_list))
            acc = c_correct / len(r_list)
            print('{}: {} / {} = {:.2%}'.format(metric_name, c_correct,
                                                len(r_list), acc))
            if metric_name == 'tgt' and (prev_best[0] is None
                                         or acc > prev_best[1]):
                prev_best = (fn_model, acc)

    if (opt.split == 'dev') and (prev_best[0] is not None):
        with codecs.open(os.path.join(opt.root_dir, opt.dataset,
                                      'dev_best.txt'),
                         'w',
                         encoding='utf-8') as f_out:
            f_out.write('{}\n'.format(prev_best[0]))
Example #10
def main():
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    opts.train_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    js_list = table.IO.read_anno_json(opt.anno, opt)
    # metric_name_list = ['tgt']
    prev_best = (None, None)
    # print(opt.model_path)
    for fn_model in glob.glob(opt.model_path):
        opt.model = fn_model
        print(fn_model)
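        # Inference only: no_grad avoids building autograd graphs.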
        with torch.no_grad():
            translator = table.Translator(opt, dummy_opt.__dict__)
            data = table.IO.TableDataset(js_list, translator.fields, 0, None,
                                         False)
            test_data = table.IO.OrderedIterator(dataset=data,
                                                 device=opt.gpu,
                                                 batch_size=opt.batch_size,
                                                 train=False,
                                                 sort=True,
                                                 sort_within_batch=False)
            # inference
            r_list = []
            for batch in test_data:
                r = translator.translate(batch)
                r_list += r

        r_list.sort(key=lambda x: x.idx)
        assert len(r_list) == len(
            js_list), 'len(r_list) != len(js_list): {} != {}'.format(
                len(r_list), len(js_list))

        metric, _ = com_metric(js_list, r_list)
    if opt.split == 'test':
        ref_dic, pre_dict = effect_len(js_list, r_list)
        for i in range(len(ref_dic)):
            js_list = ref_dic[i]
            r_list = pre_dict[i]
            print("the effect of length {}".format(i))
            metric, _ = com_metric(js_list, r_list)

        if prev_best[0] is None or float(metric['Bleu_1']) > prev_best[1]:
            prev_best = (fn_model, metric['Bleu_1'])

    if (opt.split == 'dev') and (prev_best[0] is not None):
        with codecs.open(os.path.join(opt.root_dir, opt.dataset,
                                      'dev_best.txt'),
                         'w',
                         encoding='utf-8') as f_out:
            f_out.write('{}\n'.format(prev_best[0]))
Example #11
def parse_args():
    parser = argparse.ArgumentParser(
        description='umt.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    opts.add_md_help_argument(parser)
    opts.model_opts(parser)
    opts.preprocess_opts(parser)
    opts.train_opts(parser)

    opt = parser.parse_args()
    torch.manual_seed(opt.seed)

    if opt.word_vec_size != -1:
        opt.src_word_vec_size = opt.word_vec_size
        opt.tgt_word_vec_size = opt.word_vec_size

    if opt.layers != -1:
        opt.enc_layers = opt.layers
        opt.dec_layers = opt.layers

    opt.brnn = (opt.encoder_type == "brnn")

    # if opt.seed > 0:
    random.seed(opt.seed)
    torch.manual_seed(opt.seed)

    if torch.cuda.is_available() and not opt.gpuid:
        print("WARNING: You have a CUDA device, should run with -gpuid 0")

    if opt.gpuid:
        cuda.set_device(opt.gpuid[0])
        if opt.seed > 0:
            torch.cuda.manual_seed(opt.seed)

    if len(opt.gpuid) > 1:
        sys.stderr.write("Sorry, multigpu isn't supported yet, coming soon!\n")
        sys.exit(1)

    # Set up the Crayon logging server.
    if opt.exp_host != "":
        from pycrayon import CrayonClient

        cc = CrayonClient(hostname=opt.exp_host)

        experiments = cc.get_experiment_names()
        print(experiments)
        if opt.exp in experiments:
            cc.remove_experiment(opt.exp)

    return opt
Example #12
def main():
    parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(parser)
    opts.train_opts(parser)
    opts.data_opts(parser)
    opts.score_opts(parser)
    options = parser.parse_args()

    print(options)

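    # Persist the parsed options alongside the model so the run can be
    # reproduced from the saved argument pickle.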
    argfile = options.save_model + '_arg.p'

    print('Saving arguments in ' + argfile)
    pickle.dump(options, open(argfile, "wb"))

    train(options)
Example #13
    def __init__(
            self,
            modelfile='/data1/struct2text/s2s_models_v3/model_softmax_acc_97.30_ppl_1.41_e7.pt',
            dynamic_dict=True,
            attn_debug=True,
            share_vocab=True,
            replace_unk=True,
            verbose=True):
        #def __init__(self, modelfile='/data1/struct2text/s2s_models_v3/model_softmax_acc_78.18_ppl_9.60_e4.pt'):
        #def __init__(self, modelfile='/data1/struct2text/s2s_models_v3/model_softmax_acc_82.37_ppl_6.28_e8.pt'):
        #def __init__(self, modelfile='/data1/data1/Anirban/structure2text/model_softmax_1_acc_84.10_ppl_2.13_e1.pt'):
        print('Loading ' + modelfile)
        parser = argparse.ArgumentParser(
            description='seq2seq_predict',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        opts.add_md_help_argument(parser)
        opts.translate_opts(parser)
        #opt = parser.parse_args()
        opt, unknown = parser.parse_known_args()
        print('Unknown arguments ', unknown)
        opt.dynamic_dict = dynamic_dict
        opt.attn_debug = attn_debug
        opt.share_vocab = share_vocab
        opt.replace_unk = replace_unk
        opt.verbose = verbose
        dummy_parser = argparse.ArgumentParser(description='train.py')
        opts.model_opts(dummy_parser)
        dummy_opt = dummy_parser.parse_known_args([])[0]

        opt.cuda = opt.gpu > -1
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)

        opt.src = 'temp_seq2seq_pred_%f.txt' % time.time()
        opt.model = modelfile

        print('Loading seq2seq model...')
        # Load the model.
        fields, model, model_opt = \
            onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__)

        self.opt = opt
        self.fields = fields
        self.model = model
        self.model_opt = model_opt
Example #14
def main():
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]
    opt = parser.parse_args()
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # Add in default model arguments, possibly added since training.
    checkpoint = torch.load(opt.model,
                            map_location=lambda storage, loc: storage)
    model_opt = checkpoint['opt']
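    # checkpoint['vocab'] holds (name, vocab) pairs; this checkpoint layout
    # puts the source vocab at index 1 and the target vocab at index 0.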
    src_dict = checkpoint['vocab'][1][1]
    tgt_dict = checkpoint['vocab'][0][1]

    fields = onmt.IO.load_fields(checkpoint['vocab'])

    model_opt = checkpoint['opt']
    for arg in dummy_opt.__dict__:
        if arg not in model_opt:
            model_opt.__dict__[arg] = dummy_opt.__dict__[arg]

    model = onmt.ModelConstructor.make_base_model(model_opt, fields,
                                                  use_gpu(opt), checkpoint)
    encoder = model.encoder
    decoder = model.decoder

    encoder_embeddings = encoder.embeddings.word_lut.weight.data.tolist()
    decoder_embeddings = decoder.embeddings.word_lut.weight.data.tolist()

    print("Writing source embeddings")
    write_embeddings(opt.output_dir + "/src_embeddings.txt", src_dict,
                     encoder_embeddings)

    print("Writing target embeddings")
    write_embeddings(opt.output_dir + "/tgt_embeddings.txt", tgt_dict,
                     decoder_embeddings)

    print('... done.')
    print('Converting model...')
Example #15
def main():
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]
    opt = parser.parse_args()
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # Add in default model arguments, possibly added since training.
    checkpoint = torch.load(opt.model,
                            map_location=lambda storage, loc: storage)
    model_opt = checkpoint['opt']
    src_dict = checkpoint['vocab'][1][1]
    tgt_dict = checkpoint['vocab'][0][1]

    fields = onmt.io.load_fields_from_vocab(checkpoint['vocab'])

    model_opt = checkpoint['opt']
    for arg in dummy_opt.__dict__:
        if arg not in model_opt:
            model_opt.__dict__[arg] = dummy_opt.__dict__[arg]

    model = onmt.ModelConstructor.make_base_model(
                            model_opt, fields, use_gpu(opt), checkpoint)
    encoder = model.encoder
    decoder = model.decoder

    encoder_embeddings = encoder.embeddings.word_lut.weight.data.tolist()
    decoder_embeddings = decoder.embeddings.word_lut.weight.data.tolist()

    print("Writing source embeddings")
    write_embeddings(opt.output_dir + "/src_embeddings.txt", src_dict,
                     encoder_embeddings)

    print("Writing target embeddings")
    write_embeddings(opt.output_dir + "/tgt_embeddings.txt", tgt_dict,
                     decoder_embeddings)

    print('... done.')
    print('Converting model...')
Example #16
def main():
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # N = 1
    # M = 1
    # src_file = open(opt.train_src, 'r')
    # trg_file = open(opt.train_tgt, 'r')
    #
    # align_file = open(opt.train_align, 'r')
    #
    # src_lines = src_file.readlines()
    # trg_lines = trg_file.readlines()
    # align_lines = align_file.readlines()
    #
    # align = OrderedDict()
    # pool = Pool()
    # result = []
    # for i in range(10000):
    #     result.append(pool.apply_async(func, args=(
    #     i, N, M, src_lines[125 * i:125 * (i + 1)], trg_lines[125 * i:125 * (i + 1)],
    #     align_lines[125 * i:125 * (i + 1)])))
    # pool.close()
    # pool.join()
    #
    # for i in result:
    #     ddict = i.get()
    #     for k, v in ddict.items():
    #         if k not in align:
    #             align[k] = v
    #         else:
    #             align[k] = v + align[k]
    #
    # align_sorted = sorted(align.items(), lambda x, y: cmp(x[1], y[1]), reverse=True)
    # print(len(align_sorted))
    #
    # k = 0
    # lists = OrderedDict()
    # for align in align_sorted:
    #     pairs = eval(align[0])
    #     y = pairs[0]
    #     ngram = pairs[2]
    #     context = pairs[3]
    #     if align[1] > 0:
    #         if str(ngram) not in lists:
    #             value = OrderedDict()
    #             xy = OrderedDict()
    #             xy[y] = align[1]
    #             value[str(context)] = xy
    #             lists[str(ngram)] = value
    #         else:
    #             if str(context) not in lists[str(ngram)]:
    #                 xy = OrderedDict()
    #                 xy[y] = align[1]
    #                 lists[str(ngram)][str(context)] = xy
    #             else:
    #                 if y not in lists[str(ngram)][str(context)]:
    #                     lists[str(ngram)][str(context)][y] = align[1]
    #                 else:
    #                     lists[str(ngram)][str(context)][y] = align[1] + lists[str(ngram)][str(context)][y]
    #         k += 1
    # print(k)
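    # Load the precomputed alignment lists (built offline, roughly as in the
    # commented-out block above) from the pickle file given by opt.lists.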
    pkl_file = open(opt.lists, 'rb')
    lists = pickle.load(pkl_file)
    pkl_file.close()

    # Load the model.
    fields, model, model_opt = \
        onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__)

    # File to write sentences to.
    out_file = codecs.open(opt.output, 'w', 'utf-8')

    # Test data
    data = onmt.io.build_dataset(fields,
                                 opt.data_type,
                                 opt.src,
                                 opt.tgt,
                                 src_dir=opt.src_dir,
                                 sample_rate=opt.sample_rate,
                                 window_size=opt.window_size,
                                 window_stride=opt.window_stride,
                                 window=opt.window,
                                 use_filter_pred=False)

    # Sort batch by decreasing lengths of sentence required by pytorch.
    # sort=False means "Use dataset's sortkey instead of iterator's".
    data_iter = onmt.io.TestOrderedIterator(dataset=data,
                                            device=opt.gpu,
                                            batch_size=opt.batch_size,
                                            train=False,
                                            sort=False,
                                            sort_within_batch=True,
                                            shuffle=False)

    # Translator
    scorer = onmt.translate.GNMTGlobalScorer(opt.alpha, opt.beta,
                                             opt.coverage_penalty,
                                             opt.length_penalty)
    translator = onmt.translate.Translator(
        model,
        fields,
        beam_size=opt.beam_size,
        n_best=opt.n_best,
        global_scorer=scorer,
        max_length=opt.max_length,
        copy_attn=model_opt.copy_attn,
        cuda=opt.cuda,
        beam_trace=opt.dump_beam != "",
        min_length=opt.min_length,
        stepwise_penalty=opt.stepwise_penalty,
        block_ngram_repeat=opt.block_ngram_repeat,
        ignore_when_blocking=opt.ignore_when_blocking)
    builder = onmt.translate.TranslationBuilder(data, translator.fields,
                                                opt.n_best, opt.replace_unk,
                                                opt.tgt)

    # Statistics
    counter = count(1)
    pred_score_total, pred_words_total = 0, 0
    gold_score_total, gold_words_total = 0, 0

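    # This custom iterator yields (batch, word_batch) pairs; both are handed,
    # together with the pickled lists, to translate_batch below.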
    for batch, word_batch in data_iter:
        batch_data = translator.translate_batch(batch, word_batch, data, lists)
        translations = builder.from_batch(batch_data)

        for trans in translations:
            pred_score_total += trans.pred_scores[0]
            pred_words_total += len(trans.pred_sents[0])
            if opt.tgt:
                gold_score_total += trans.gold_score
                gold_words_total += len(trans.gold_sent) + 1

            n_best_preds = [
                " ".join(pred) for pred in trans.pred_sents[:opt.n_best]
            ]
            out_file.write('\n'.join(n_best_preds))
            out_file.write('\n')
            out_file.flush()

            if opt.verbose:
                sent_number = next(counter)
                output = trans.log(sent_number)
                os.write(1, output.encode('utf-8'))

    _report_score('PRED', pred_score_total, pred_words_total)
    if opt.tgt:
        _report_score('GOLD', gold_score_total, gold_words_total)
        if opt.report_bleu:
            _report_bleu()
        if opt.report_rouge:
            _report_rouge()

    if opt.dump_beam:
        import json
        json.dump(translator.beam_accum,
                  codecs.open(opt.dump_beam, 'w', 'utf-8'))
Example #17
    return predictions


@app.route('/translate', methods=['POST'])
def config():
    req = request.get_json()
    res = []
    for s in req:
        res.append(translate(s))
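    # sum(res, []) flattens the per-request lists of translations into one list.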
    return jsonify(sum(res, []))


if __name__ == '__main__':
    opt = parser.parse_args()

    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    for model in opt.model:
        print("Loading model... " + model)
        modelopt = copy.copy(opt)
        modelopt.model = model
        hash = hash_byname(model)
        translators[hash] = onmt.Translator(modelopt, dummy_opt.__dict__)

    app.run(debug=False,  host='0.0.0.0', port=8092)
Example #18
def main():
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # Load the model.
    fields, model, model_opt = \
        onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__, stage1=True)

    model2 = None
    if opt.model2 is not None:
        fields2, model2, model_opt2 = \
            onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__, stage1=False)

    # File to write sentences to.
    out_file = codecs.open(opt.output, 'w', 'utf-8')

    # Test data
    data = onmt.io.build_dataset(fields,
                                 opt.data_type,
                                 opt.src1,
                                 opt.tgt1,
                                 opt.src2,
                                 opt.tgt2,
                                 src_dir=opt.src_dir,
                                 sample_rate=opt.sample_rate,
                                 window_size=opt.window_size,
                                 window_stride=opt.window_stride,
                                 window=opt.window,
                                 use_filter_pred=False)

    def sort_minibatch_key(ex):
        """ Sort using length of source sentences and length of target sentence """
        #Needed for packed sequence
        if hasattr(ex, "tgt1"):
            return len(ex.src1), len(ex.tgt1)
        return len(ex.src1)

    # Sort batch by decreasing lengths of sentence required by pytorch.
    # sort=False means "Use dataset's sortkey instead of iterator's".
    data_iter = onmt.io.OrderedIterator(dataset=data,
                                        device=opt.gpu,
                                        batch_size=opt.batch_size,
                                        train=False,
                                        sort=False,
                                        sort_key=sort_minibatch_key,
                                        sort_within_batch=True,
                                        shuffle=False)

    # Translator
    scorer = onmt.translate.GNMTGlobalScorer(opt.alpha, opt.beta,
                                             opt.coverage_penalty,
                                             opt.length_penalty)
    tgt_plan_map = None

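    # Map stage-1 vocabulary entries back to integers: the first four ids are
    # copied through unchanged (special tokens); every later entry is parsed
    # as a string-encoded integer.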
    if opt.src2 is None:
        tgt_plan_map = {}
        for j, entry in enumerate(fields["tgt1"].vocab.itos):
            if j < 4:
                tgt_plan_map[j] = j
            else:
                tgt_plan_map[j] = int(entry)
    translator = onmt.translate.Translator(
        model,
        model2,
        fields,
        beam_size=opt.beam_size,
        n_best=opt.n_best,
        global_scorer=scorer,
        max_length=opt.max_length,
        copy_attn=model_opt.copy_attn and tgt_plan_map is None,
        cuda=opt.cuda,
        beam_trace=opt.dump_beam != "",
        min_length=opt.min_length,
        stepwise_penalty=opt.stepwise_penalty)
    builder = onmt.translate.TranslationBuilder(data,
                                                translator.fields,
                                                opt.n_best,
                                                opt.replace_unk,
                                                has_tgt=False)

    # Statistics
    counter = count(1)
    pred_score_total, pred_words_total = 0, 0
    gold_score_total, gold_words_total = 0, 0
    stage1 = opt.stage1
    for batch in tqdm(data_iter):
        # NOTE
        batch_data = translator.translate_batch(batch, data, stage1)
        translations = builder.from_batch(batch_data, stage1)

        for trans in translations:
            pred_score_total += trans.pred_scores[0]
            pred_words_total += len(trans.pred_sents[0])
            if opt.tgt2:
                gold_score_total += trans.gold_score
                gold_words_total += len(trans.gold_sent)

            if stage1:
                n_best_preds = [
                    " ".join([str(entry) for entry in pred])
                    for pred in trans.pred_sents[:opt.n_best]
                ]
            else:
                n_best_preds = [
                    " ".join(pred) for pred in trans.pred_sents[:opt.n_best]
                ]
            out_file.write('\n'.join(n_best_preds))
            out_file.write('\n')
            out_file.flush()

            if opt.verbose:
                sent_number = next(counter)
                output = trans.log(sent_number)
                os.write(1, output.encode('utf-8'))

    _report_score('PRED', pred_score_total, pred_words_total)
    if opt.tgt2:
        _report_score('GOLD', gold_score_total, gold_words_total)
        if opt.report_bleu:
            _report_bleu()
        if opt.report_rouge:
            _report_rouge()

    if opt.dump_beam:
        import json
        json.dump(translator.beam_accum,
                  codecs.open(opt.dump_beam, 'w', 'utf-8'))
Example #19
def main():
    previous_words = None
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]
    print('dummy_opt: ', dummy_opt)

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    translator = onmt.Translator(opt, dummy_opt.__dict__)
    out_file = codecs.open(opt.output, 'w', 'utf-8')
    pred_score_total, pred_words_total = 0, 0
    gold_score_total, gold_words_total = 0, 0
    if opt.dump_beam != "":
        import json
        translator.initBeamAccum()
    data = onmt.IO.ONMTDataset(opt.src,
                               opt.tgt,
                               translator.fields,
                               use_filter_pred=False)

    test_data = onmt.IO.OrderedIterator(dataset=data,
                                        device=opt.gpu,
                                        batch_size=opt.batch_size,
                                        train=False,
                                        sort=False,
                                        shuffle=False)

    counter = count(1)
    for batch in test_data:
        pred_batch, gold_batch, pred_scores, gold_scores, attn, src \
            = translator.translate(batch, data)
        pred_score_total += sum(score[0] for score in pred_scores)
        pred_words_total += sum(len(x[0]) for x in pred_batch)
        if opt.tgt:
            gold_score_total += sum(gold_scores)
            gold_words_total += sum(len(x) for x in batch.tgt[1:])

        #davidstap
        #_, src_lengths = batch.src
        #encStates, context = translator.model.encoder(src, src_lengths)

        # z_batch: an iterator over the predictions, their scores,
        # the gold sentence, its score, and the source sentence for each
        # sentence in the batch. It has to be zip_longest instead of
        # plain-old zip because the gold_batch has length 0 if the target
        # is not included.
        z_batch = zip_longest(pred_batch, gold_batch, pred_scores, gold_scores,
                              (sent.squeeze(1)
                               for sent in src.split(1, dim=1)))

        for pred_sents, gold_sent, pred_score, gold_score, src_sent in z_batch:
            # src_sent is torch.LongTensor
            #print('type src_sent:',type(src_sent))
            n_best_preds = [" ".join(pred) for pred in pred_sents[:opt.n_best]]
            out_file.write('\n'.join(n_best_preds))
            out_file.write('\n')
            out_file.flush()

            if opt.verbose:
                sent_number = next(counter)
                words = get_src_words(src_sent,
                                      translator.fields["src"].vocab.itos)

                if previous_words is not None:

                    print('BLEU: ', sentence_bleu([words], previous_words))
                    print()
                    print('S1:', words)
                    print('S2:', previous_words)

                # os.write(1, bytes('\nSENT %d: %s\n' %
                #                   (sent_number, words), 'UTF-8'))

                previous_words = words

                best_pred = n_best_preds[0]

                #TODO: calculate BLEU score reference (best_pred) and hypothesis (words)
                #TODO: calculate cosine_similarity (best_pred) and hypothesis (words)
                #bleu_score = sentence_bleu(best_pred, words)
                #print('BLEU: ',bleu_score)

                best_score = pred_score[0]
                #os.write(1, bytes('PRED %d: %s\n' %
                #                      (sent_number, best_pred), 'UTF-8'))
                #print("PRED SCORE: %.4f" % best_score)

                # 'words' = input sentence
                # 'best_pred' = prediction

                # put source sentence in translator.model.encoder to find context
                # maybe change data type src? torchtext datatype?

                #model = NMTModel(encoder, decoder) (see ModelConstructor)
                src_lengths = len(words.split())

                # src(FloatTensor): a sequence of source tensors with
                #         optional feature tensors of size (len x batch).
                # tgt(FloatTensor): a sequence of target tensors with
                #         optional feature tensors of size (len x batch).
                # lengths([int]): an array of the src length.
                # dec_state: A decoder state object

                #hidden, context = translator.model.encoder(src_sent, src_lengths)

                #euc_dist(context_r, context_pred)

                if opt.tgt:
                    tgt_sent = ' '.join(gold_sent)
                    os.write(
                        1,
                        bytes('GOLD %d: %s\n' % (sent_number, tgt_sent),
                              'UTF-8'))
                    print("GOLD SCORE: %.4f" % gold_score)

                if len(n_best_preds) > 1:
                    print('\nBEST HYP:')
                    for score, sent in zip(pred_score, n_best_preds):
                        os.write(1,
                                 bytes("[%.4f] %s\n" % (score, sent), 'UTF-8'))

    report_score('PRED', pred_score_total, pred_words_total)
    if opt.tgt:
        report_score('GOLD', gold_score_total, gold_words_total)

    if opt.dump_beam:
        json.dump(translator.beam_accum,
                  codecs.open(opt.dump_beam, 'w', 'utf-8'))
Example #20
def main():
    opt = parser.parse_args()

    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    translator = onmt.Translator(opt, dummy_opt.__dict__)
    outF = codecs.open(opt.output, 'w', 'utf-8')
    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0
    srcBatch, tgtBatch = [], []
    count = 0
    if opt.dump_beam != "":
        import json
        translator.initBeamAccum()

    data = onmt.IO.ONMTDataset(opt.src, opt.tgt, translator.fields, None)

    testData = onmt.IO.OrderedIterator(
        dataset=data, device=opt.gpu,
        batch_size=opt.batch_size, train=False, sort=False,
        shuffle=False)

    index = 0
    for batch in testData:
        predBatch, predScore, goldScore, attn, src \
            = translator.translate(batch, data)
        predScoreTotal += sum(score[0] for score in predScore)
        predWordsTotal += sum(len(x[0]) for x in predBatch)
        if opt.tgt:
            goldScoreTotal += sum(goldScore)
            goldWordsTotal += sum(len(x) for x in tgtBatch)

        for b in range(len(predBatch)):
            count += 1
            try:
                # python2
                outF.write(" ".join([i.decode('utf-8')
                           for i in predBatch[b][0]]) + '\n')
            except AttributeError:
                # python3: can't do .decode on a str object
                outF.write(" ".join(predBatch[b][0]) + '\n')
            outF.flush()

            if opt.verbose:
                words = []
                for f in src[:, b]:
                    word = translator.fields["src"].vocab.itos[f]
                    if word == onmt.IO.PAD_WORD:
                        break
                    words.append(word)

                os.write(1, bytes('SENT %d: %s\n' %
                                  (count, " ".join(words)), 'UTF-8'))

                index += 1
                print(len(predBatch[b][0]))
                os.write(1, bytes('\n PRED %d: %s\n' %
                                  (count, " ".join(predBatch[b][0])), 'UTF-8'))
                print("PRED SCORE: %.4f" % predScore[b][0])

                if opt.tgt:
                    tgtSent = ' '.join(tgtBatch[b])
                    os.write(1, bytes('GOLD %d: %s\n' %
                             (count, tgtSent), 'UTF-8'))
                    print("GOLD SCORE: %.4f" % goldScore[b])

                if opt.n_best > 1:
                    print('\nBEST HYP:')
                    for n in range(opt.n_best):
                        os.write(1, bytes("[%.4f] %s\n" % (predScore[b][n],
                                 " ".join(predBatch[b][n])),
                            'UTF-8'))

                if opt.attn_debug:
                    print('')
                    for i, w in enumerate(predBatch[b][0]):
                        print(w)
                        _, ids = attn[b][0][i].sort(0, descending=True)
                        for j in ids[:5].tolist():
                            print("\t%s\t%d\t%3f" % (srcBatch[b][j], j,
                                                     attn[b][0][i][j]))

        srcBatch, tgtBatch = [], []

    reportScore('PRED', predScoreTotal, predWordsTotal)
    if opt.tgt:
        reportScore('GOLD', goldScoreTotal, goldWordsTotal)

    if opt.dump_beam:
        json.dump(translator.beam_accum,
                  codecs.open(opt.dump_beam, 'w', 'utf-8'))
Example #21
def main():
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # Load the model.
    fields, model, model_opt = \
        onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__)

    # File to write sentences to.
    out_file = codecs.open(opt.output, 'w', 'utf-8')

    # Test data
    data = onmt.io.build_dataset(fields, opt.data_type,
                                 opt.src, opt.tgt,
                                 src_dir=opt.src_dir,
                                 sample_rate=opt.sample_rate,
                                 window_size=opt.window_size,
                                 window_stride=opt.window_stride,
                                 window=opt.window,
                                 use_filter_pred=False)

    # Sort batch by decreasing lengths of sentence required by pytorch.
    # sort=False means "Use dataset's sortkey instead of iterator's".
    data_iter = onmt.io.OrderedIterator(
        dataset=data, device=opt.gpu,
        batch_size=opt.batch_size, train=False, sort=False,
        sort_within_batch=True, shuffle=False)

    # Translator
    scorer = onmt.translate.GNMTGlobalScorer(opt.alpha,
                                             opt.beta,
                                             opt.coverage_penalty,
                                             opt.length_penalty)
    translator = onmt.translate.Translator(
        model, fields,
        beam_size=opt.beam_size,
        n_best=opt.n_best,
        global_scorer=scorer,
        max_length=opt.max_length,
        copy_attn=model_opt.copy_attn,
        cuda=opt.cuda,
        beam_trace=opt.dump_beam != "",
        min_length=opt.min_length,
        stepwise_penalty=opt.stepwise_penalty)
    builder = onmt.translate.TranslationBuilder(
        data, translator.fields,
        opt.n_best, opt.replace_unk, opt.tgt)

    # Statistics
    counter = count(1)
    pred_score_total, pred_words_total = 0, 0
    gold_score_total, gold_words_total = 0, 0

    for batch in data_iter:
        batch_data = translator.translate_batch(batch, data)
        translations = builder.from_batch(batch_data)

        for trans in translations:
            pred_score_total += trans.pred_scores[0]
            pred_words_total += len(trans.pred_sents[0])
            if opt.tgt:
                gold_score_total += trans.gold_score
                gold_words_total += len(trans.gold_sent) + 1

            n_best_preds = [" ".join(pred)
                            for pred in trans.pred_sents[:opt.n_best]]
            out_file.write('\n'.join(n_best_preds))
            out_file.write('\n')
            out_file.flush()

            if opt.verbose:
                sent_number = next(counter)
                output = trans.log(sent_number)
                os.write(1, output.encode('utf-8'))

    _report_score('PRED', pred_score_total, pred_words_total)
    if opt.tgt:
        _report_score('GOLD', gold_score_total, gold_words_total)
        if opt.report_bleu:
            _report_bleu()
        if opt.report_rouge:
            _report_rouge()

    if opt.dump_beam:
        import json
        json.dump(translator.beam_accum,
                  codecs.open(opt.dump_beam, 'w', 'utf-8'))
Example #22
import argparse
import glob
import random

import torch
from torch import cuda

import opts

print(torch.cuda.is_available())
print(cuda.device_count())
print(cuda.current_device())

parser = argparse.ArgumentParser(
    description='train.py',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# opts.py
opts.add_md_help_argument(parser)
opts.model_opts(parser)
opts.train_opts(parser)

opt = parser.parse_args()
if opt.word_vec_size != -1:
    opt.src_word_vec_size = opt.word_vec_size
    opt.tgt_word_vec_size = opt.word_vec_size

if opt.layers != -1:
    opt.enc_layers = opt.layers
    opt.dec_layers = opt.layers

opt.brnn = (opt.encoder_type == "brnn")
if opt.seed > 0:
    random.seed(opt.seed)
    torch.manual_seed(opt.seed)
Example #23
def main():
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    translator = onmt.Translator(opt, dummy_opt.__dict__)
    out_file = codecs.open(opt.output, 'w', 'utf-8')
    pred_score_total, pred_words_total = 0, 0
    gold_score_total, gold_words_total = 0, 0
    if opt.dump_beam != "":
        import json
        translator.initBeamAccum()
    data = onmt.IO.ONMTDataset(opt.src,
                               opt.tgt,
                               translator.fields,
                               use_filter_pred=False)

    test_data = onmt.IO.OrderedIterator(dataset=data,
                                        device=opt.gpu,
                                        batch_size=opt.batch_size,
                                        train=False,
                                        sort=False,
                                        shuffle=False)

    counter = count(1)
    for batch in test_data:
        pred_batch, gold_batch, pred_scores, gold_scores, attn, src \
            = translator.translate(batch, data)
        pred_score_total += sum(score[0] for score in pred_scores)
        pred_words_total += sum(len(x[0]) for x in pred_batch)
        if opt.tgt:
            gold_score_total += sum(gold_scores)
            gold_words_total += sum(len(x) for x in batch.tgt[1:])

        # z_batch: an iterator over the predictions, their scores,
        # the gold sentence, its score, and the source sentence for each
        # sentence in the batch. It has to be zip_longest instead of
        # plain-old zip because the gold_batch has length 0 if the target
        # is not included.
        z_batch = zip_longest(pred_batch, gold_batch, pred_scores, gold_scores,
                              (sent.squeeze(1)
                               for sent in src.split(1, dim=1)))

        for pred_sents, gold_sent, pred_score, gold_score, src_sent in z_batch:
            n_best_preds = [" ".join(pred) for pred in pred_sents[:opt.n_best]]
            out_file.write('\n'.join(n_best_preds))
            out_file.write('\n')
            out_file.flush()

            if opt.verbose:
                sent_number = next(counter)
                words = get_src_words(src_sent,
                                      translator.fields["src"].vocab.itos)

                os.write(
                    1, bytes('\nSENT %d: %s\n' % (sent_number, words),
                             'UTF-8'))

                best_pred = n_best_preds[0]
                best_score = pred_score[0]
                os.write(
                    1,
                    bytes('PRED %d: %s\n' % (sent_number, best_pred), 'UTF-8'))
                print("PRED SCORE: %.4f" % best_score)

                if opt.tgt:
                    tgt_sent = ' '.join(gold_sent)
                    os.write(
                        1,
                        bytes('GOLD %d: %s\n' % (sent_number, tgt_sent),
                              'UTF-8'))
                    print("GOLD SCORE: %.4f" % gold_score)

                if len(n_best_preds) > 1:
                    print('\nBEST HYP:')
                    for score, sent in zip(pred_score, n_best_preds):
                        os.write(1,
                                 bytes("[%.4f] %s\n" % (score, sent), 'UTF-8'))

    report_score('PRED', pred_score_total, pred_words_total)
    if opt.tgt:
        report_score('GOLD', gold_score_total, gold_words_total)

    if opt.dump_beam:
        json.dump(translator.beam_accum,
                  codecs.open(opt.dump_beam, 'w', 'utf-8'))
Example #24
def main():
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # Load the model.
    fields, model, model_opt = \
        onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__)

    # File to write sentences to.
    out_file = codecs.open(opt.output, 'w', 'utf-8')

    # Test data
    data = onmt.io.build_dataset(fields,
                                 opt.data_type,
                                 opt.src,
                                 opt.tgt,
                                 src_dir=opt.src_dir,
                                 sample_rate=opt.sample_rate,
                                 window_size=opt.window_size,
                                 window_stride=opt.window_stride,
                                 window=opt.window,
                                 use_filter_pred=False)

    # Sort batch by decreasing lengths of sentence required by pytorch.
    # sort=False means "Use dataset's sortkey instead of iterator's".
    data_iter = onmt.io.OrderedIterator(dataset=data,
                                        device=opt.gpu,
                                        batch_size=opt.batch_size,
                                        train=False,
                                        sort=False,
                                        sort_within_batch=True,
                                        shuffle=False)

    # Translator
    scorer = onmt.translate.GNMTGlobalScorer(opt.alpha, opt.beta)
    translator = onmt.translate.Translator(
        model,
        fields,
        beam_size=opt.beam_size,
        n_best=opt.n_best,
        global_scorer=scorer,
        max_length=opt.max_length,
        copy_attn=model_opt.copy_attn,
        cuda=opt.cuda,
        beam_trace=opt.dump_beam != "",
        dump_beam=opt.dump_beam,
        min_length=opt.min_length,
        antilm_lambda=opt.antilm_lambda,
        antilm_eta=opt.antilm_eta,
        antilm_equal_src=opt.antilm_equal_src,
        lambda_ADBS=opt.lambda_ADBS,
        affective_decoding=opt.affective_decoding,
        k=opt.k,
        sort_AS=opt.sort_AS,
        sort_similarity=opt.sort_similarity,
        penalize_repeats=opt.penalize_repeats)
    builder = onmt.translate.TranslationBuilder(data, translator.fields,
                                                opt.n_best, opt.replace_unk,
                                                opt.tgt)

    # Statistics
    counter = count(1)
    pred_score_total, pred_words_total = 0, 0
    gold_score_total, gold_words_total = 0, 0
    predictions = []

    # Test word embedding
    # print(model.decoder.embeddings.word_lut.weight.data[200:220, -10:])
    # print(model.decoder.embeddings.embedding_copy.weight.data[200:220, -10:])
    # Load adj vocab
    pretrained_adj = None
    if opt.adj_vocab:
        print("Loading adj vocab...")
        pretrained_adj = torch.load(opt.adj_vocab)

    word_freq = None
    if opt.weighted_AS:
        print("Loading unigram frequency...")
        with open(opt.weighted_AS, "rb") as f:
            word_freq = np.array(pickle.load(f))

    # Load word embedding matrix and VAD embedding matrix, pass them to translate_batch()
    if opt.save_attn:
        pred_ids = []
        attns = []
        indices = []

    for batch in data_iter:
        batch_data = translator.translate_batch(batch, data)

        if opt.save_attn:
            pred_ids.append(batch_data["predictions"])
            attns.append(batch_data["attention"])
            indices.append(batch_data["batch"].indices)

        # Rerank beams
        if opt.rerank:
            batch_data = rerank(model, batch_data)
        predictions += batch_data["predictions"]
        translations = builder.from_batch(batch_data)

        for trans in translations:
            pred_score_total += trans.pred_scores[0]
            pred_words_total += len(trans.pred_sents[0])
            if opt.tgt:
                gold_score_total += trans.gold_score
                gold_words_total += len(trans.gold_sent)

            n_best_preds = [
                " ".join(pred) for pred in trans.pred_sents[:opt.n_best]
            ]
            if opt.display_1:
                n_best_preds = [
                    " ".join(pred) for pred in trans.pred_sents[:1]
                ]
            out_file.write('\n'.join(n_best_preds))
            out_file.write('\n')
            out_file.flush()
            if opt.verbose:
                sent_number = next(counter)
                output = trans.log(sent_number)
                os.write(1, output.encode('utf-8'))

    if opt.save_attn:
        with open(opt.save_attn + ".pkl", "wb") as f:
            pickle.dump(attns, f)
        with open(opt.save_attn + "_predictions" + ".pkl", "wb") as f:
            pickle.dump(pred_ids, f)
        with open(opt.save_attn + "_indices" + ".pkl", "wb") as f:
            pickle.dump(indices, f)

    pred_score = _report_score('PRED', pred_score_total, pred_words_total)
    out_file.write(pred_score + "\n")
    out_file.flush()

    # Evaluate predictions here
    metrics = evaluate_predictions(model, pretrained_adj, predictions,
                                   word_freq)
    out_file.write(metrics + "\n")
    out_file.flush()

    if opt.tgt:
        gold_score = _report_score('GOLD', gold_score_total, gold_words_total)
        if opt.report_bleu:
            _report_bleu()
        if opt.report_rouge:
            _report_rouge()
        out_file.write(gold_score + "\n")
    out_file.flush()

    if opt.dump_beam:
        import json
        json.dump(translator.beam_accum,
                  codecs.open(opt.dump_beam, 'w', 'utf-8'))
Example #25
def main():
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # Load the model.
    fields, model, model_opt = \
        onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__)

    # File to write sentences to.
    out_file = codecs.open(opt.output, 'w', 'utf-8')

    # Test data
    data = onmt.io.build_dataset(fields,
                                 opt.src_data_type,
                                 opt.tgt_data_type,
                                 opt.src,
                                 opt.tgt,
                                 src_dir=opt.src_dir,
                                 sample_rate=opt.sample_rate,
                                 window_size=opt.window_size,
                                 window_stride=opt.window_stride,
                                 window=opt.window,
                                 use_filter_pred=False)

    # Sort batches by decreasing sentence length, as required by PyTorch.
    # sort=False means "Use dataset's sortkey instead of iterator's".
    if opt.src_data_type == "trigrams" and opt.tgt_data_type == "words":
        data_iter = onmt.io.IO.SourceTrigramsOrderedIterator(
            dataset=data,
            device=opt.gpu,
            batch_size=opt.batch_size,
            train=False,
            sort=False,
            sort_within_batch=True,
            shuffle=False)
    elif opt.src_data_type == "words" and opt.tgt_data_type == "characters":
        data_iter = onmt.io.IO.TargetCharactersOrderedIterator(
            dataset=data,
            device=opt.gpu,
            batch_size=opt.batch_size,
            train=False,
            sort=False,
            sort_within_batch=True,
            shuffle=False)
    elif opt.src_data_type == "trigrams" and opt.tgt_data_type == "characters":
        data_iter = onmt.io.IO.BothCharactersOrderedIterator(
            dataset=data,
            device=opt.gpu,
            batch_size=opt.batch_size,
            train=False,
            sort=False,
            sort_within_batch=True,
            shuffle=False)
    else:
        data_iter = onmt.io.OrderedIterator(dataset=data,
                                            device=opt.gpu,
                                            batch_size=opt.batch_size,
                                            train=False,
                                            sort=False,
                                            sort_within_batch=True,
                                            shuffle=False)

    # Translator
    scorer = onmt.translate.GNMTGlobalScorer(opt.alpha, opt.beta)
    translator = onmt.translate.Translator(model,
                                           fields,
                                           beam_size=opt.beam_size,
                                           n_best=opt.n_best,
                                           global_scorer=scorer,
                                           max_length=opt.max_length,
                                           copy_attn=model_opt.copy_attn,
                                           cuda=opt.cuda,
                                           beam_trace=opt.dump_beam != "",
                                           min_length=opt.min_length)
    builder = onmt.translate.TranslationBuilder(data, translator.fields,
                                                opt.n_best, opt.replace_unk,
                                                opt.tgt)

    # Statistics
    counter = count(1)
    pred_score_total, pred_words_total = 0, 0
    gold_score_total, gold_words_total = 0, 0

    for batch in data_iter:
        if opt.tgt_data_type == 'words':
            batch_data = translator.translate_batch(batch, data)
        else:
            batch_data = translator.beam_translate(batch, data)
            #batch_data = translator.greedy_translate(batch, data)

        translations = builder.from_batch(batch_data)

        for trans in translations:
            pred_score_total += trans.pred_scores[0]
            if not trans.pred_sents:
                trans.pred_sents = [' ']
            pred_words_total += len(trans.pred_sents[0])
            if opt.tgt:
                gold_score_total += trans.gold_score
                gold_words_total += len(trans.gold_sent)

            if opt.tgt_data_type == 'characters':
                n_best_preds = [''.join(pred) for pred in trans.pred_sents]
                output = []
                for w in n_best_preds:
                    if w == '$$':
                        output.append(" ")
                    else:
                        #output.append(w[1])
                        output.append(w)

                out_file.write(''.join(output))
            else:
                n_best_preds = [
                    " ".join(pred) for pred in trans.pred_sents[:opt.n_best]
                ]
                out_file.write('\n'.join(n_best_preds))
            out_file.write('\n')
            out_file.flush()

            if opt.verbose:
                sent_number = next(counter)
                output = trans.log(sent_number)
                os.write(1, output.encode('utf-8'))

    _report_score('PRED', pred_score_total, pred_words_total)
    if opt.tgt:
        _report_score('GOLD', gold_score_total, gold_words_total)
        if opt.report_bleu:
            _report_bleu()
        if opt.report_rouge:
            _report_rouge()

    if opt.dump_beam:
        import json
        json.dump(translator.beam_accum,
                  codecs.open(opt.dump_beam, 'w', 'utf-8'))
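
For reference, the character-level output convention above (single-character tokens with '$$' marking a word boundary) can be shown in isolation; this helper is illustrative only and not part of the original script:

def detokenize_characters(pred_tokens):
    # Join character-level predictions back into text, translating the
    # '$$' boundary symbol used above into a literal space.
    return ''.join(' ' if tok == '$$' else tok for tok in pred_tokens)

# Example: ['h', 'i', '$$', 'y', 'o', 'u'] -> 'hi you'
assert detokenize_characters(['h', 'i', '$$', 'y', 'o', 'u']) == 'hi you'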
Beispiel #26
0
def main():
    dummy_parser = argparse.ArgumentParser(description='train_mm.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # loading checkpoint just to find multimodal model type
    checkpoint = torch.load(opt.model,
                            map_location=lambda storage, loc: storage)
    opt.multimodal_model_type = checkpoint['opt'].multimodal_model_type
    del checkpoint

    if opt.batch_size > 1:
        print("Batch size > 1 not implemented! Falling back to batch_size = 1 ...")
        opt.batch_size = 1

    # load test image features
    test_file = tables.open_file(opt.path_to_test_img_feats, mode='r')
    if opt.multimodal_model_type in ['imgd', 'imge', 'imgw']:
        test_img_feats = test_file.root.global_feats[:]
    elif opt.multimodal_model_type in ['src+img']:
        test_img_feats = test_file.root.local_feats[:]
    else:
        raise Exception("Model type not implemented: %s" %
                        opt.multimodal_model_type)
    test_file.close()

    # Load the model.
    fields, model, model_opt = \
        onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__)
    #opt.multimodal_model_type = checkpoint['opt'].multimodal_model_type

    # File to write sentences to.
    out_file = codecs.open(opt.output, 'w', 'utf-8')

    # Test data
    data = onmt.io.build_dataset(fields,
                                 opt.data_type,
                                 opt.src,
                                 opt.tgt,
                                 src_dir=opt.src_dir,
                                 sample_rate=opt.sample_rate,
                                 window_size=opt.window_size,
                                 window_stride=opt.window_stride,
                                 window=opt.window,
                                 use_filter_pred=False)

    # Sort batches by decreasing sentence length, as required by PyTorch.
    # sort=False means "Use dataset's sortkey instead of iterator's".
    data_iter = onmt.io.OrderedIterator(dataset=data,
                                        device=opt.gpu,
                                        batch_size=opt.batch_size,
                                        train=False,
                                        sort=False,
                                        sort_within_batch=True,
                                        shuffle=False)

    # Translator
    scorer = onmt.translate.GNMTGlobalScorer(opt.alpha, opt.beta)
    translator = onmt.translate.TranslatorMultimodal(
        model,
        fields,
        beam_size=opt.beam_size,
        n_best=opt.n_best,
        global_scorer=scorer,
        max_length=opt.max_length,
        copy_attn=model_opt.copy_attn,
        cuda=opt.cuda,
        beam_trace=opt.dump_beam != "",
        min_length=opt.min_length,
        test_img_feats=test_img_feats,
        multimodal_model_type=opt.multimodal_model_type)
    builder = onmt.translate.TranslationBuilder(data, translator.fields,
                                                opt.n_best, opt.replace_unk,
                                                opt.tgt)

    # Statistics
    counter = count(1)
    pred_score_total, pred_words_total = 0, 0
    gold_score_total, gold_words_total = 0, 0

    for sent_idx, batch in enumerate(data_iter):
        batch_data = translator.translate_batch(batch, data, sent_idx)
        translations = builder.from_batch(batch_data)

        for trans in translations:
            pred_score_total += trans.pred_scores[0]
            pred_words_total += len(trans.pred_sents[0])
            if opt.tgt:
                gold_score_total += trans.gold_score
                gold_words_total += len(trans.gold_sent)

            n_best_preds = [
                " ".join(pred) for pred in trans.pred_sents[:opt.n_best]
            ]
            out_file.write('\n'.join(n_best_preds))
            out_file.write('\n')
            out_file.flush()

            if opt.verbose:
                sent_number = next(counter)
                output = trans.log(sent_number)
                os.write(1, output.encode('utf-8'))

    _report_score('PRED', pred_score_total, pred_words_total)
    if opt.tgt:
        _report_score('GOLD', gold_score_total, gold_words_total)
        if opt.report_bleu:
            _report_bleu()
        if opt.report_rouge:
            _report_rouge()

    if opt.dump_beam:
        import json
        json.dump(translator.beam_accum,
                  codecs.open(opt.dump_beam, 'w', 'utf-8'))
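
The HDF5 file opened with tables.open_file above is expected to expose a global_feats array (for the 'imgd'/'imge'/'imgw' model types) or a local_feats array (for 'src+img'). A hedged sketch of writing such a file with PyTables, assuming 2048-dimensional global image features:

import numpy as np
import tables

def write_img_feats(path, feats):
    # feats: array of shape (num_images, feat_dim), e.g. pooled CNN
    # features. The array name mirrors what the example above reads;
    # the feature dimensionality is an assumption.
    with tables.open_file(path, mode='w') as f:
        f.create_array(f.root, 'global_feats', feats)

write_img_feats('test_feats.h5', np.zeros((10, 2048), dtype=np.float32))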
Beispiel #27
0
def main():
    opt = parser.parse_args()

    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    # Create the translator.
    translator = onmt.Translator(opt, dummy_opt.__dict__)
    outF = codecs.open(opt.output, 'w', 'utf-8')
    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0
    count = 0
    if opt.dump_beam != "":
        import json
        translator.initBeamAccum()

    # Prepare the test data.
    data = onmt.IO.ONMTDataset(opt.src, opt.tgt, translator.fields, None, opt.inter)
    testData = onmt.IO.OrderedIterator(
        dataset=data, device=opt.gpu,
        batch_size=opt.batch_size, train=False, sort=False,
        shuffle=False)

    inter_act = opt.inter is not None

    index = 0
    for batch in testData:
        #I will need to change the translator!
        predBatch, goldBatch, predScore, goldScore, attn, src \
            = translator.translate(batch, data, inter_act)
        #print((attn[0][0]))
        #print (predBatch)
        if opt.save_attention:
            attn_numpy = attn[0][0].numpy()
            # print(attn_numpy.T.shape)
            with open('attention_matrix.pkl', 'wb') as attn_file:
                pickle.dump(attn_numpy.T, attn_file)
        predScoreTotal += sum(score[0] for score in predScore)
        predWordsTotal += sum(len(x[0]) for x in predBatch)
        if opt.tgt:
            goldScoreTotal += sum(goldScore)
            goldWordsTotal += sum(len(x) for x in batch.tgt[1:])

        for b in range(len(predBatch)):
            count += 1
            # The original try/except distinguished Python 2 and 3 here,
            # but both branches wrote the same join, so one loop suffices.
            for n in range(opt.n_best):
                outF.write(" ".join(predBatch[b][n]) + '\n')
            outF.flush()

            if opt.verbose:
                words = []
                for f in src[:, b]:
                    word = translator.fields["src"].vocab.itos[f]
                    if word == onmt.IO.PAD_WORD:
                        break
                    words.append(word)

                os.write(1, bytes('\nSENT %d: %s\n' %
                                  (count, " ".join(words)), 'UTF-8'))

                index += 1
                os.write(1, bytes('PRED %d: %s\n' %
                                  (count, " ".join(predBatch[b][0])), 'UTF-8'))
                print("PRED SCORE: %.4f" % predScore[b][0])

                if opt.tgt:
                    tgtSent = ' '.join(goldBatch[b])
                    os.write(1, bytes('GOLD %d: %s\n' %
                             (count, tgtSent), 'UTF-8'))
                    print("GOLD SCORE: %.4f" % goldScore[b])

                if opt.n_best > 1:
                    print('\nBEST HYP:')
                    for n in range(opt.n_best):
                        os.write(1, bytes("[%.4f] %s\n" % (predScore[b][n],
                                 " ".join(predBatch[b][n])),
                            'UTF-8'))

    reportScore('PRED', predScoreTotal, predWordsTotal)
    if opt.tgt:
        reportScore('GOLD', goldScoreTotal, goldWordsTotal)

    if opt.dump_beam:
        json.dump(translator.beam_accum,
                  codecs.open(opt.dump_beam, 'w', 'utf-8'))
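
The attention matrix pickled to 'attention_matrix.pkl' above can be inspected afterwards. A minimal sketch, assuming matplotlib is available:

import pickle
import matplotlib.pyplot as plt

with open('attention_matrix.pkl', 'rb') as f:
    attn = pickle.load(f)  # shape (src_len, tgt_len) after the .T above

plt.imshow(attn, aspect='auto', cmap='viridis')
plt.xlabel('target position')
plt.ylabel('source position')
plt.colorbar()
plt.show()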
Beispiel #28
0
import argparse
import copy
import unittest
import math

import torch
from torch.autograd import Variable

import onmt
import onmt.io
import opts
from onmt.ModelConstructor import make_embeddings, \
                            make_encoder, make_decoder
from onmt.modules import ImageEncoder, AudioEncoder

parser = argparse.ArgumentParser(description='train.py')
opts.model_opts(parser)
opts.train_opts(parser)

# -data option is required, but not used in this test, so dummy.
opt = parser.parse_known_args(['-data', 'dummy'])[0]


class TestModel(unittest.TestCase):

    def __init__(self, *args, **kwargs):
        super(TestModel, self).__init__(*args, **kwargs)
        self.opt = opt

    # Helper to generate a vocabulary

    def get_vocab(self):
Beispiel #29
0
def main():
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # Load the model.
    fields, model, model_opt = \
        onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__)

    # File to write sentences to.
    out_file = codecs.open(opt.output, 'w', 'utf-8')

    # Test data
    data = onmt.io.build_dataset(
        fields,
        opt.data_type,
        opt.src,
        opt.tgt,
        src_dir=opt.src_dir,
        sample_rate=opt.sample_rate,
        window_size=opt.window_size,
        window_stride=opt.window_stride,
        window=opt.window,
        use_filter_pred=False,
        symbol_representation=opt.symbol_representation,
        revert_targets=opt.revert_targets)

    # Sort batches by decreasing sentence length, as required by PyTorch.
    # sort=False means "Use dataset's sortkey instead of iterator's".
    data_iter = onmt.io.OrderedIterator(dataset=data,
                                        device=opt.gpu,
                                        batch_size=opt.batch_size,
                                        train=False,
                                        sort=False,
                                        sort_within_batch=True,
                                        shuffle=False)

    # Translator
    scorer = onmt.translate.GNMTGlobalScorer(opt.alpha, opt.beta)
    translator = onmt.translate.Translator(model,
                                           fields,
                                           beam_size=opt.beam_size,
                                           n_best=opt.n_best,
                                           global_scorer=scorer,
                                           max_length=opt.max_length,
                                           copy_attn=model_opt.copy_attn,
                                           cuda=opt.cuda,
                                           beam_trace=opt.dump_beam != "",
                                           min_length=opt.min_length)
    builder = onmt.translate.TranslationBuilder(data, translator.fields,
                                                opt.n_best, opt.replace_unk,
                                                opt.tgt)

    # Statistics
    counter = count(1)
    pred_score_total, pred_words_total = 0, 0
    gold_score_total, gold_words_total = 0, 0

    src_sequence_join_character, tgt_sequence_join_character = get_src_tgt_sequence_join_character(
        opt.symbol_representation)

    # Number of variations to create for each input, parsed from a
    # comma-separated list; defaults to n_best hypotheses per input.
    num_variations_for_input = collections.defaultdict(lambda: opt.n_best)
    if opt.num_variations:
        for i, v in enumerate(opt.num_variations.split(",")):
            num_variations_for_input[i] = int(v)
    print "%s variations per input" % str(num_variations_for_input)

    input_counter = 0

    for batch in data_iter:
        batch_data = translator.translate_batch(batch, data)
        translations = builder.from_batch(batch_data)

        for trans in translations:
            pred_score_total += trans.pred_scores[0]
            pred_words_total += len(trans.pred_sents[0])
            n_best_pred_scores = trans.pred_scores[:opt.n_best]
            if opt.tgt:
                gold_score_total += trans.gold_score
                gold_words_total += len(trans.gold_sent)

            if opt.revert_targets:
                n_best_preds = [
                    tgt_sequence_join_character.join(reversed(pred))
                    for pred in trans.pred_sents[:opt.n_best]
                ]
            else:
                n_best_preds = [
                    tgt_sequence_join_character.join(pred)
                    for pred in trans.pred_sents[:opt.n_best]
                ]

            # provide src
            if opt.verbose:
                out_file.write(
                    "input:\t" +
                    src_sequence_join_character.join(trans.src_raw) + "\n")
                n_best_preds = [
                    "hyp %d:\t%s" % (i, pred)
                    for i, pred in enumerate(n_best_preds)
                ]

            n_best_preds = [
                pred.strip().replace("\n", " ") for pred in n_best_preds
            ]

            num_variations = num_variations_for_input[input_counter]

            # fill up if beam search returned fewer hypotheses than the
            # required number of variations, cycling through the originals
            if len(n_best_preds) < num_variations:
                num_orig_preds = len(n_best_preds)
                num_preds_to_add = num_variations - num_orig_preds
                n_best_preds.extend(n_best_preds[i % num_orig_preds]
                                    for i in range(num_preds_to_add))
                n_best_pred_scores.extend(
                    n_best_pred_scores[i % num_orig_preds]
                    for i in range(num_preds_to_add))

            if opt.stochastic:
                preds = np.random.choice(n_best_preds,
                                         num_variations,
                                         replace=False)
            else:
                preds = n_best_preds[:num_variations]

            if opt.report_individual_scores:
                preds_and_scores = []
                for (pred, pred_score) in zip(preds, n_best_pred_scores):
                    preds_and_scores.append("\t".join(
                        [pred.strip(), str(pred_score)]))
                preds = preds_and_scores

            out_file.write('\n'.join(preds) + "\n")
            if opt.n_best > 1:
                out_file.write('\n')

            if opt.verbose:
                sent_number = next(counter)
                output = trans.log(sent_number)
                os.write(1, output.encode('utf-8'))

            input_counter += 1

    _report_score('PRED', pred_score_total, pred_words_total)
    if opt.tgt:
        _report_score('GOLD', gold_score_total, gold_words_total)
        if opt.report_bleu:
            _report_bleu()
        if opt.report_rouge:
            _report_rouge()

    if opt.dump_beam:
        import json
        json.dump(translator.beam_accum,
                  codecs.open(opt.dump_beam, 'w', 'utf-8'))
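
The cyclic fill-up above (repeating hypotheses when beam search returns fewer than the requested number of variations) is easier to see in isolation. A small illustrative helper, with names of my own choosing:

def fill_to_length(items, target_len):
    # Cycle through the existing items until target_len is reached,
    # mirroring the modulo-based fill-up in the example above.
    base = list(items)
    if not base:
        return base
    return base + [base[i % len(base)] for i in range(target_len - len(base))]

# Example: 2 hypotheses, 5 variations requested.
assert fill_to_length(['a', 'b'], 5) == ['a', 'b', 'a', 'b', 'a']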
Beispiel #30
0
        output = []
        for rule in app.url_map.iter_rules():
            options = {}
            for arg in rule.arguments:
                options[arg] = "[{0}]".format(arg)

            methods = ','.join(rule.methods)
            url = url_for(rule.endpoint, **options)
            print(url)
            output.append(url)
        return jsonify(' '.join(output))

    CORS(app)

    http_server = WSGIServer((host, port), app)
    logger.info("Model loaded, serving deepsegment on port %d" % port)
    http_server.serve_forever()


if __name__ == '__main__':
    opt = opts.model_opts()
    with open(opt.config, "r") as config_file:
        config = yaml.safe_load(config_file)
    config = Namespace(**config, **vars(opt))

    device, devices_id = misc_utils.set_cuda(config)
    config.device = device

    # stdout_handler = prepare_global_logging(args.serialization_dir, args.file_friendly_logging)
    start(config, url_root=config.url_root, host=config.ip, port=config.port)
    # cleanup_global_logging(stdout_handler)
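
The __main__ block above merges a YAML file with command-line options into a single Namespace. The same pattern in isolation (function and file names are illustrative):

from argparse import Namespace
import yaml

def load_config(path, opt):
    # Merge YAML keys with argparse options; duplicate keys would raise
    # a TypeError, so the two sources are assumed disjoint here.
    with open(path, "r") as f:
        config = yaml.safe_load(f)
    return Namespace(**config, **vars(opt))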
Beispiel #31
0
def main():
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # Load the model.
    fields, model, model_opt = \
        onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__)

    # File to write sentences to.
    out_file = codecs.open(opt.output, 'w', 'utf-8')

    # Test data
    data = onmt.io.build_dataset(fields, opt.data_type,
                                 opt.src, opt.tgt,
                                 src_dir=opt.src_dir,
                                 sample_rate=opt.sample_rate,
                                 window_size=opt.window_size,
                                 window_stride=opt.window_stride,
                                 window=opt.window,
                                 use_filter_pred=False)

    test_data = onmt.io.OrderedIterator(
        dataset=data, device=opt.gpu,
        batch_size=opt.batch_size, train=False, sort=False,
        shuffle=False)

    # Translator
    scorer = onmt.translate.GNMTGlobalScorer(opt.alpha, opt.beta)
    translator = onmt.translate.Translator(model, fields,
                                           beam_size=opt.beam_size,
                                           n_best=opt.n_best,
                                           global_scorer=scorer,
                                           max_length=opt.max_sent_length,
                                           copy_attn=model_opt.copy_attn,
                                           cuda=opt.cuda,
                                           beam_trace=opt.dump_beam != "")
    builder = onmt.translate.TranslationBuilder(
        data, translator.fields,
        opt.n_best, opt.replace_unk, opt.tgt)

    # Statistics
    counter = count(1)
    pred_score_total, pred_words_total = 0, 0
    gold_score_total, gold_words_total = 0, 0

    for batch in test_data:
        batch_data = translator.translate_batch(batch, data)
        translations = builder.from_batch(batch_data)

        for trans in translations:
            pred_score_total += trans.pred_scores[0]
            pred_words_total += len(trans.pred_sents[0])
            if opt.tgt:
                gold_score_total += trans.gold_score
                gold_words_total += len(trans.gold_sent)

            n_best_preds = [" ".join(pred)
                            for pred in trans.pred_sents[:opt.n_best]]
            out_file.write('\n'.join(n_best_preds))
            out_file.write('\n')
            out_file.flush()

            if opt.verbose:
                sent_number = next(counter)
                output = trans.log(sent_number)
                os.write(1, output.encode('utf-8'))

    def report_score(name, score_total, words_total):
        print("%s AVG SCORE: %.4f, %s PPL: %.4f" % (
            name, score_total / words_total,
            name, math.exp(-score_total/words_total)))

    report_score('PRED', pred_score_total, pred_words_total)
    if opt.tgt:
        report_score('GOLD', gold_score_total, gold_words_total)

    if opt.dump_beam:
        import json
        json.dump(translator.beam_accum,
                  codecs.open(opt.dump_beam, 'w', 'utf-8'))
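
The nested report_score above turns summed log-probabilities into perplexity: the per-word average is score_total / words_total, and PPL = exp(-average). A quick numeric check:

import math

score_total = -46.05   # summed log-probs over the corpus (illustrative)
words_total = 20
avg = score_total / words_total                # about -2.3025
ppl = math.exp(-score_total / words_total)     # about 10.0
print("AVG SCORE: %.4f, PPL: %.4f" % (avg, ppl))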
Beispiel #32
0
def translate(src, model, output):
    parser = argparse.ArgumentParser(
        description='translate.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    opts.translate_opts(parser)

    opt = parser.parse_known_args([])[0]
    if opt.batch_size != 1:
        print("WARNING: -batch_size isn't supported currently, "
              "we set it to 1 for now!")
        opt.batch_size = 1

    opt.src = src
    opt.model = model
    opt.output = output

    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    translator = onmt.Translator(opt, dummy_opt.__dict__)
    out_file = codecs.open(opt.output, 'w', 'utf-8')
    gold_out_file = codecs.open("gold_" + opt.output, 'w', 'utf-8')

    #print "TRANSLATOR SOURCE VOCAB"
    #for i in range(len(translator.fields["src"].vocab.itos)):
    #    print i, translator.fields["src"].vocab.itos[i]
    #print

    data = onmt.IO.ONMTDataset(opt.src,
                               opt.tgt,
                               translator.fields,
                               use_filter_pred=False)

    test_data = onmt.IO.OrderedIterator(dataset=data,
                                        device=opt.gpu,
                                        batch_size=opt.batch_size,
                                        train=False,
                                        sort=False,
                                        shuffle=False)

    counter = count(1)
    for batch in test_data:
        pred_batch, gold_batch, pred_scores, gold_scores, attn, src \
            = translator.translate(batch, data)

        # z_batch: an iterator over the predictions, their scores,
        # the gold sentence, its score, and the source sentence for each
        # sentence in the batch. It has to be zip_longest instead of
        # plain-old zip because the gold_batch has length 0 if the target
        # is not included.
        z_batch = zip_longest(pred_batch, gold_batch, pred_scores, gold_scores,
                              (sent.squeeze(1)
                               for sent in src.split(1, dim=1)))

        for pred_sents, gold_sent, pred_score, gold_score, src_sent in z_batch:
            n_best_preds = [" ".join(pred) for pred in pred_sents[:opt.n_best]]
            out_file.write('\n'.join(n_best_preds))
            out_file.write('\n')
            out_file.flush()

            words = get_src_words(src_sent,
                                  translator.fields["src"].vocab.itos)
            #print words
            gold_out_file.write(words)
            gold_out_file.write('\n')
            gold_out_file.flush()
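
As the comment above notes, zip_longest is used because gold_batch is empty when no target is supplied; the missing slots then come back as None instead of truncating the batch:

from itertools import zip_longest

pred_batch = [['hyp one'], ['hyp two']]
gold_batch = []  # no -tgt supplied

for preds, gold in zip_longest(pred_batch, gold_batch):
    print(preds, gold)
# ['hyp one'] None
# ['hyp two'] None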
Beispiel #33
0
def main(training=False,
         fields=None,
         model=None,
         opt=None,
         writer=None,
         step=0,
         corpus_type="dev",
         multi_process=False):
    time = Time()
    if training:
        assert fields is not None
        assert model is not None
        assert opt is not None
        model.eval()
        model.generator.eval()
        opt.cuda = opt.gpu > -1
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)
        out_file = codecs.open(
            "{}_{}_pred_{}.txt".format(opt.save_model,
                                       corpus_type.replace("/", "_"),
                                       str(step)), "w", "utf-8")
        print("Output file: ", out_file.name)
        copy_attn = opt.copy_attn
        model_opt = opt
    else:
        # Load the model.
        parser = argparse.ArgumentParser(
            description='translate.py',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        opts.add_md_help_argument(parser)
        opts.translate_opts(parser)

        opt = parser.parse_args()
        dummy_parser = argparse.ArgumentParser(description='train.py')
        opts.model_opts(dummy_parser)
        dummy_opt = dummy_parser.parse_known_args([])[0]

        opt.cuda = opt.gpu > -1
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)

        fields, model, model_opt = \
            onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__)

        out_file = codecs.open(opt.output, 'w', 'utf-8')

    assert opt.tgt is None
    data = onmt.io.build_dataset(fields,
                                 opt.src,
                                 opt.tgt,
                                 use_filter_pred=False,
                                 ngram=model_opt.ngram)

    # Sort batches by decreasing sentence length, as required by PyTorch.
    # sort=False means "Use dataset's sortkey instead of iterator's".

    data_iter = onmt.io.OrderedIterator(dataset=data,
                                        device=opt.gpu,
                                        batch_size=opt.translate_batch_size,
                                        train=False,
                                        sort=False,
                                        sort_within_batch=True,
                                        shuffle=False)
    output, pred_score_total, pred_words_total = \
            translate_single_process(opt, model, fields, data, data_iter, f=out_file)
    outfile_name = out_file.name

    if opt.bpe:
        import subprocess
        subprocess.check_output("sed 's/\@\@ //g' < {} > {}".format(
            outfile_name, outfile_name + ".nonbpe"),
                                shell=True)
        outfile_name = outfile_name + ".nonbpe"
    if opt.new_bpe:
        generate_nonbpe(outfile_name)
        outfile_name = outfile_name + ".nonbpe"
    # if writer is not None:
    #     ratio_stats.log_tensorboard(writer, step)
    # _report_score('PRED', pred_score_total, pred_words_total, writer, step, corpus_type)
    metric = 0
    if opt.tgt:
        # _report_score('GOLD', gold_score_total, gold_words_total, writer, step, corpus_type)
        if opt.report_single_bleu:
            metric = _report_single_source_bleu(opt, outfile_name, writer,
                                                step, corpus_type)
        if opt.report_multi_bleu:
            metric = _report_multi_source_bleu(outfile_name, writer, step,
                                               corpus_type)
        if opt.report_rouge:
            metric = _report_rouge(opt)

    # if opt.dump_beam:
    #     import json
    #     json.dump(translator.beam_accum,
    #               codecs.open(opt.dump_beam, 'w', 'utf-8'))

    time.timeit(task="Translation Testing")
    return metric
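
The sed call above strips the '@@ ' markers left by BPE segmentation. The same postprocessing in pure Python, avoiding the shell round-trip (a sketch; file names are illustrative):

def strip_bpe(in_path, out_path):
    # Remove '@@ ' joiners left by BPE segmentation, as sed does above.
    with open(in_path, 'r', encoding='utf-8') as fin, \
         open(out_path, 'w', encoding='utf-8') as fout:
        for line in fin:
            fout.write(line.replace('@@ ', ''))

strip_bpe('pred.txt', 'pred.txt.nonbpe')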