Beispiel #1
0
    def __init__(self, model, lang, gpu=False, wx=False):
        """Prepare language resources and load the transliteration model.

        Args:
            model: Model checkpoint; stored on ``self.opt.model`` and passed
                on to ``onmt.ModelConstructor.load_test_model``.
            lang: Language code. ``'hin'`` and ``'eng'`` each get their own
                tokenisation pattern and abbreviation table; any other value
                leaves those attributes unset.
            gpu: When true, GPU device 0 is selected.
            wx: Stored unchanged as ``self.is_ip_wx``.
        """
        self.lang = lang
        self.is_ip_wx = wx

        # Translation options are read from the process command line.
        cli = argparse.ArgumentParser(
            description='transliterate.py',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        opts.add_md_help_argument(cli)
        opts.translate_opts(cli)
        self.opt = cli.parse_args()

        self.trans_dict = {}
        self.broken_words = {}
        here = os.path.dirname(os.path.abspath(__file__))

        if self.lang == 'hin':
            self.to_utf = WXC(order='wx2utf', lang='hin')
            self.non_alpha = re.compile(u'([^a-zA-Z]+)')
            self.alpha_letters = set(string.ascii_letters)
            # Short forms mapped to their candidate expansions.
            self.com_abbr = {
                'b': ['BI', 'be'],
                'd': ['xI', 'xe'],
                'g': ['jI'],
                'k': ['ke', 'ki', 'kI'],
                'h': ['hE', 'hEM'],
                'ha': ['hE', 'hEM'],
                'n': ['ina', 'ne'],
                'm': ['meM', 'mEM'],
                'p': ['pe'],
                'q': ['kyoM'],
                'r': ['Ora', 'ora'],
                's': ['isa', 'se'],
                'y': ['ye'],
            }
        elif self.lang == 'eng':
            self.non_alpha = re.compile(u'([^a-z]+)')
            # Lower-case ASCII letters only.
            self.alpha_letters = set(string.ascii_lowercase)
            # Each line of the resource file is "<abbr> <exp1>|<exp2>|...".
            with open('%s/extras/COMMON_ABBR.eng' % here) as fp:
                self.com_abbr = {
                    abbr: expansions.split('|')
                    for abbr, expansions in (row.split() for row in fp)
                }

        # Default model options, taken from an empty argument list.
        model_defaults = argparse.ArgumentParser(description='train.py')
        opts.model_opts(model_defaults)
        default_model_opt, _ = model_defaults.parse_known_args([])

        if gpu:
            self.opt.gpu = 0
        use_cuda = self.opt.gpu > -1
        self.opt.cuda = use_cuda
        self.opt.model = model
        self.opt.n_best = 5
        self.opt.lang = lang
        if use_cuda:
            torch.cuda.set_device(self.opt.gpu)

        # Load the model.
        loaded = onmt.ModelConstructor.load_test_model(
            self.opt, vars(default_model_opt))
        self.fields, self.model, self.model_opt = loaded
Beispiel #2
0
def main(anno_file_name, col_headers, raw_args=None, verbose=True):
    """Predict a SQL query for the last annotated example and execute it.

    Every checkpoint matching ``opt.model_path`` is evaluated in turn; the
    values returned come from the last checkpoint processed.

    Args:
        anno_file_name: Path of the annotation file read with
            ``table.IO.read_anno_json``.
        col_headers: Column headers passed to ``get_complete_query``.
        raw_args: Optional argument list for the option parser; ``None``
            makes argparse read ``sys.argv``.
        verbose: When true, print the prediction and the column headers and
            forward the flag to the query engine.

    Returns:
        A tuple ``(complete_query, ans_pred)``. ``ans_pred`` is ``None`` when
        executing the predicted query raised an exception.

    Raises:
        FileNotFoundError: If no checkpoint matches ``opt.model_path``.
    """
    parser = argparse.ArgumentParser(description='evaluate.py')
    opts.translate_opts(parser)
    opt = parser.parse_args(raw_args)
    torch.cuda.set_device(opt.gpu)
    opt.db_file = os.path.join(opt.data_path, '{}.db'.format(opt.split))
    opt.pre_word_vecs = os.path.join(opt.data_path, 'embedding')

    # Default model/training options, taken from an empty argument list.
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    opts.train_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]
    opt.anno = anno_file_name

    engine = DBEngine(opt.db_file)

    js_list = table.IO.read_anno_json(opt.anno)

    model_files = glob.glob(opt.model_path)
    if not model_files:
        # Without this guard the function would fail at the return statement
        # with an unrelated AttributeError.
        raise FileNotFoundError(
            'no model checkpoint matches {!r}'.format(opt.model_path))

    sql_query = None
    ans_pred = None
    for fn_model in model_files:

        opt.model = fn_model

        translator = Translator(opt, dummy_opt.__dict__)
        data = table.IO.TableDataset(js_list, translator.fields, None, False)
        test_data = table.IO.OrderedIterator(dataset=data,
                                             device=opt.gpu,
                                             batch_size=opt.batch_size,
                                             train=False,
                                             sort=True,
                                             sort_within_batch=False)

        # inference
        r_list = []
        for batch in test_data:
            r_list += translator.translate(batch)
        r_list.sort(key=lambda x: x.idx)

        # Only the prediction with the highest idx is used, paired with the
        # last annotation entry.
        pred = r_list[-1]
        sql_pred = {
            'agg': pred.agg,
            'sel': pred.sel,
            'conds': pred.recover_cond_to_gloss(js_list[-1])
        }
        if verbose:
            print('\n sql_pred: ', sql_pred, '\n')
            print('\n col_headers: ', col_headers, '\n')
        sql_query = Query(sql_pred['sel'], sql_pred['agg'], sql_pred['conds'])
        try:
            ans_pred = engine.execute_query(js_list[-1]['table_id'],
                                            Query.from_dict(sql_pred),
                                            lower=True,
                                            verbose=verbose)
        except Exception:
            # Best effort: a query that cannot be executed yields no answer.
            ans_pred = None
    return sql_query.get_complete_query(col_headers), ans_pred
Beispiel #3
0
    def __init__(
            self,
            modelfile='/data1/struct2text/s2s_models_v3/model_softmax_acc_97.30_ppl_1.41_e7.pt',
            dynamic_dict=True,
            attn_debug=True,
            share_vocab=True,
            replace_unk=True,
            verbose=True):
        """Parse translation options and load the seq2seq model.

        Args:
            modelfile: Path of the model checkpoint to load.
            dynamic_dict: Value stored on ``opt.dynamic_dict``.
            attn_debug: Value stored on ``opt.attn_debug``.
            share_vocab: Value stored on ``opt.share_vocab``.
            replace_unk: Value stored on ``opt.replace_unk``.
            verbose: Value stored on ``opt.verbose``.
        """
        print('Loading ' + modelfile)

        cli = argparse.ArgumentParser(
            description='seq2seq_predict',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        opts.add_md_help_argument(cli)
        opts.translate_opts(cli)

        # parse_known_args collects unrecognised command-line flags instead
        # of aborting on them; they are only reported.
        opt, unknown = cli.parse_known_args()
        print('Unknown arguments ', unknown)

        # Constructor arguments override whatever the parser produced.
        overrides = (
            ('dynamic_dict', dynamic_dict),
            ('attn_debug', attn_debug),
            ('share_vocab', share_vocab),
            ('replace_unk', replace_unk),
            ('verbose', verbose),
        )
        for attr, value in overrides:
            setattr(opt, attr, value)

        # Default model options, taken from an empty argument list.
        model_defaults = argparse.ArgumentParser(description='train.py')
        opts.model_opts(model_defaults)
        default_model_opt, _ = model_defaults.parse_known_args([])

        opt.cuda = opt.gpu > -1
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)

        # Timestamped source file name.
        opt.src = 'temp_seq2seq_pred_%f.txt' % time.time()
        opt.model = modelfile

        print('Loading seq2seq model...')
        # Load the model.
        loaded = onmt.ModelConstructor.load_test_model(
            opt, vars(default_model_opt))

        self.opt = opt
        self.fields, self.model, self.model_opt = loaded
Beispiel #4
0
def translate(src, model, output):
    """Translate a source file and write predictions and source words.

    The n-best predictions of every sentence are written to ``output``; the
    words of every source sentence (as produced by ``get_src_words``) are
    written to ``"gold_" + output``. Both files are closed when the function
    returns or raises.

    Args:
        src: Source data path, stored on ``opt.src``.
        model: Model checkpoint path, stored on ``opt.model``.
        output: Path of the prediction file.
    """
    parser = argparse.ArgumentParser(
        description='translate.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    opts.translate_opts(parser)

    # Options start from the parser defaults (empty argument list); the
    # process command line is not consulted.
    opt = parser.parse_known_args([])[0]
    if opt.batch_size != 1:
        print("WARNING: -batch_size isn't supported currently, "
              "we set it to 1 for now!")
        opt.batch_size = 1

    opt.src = src
    opt.model = model
    opt.output = output

    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    translator = onmt.Translator(opt, dummy_opt.__dict__)

    # Context managers guarantee both files are closed even when translation
    # fails part-way through.
    with codecs.open(opt.output, 'w', 'utf-8') as out_file, \
            codecs.open("gold_" + opt.output, 'w', 'utf-8') as gold_out_file:

        data = onmt.IO.ONMTDataset(opt.src,
                                   opt.tgt,
                                   translator.fields,
                                   use_filter_pred=False)

        test_data = onmt.IO.OrderedIterator(dataset=data,
                                            device=opt.gpu,
                                            batch_size=opt.batch_size,
                                            train=False,
                                            sort=False,
                                            shuffle=False)

        src_vocab = translator.fields["src"].vocab.itos
        for batch in test_data:
            pred_batch, gold_batch, pred_scores, gold_scores, attn, src \
                = translator.translate(batch, data)

            # z_batch: an iterator over the predictions, their scores,
            # the gold sentence, its score, and the source sentence for each
            # sentence in the batch. It has to be zip_longest instead of
            # plain-old zip because the gold_batch has length 0 if the target
            # is not included.
            z_batch = zip_longest(pred_batch, gold_batch, pred_scores,
                                  gold_scores,
                                  (sent.squeeze(1)
                                   for sent in src.split(1, dim=1)))

            for pred_sents, gold_sent, pred_score, gold_score, src_sent \
                    in z_batch:
                n_best_preds = [" ".join(pred)
                                for pred in pred_sents[:opt.n_best]]
                out_file.write('\n'.join(n_best_preds))
                out_file.write('\n')
                out_file.flush()

                words = get_src_words(src_sent, src_vocab)
                gold_out_file.write(words)
                gold_out_file.write('\n')
                gold_out_file.flush()
Beispiel #5
0
from __future__ import division
import os
import argparse
import torch
import codecs
import glob

import table
import table.IO
import opts

# Command-line options for evaluation; parsed once at import time.
parser = argparse.ArgumentParser(description='evaluate.py')
opts.translate_opts(parser)
opt = parser.parse_args()
torch.cuda.set_device(opt.gpu)

# Annotation file, BPE table and embeddings all live in <root_dir>/<dataset>.
_dataset_dir = os.path.join(opt.root_dir, opt.dataset)
opt.anno = os.path.join(_dataset_dir, '{}.json'.format(opt.split))
opt.bpe_path = os.path.join(_dataset_dir, 'bpe.pt')
opt.pre_word_vecs = os.path.join(_dataset_dir, 'embedding')

# A positive beam size forces the batch size to one.
if opt.beam_size > 0:
    opt.batch_size = 1


def get_run_epoch_by_fn(fn_model):
    """Extract the run id and epoch number encoded in a checkpoint path.

    The path is split on ``/``. A component starting with ``run.`` supplies
    the run id (the text after the prefix); a component starting with ``m_``
    supplies the epoch (the text between its first and second underscore).
    When several components match, the last one wins.

    Args:
        fn_model: Checkpoint path, e.g. ``'exp/run.3/m_12'``.

    Returns:
        A tuple ``(run, epoch)`` of ints.

    Raises:
        ValueError: If either component is missing from the path or is not
            an integer.
    """
    run = epoch = None
    for tk in fn_model.split('/'):
        if tk.startswith('run.'):
            run = tk[4:]
        elif tk.startswith('m_'):
            epoch = tk.split('_')[1]
    if run is None or epoch is None:
        # Fail with a descriptive error instead of an UnboundLocalError.
        raise ValueError(
            "expected 'run.<id>' and 'm_<epoch>' path components in "
            "{!r}".format(fn_model))
    return int(run), int(epoch)
Beispiel #6
0
def main(training=False,
         fields=None,
         model=None,
         opt=None,
         writer=None,
         step=0,
         corpus_type="dev",
         multi_process=False):
    """Translate ``opt.src`` and optionally post-process and score the output.

    Args:
        training: When true, reuse the supplied ``fields``, ``model`` and
            ``opt`` (all three are required) and write predictions to a file
            derived from ``opt.save_model``, ``corpus_type`` and ``step``.
            When false, parse options from the command line, load the model
            and write to ``opt.output``.
        fields: Dataset fields (training mode only).
        model: Model to translate with (training mode only).
        opt: Options namespace (training mode only).
        writer: Forwarded to the BLEU report helpers.
        step: Forwarded to the BLEU report helpers; also part of the output
            file name in training mode.
        corpus_type: Forwarded to the BLEU report helpers; also part of the
            output file name in training mode.
        multi_process: Not used by this function.

    Returns:
        The metric of the last report that ran, or ``0`` when none ran.
    """
    timer = Time()
    if training:
        assert fields is not None
        assert model is not None
        assert opt is not None
        model.eval()
        model.generator.eval()
        opt.cuda = opt.gpu > -1
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)
        out_file = codecs.open(
            "{}_{}_pred_{}.txt".format(opt.save_model,
                                       corpus_type.replace("/", "_"),
                                       str(step)), "w", "utf-8")
        print("Output file: ", out_file.name)
        copy_attn = opt.copy_attn  # read but not used further in this function
        model_opt = opt
    else:
        # Load the model.
        parser = argparse.ArgumentParser(
            description='translate.py',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        opts.add_md_help_argument(parser)
        opts.translate_opts(parser)

        opt = parser.parse_args()
        dummy_parser = argparse.ArgumentParser(description='train.py')
        opts.model_opts(dummy_parser)
        dummy_opt = dummy_parser.parse_known_args([])[0]

        opt.cuda = opt.gpu > -1
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)

        fields, model, model_opt = \
            onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__)

        out_file = codecs.open(opt.output, 'w', 'utf-8')

    # NOTE(review): with assertions enabled this makes the `if opt.tgt:`
    # scoring branch below unreachable -- confirm which of the two is meant.
    assert opt.tgt is None
    data = onmt.io.build_dataset(fields,
                                 opt.src,
                                 opt.tgt,
                                 use_filter_pred=False,
                                 ngram=model_opt.ngram)

    # Sort batch by decreasing lengths of sentence required by pytorch.
    # sort=False means "Use dataset's sortkey instead of iterator's".

    data_iter = onmt.io.OrderedIterator(dataset=data,
                                        device=opt.gpu,
                                        batch_size=opt.translate_batch_size,
                                        train=False,
                                        sort=False,
                                        sort_within_batch=True,
                                        shuffle=False)
    output, pred_score_total, pred_words_total = \
            translate_single_process(opt, model, fields, data, data_iter, f=out_file)
    outfile_name = out_file.name

    if opt.bpe:
        import subprocess
        # Raw string: "\@" is not a valid Python escape sequence, so the
        # non-raw spelling triggers an invalid-escape warning. The command
        # text handed to the shell is unchanged.
        subprocess.check_output(r"sed 's/\@\@ //g' < {} > {}".format(
            outfile_name, outfile_name + ".nonbpe"),
                                shell=True)
        outfile_name = outfile_name + ".nonbpe"
    if opt.new_bpe:
        generate_nonbpe(outfile_name)
        outfile_name = outfile_name + ".nonbpe"

    metric = 0
    if opt.tgt:
        # Each enabled report overwrites the metric of the previous one.
        if opt.report_single_bleu:
            metric = _report_single_source_bleu(opt, outfile_name, writer,
                                                step, corpus_type)
        if opt.report_multi_bleu:
            metric = _report_multi_source_bleu(outfile_name, writer, step,
                                               corpus_type)
        if opt.report_rouge:
            metric = _report_rouge(opt)

    timer.timeit(task="Translation Testing")
    return metric
Beispiel #7
0
def get_model_api():
    """Build the request handler of the handwriting-recognition API.

    Translation options are parsed from the command line and the model is
    loaded once; the returned closure reuses them for every request.

    Returns:
        A function ``model_api(input_data)`` mapping a request dict to a
        response dict.
    """

    # initialize model once and for all

    # initialize config for translate
    parser = argparse.ArgumentParser(
        description='translate.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    opts.add_md_help_argument(parser)
    opts.translate_opts(parser)
    opt = parser.parse_args()

    # initialize config for model
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # Load the model.
    fields, model, model_opt = \
        onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__)
    scorer = onmt.translate.GNMTGlobalScorer(opt.alpha, opt.beta,
                                             opt.coverage_penalty,
                                             opt.length_penalty)
    translator = onmt.translate.Translator(model,
                                           fields,
                                           beam_size=opt.beam_size,
                                           n_best=opt.n_best,
                                           global_scorer=scorer,
                                           max_length=opt.max_length,
                                           copy_attn=model_opt.copy_attn,
                                           cuda=opt.cuda,
                                           beam_trace=opt.dump_beam != "",
                                           min_length=opt.min_length)

    # File to write sentences to.
    # NOTE(review): nothing in this function writes to or closes this handle;
    # it is kept because opening it creates/truncates opt.output.
    out_file = codecs.open(opt.output, 'w', 'utf-8')

    def model_api(input_data):
        """Recognise the handwriting of one request.

        Args:
            input_data: Request dict; the keys ``'id'`` and ``'scg_ink'``
                are read.

        Returns:
            A response dict. On failure ``'status'`` is ``'error'``; on
            success it carries the best LaTeX and AsciiMath renderings
            together with the n-best lists.
        """

        # process input
        global hw_count
        global start_0
        res = {}
        request_id = str(uuid.uuid4())
        res['id'] = input_data['id']
        scgink = input_data['scg_ink']
        try:
            scgink_data = ScgImage(scgink, request_id)
        except Exception:
            # Any parsing failure is reported as bad input. Unlike a bare
            # ``except`` this lets KeyboardInterrupt/SystemExit propagate.
            res['status'] = 'error'
            res['info'] = 'bad scgink data'
            return res
        # empty traces due to scgink data
        if not scgink_data.traces:
            res['info'] = 'wrong scgink data'
            res['status'] = 'error'
            return res

        start_t = current_milli_time()

        img_file_path = outdir + '/' + request_id + '_input.png'
        # convert to png format
        scgink_data.save_image(img_file_path)

        # preprocess image
        filename, postfix, processed_img = img_file_path, '.png', outdir + '/' + request_id + '_preprocessed.png'
        crop_blank_default_size, pad_size, buckets, downsample_ratio = [
            600, 60
        ], (8, 8, 8, 8), default_buckets, 2

        preprocess_args = (filename, postfix, processed_img,
                           crop_blank_default_size, pad_size, buckets,
                           downsample_ratio)
        if not preprocess(preprocess_args):
            res['status'] = 'error'
            return res

        # construct data: a one-line source list naming the preprocessed image
        # NOTE(review): temp/test.txt is shared by all requests -- confirm
        # that requests are never served concurrently.
        os.system('echo ' + request_id + '_preprocessed.png ' +
                  '>temp/test.txt')
        src = 'temp/test.txt'
        src_dir = 'temp'
        data = onmt.io.build_dataset(fields,
                                     opt.data_type,
                                     src,
                                     None,
                                     src_dir=src_dir,
                                     sample_rate=opt.sample_rate,
                                     window_size=opt.window_size,
                                     window_stride=opt.window_stride,
                                     window=opt.window,
                                     use_filter_pred=False)

        # Sort batch by decreasing lengths of sentence required by pytorch.
        # sort=False means "Use dataset's sortkey instead of iterator's".
        data_iter = onmt.io.OrderedIterator(dataset=data,
                                            device=opt.gpu,
                                            batch_size=opt.batch_size,
                                            train=False,
                                            sort=False,
                                            sort_within_batch=True,
                                            shuffle=False)

        # Inference
        builder = onmt.translate.TranslationBuilder(data, translator.fields,
                                                    opt.n_best,
                                                    opt.replace_unk, opt.tgt)

        # Predictions of the last translation seen; stays empty when the
        # iterator yields nothing.
        n_best_preds = []
        for batch in data_iter:
            batch_data = translator.translate_batch(batch, data)
            translations = builder.from_batch(batch_data)

            for trans in translations:
                n_best_preds = [
                    " ".join(pred) for pred in trans.pred_sents[:opt.n_best]
                ]

        if not n_best_preds:
            # Previously this case crashed with an unbound local / IndexError.
            res['status'] = 'error'
            res['info'] = 'no translation produced'
            return res

        now_t = current_milli_time()

        # process the output
        n_best_latex = [detokenizer(pred) for pred in n_best_preds]
        n_best_ascii = [latex_asciimath(pred) for pred in n_best_latex]

        # return the output for the api
        # NOTE(review): "succuss" looks like a typo of "success"; the value
        # is kept because API clients may match on it.
        res['status'] = "succuss"
        res['info'] = now_t - start_t
        res['mathml'] = ''
        res['latex'] = n_best_latex[0]
        res['asciimath'] = n_best_ascii[0]
        res['n_best_latex'] = n_best_latex
        res['n_best_ascii'] = n_best_ascii
        app.logger.debug(request_id + "\t" + n_best_latex[0] + "\n")

        return res

    return model_api
Beispiel #8
0
def main():
    """Score every unfolded template with the model and select templates.

    Options are parsed from the command line (``-templates`` is required).
    One score per dataset index is written to ``opt.output + '.score'``,
    after which ``select_templates`` is called with the unfolded source and
    template paths, the score file, ``opt.output`` and ``opt.tgt``.
    """
    parser = argparse.ArgumentParser(
        description='translate.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    opts.add_md_help_argument(parser)
    opts.translate_opts(parser)
    group = parser.add_argument_group('Rerank')
    group.add_argument('-templates',
                       required=True,
                       help="Path to the test templates")
    opt = parser.parse_args()
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # Load the model.
    fields, model, model_opt = \
        model_utils.load_test_model(opt, dummy_opt.__dict__)

    fields["spliter_pos"] = torchtext.data.Field(use_vocab=False,
                                                 dtype=torch.long,
                                                 sequential=False)

    # Unfold templates
    src_path, tmp_path = txt_utils.unfold_templates(opt.src, opt.templates)

    # Test data
    data = txt_utils.build_template_dataset(fields,
                                            src_path,
                                            None,
                                            tmp_path,
                                            use_filter_pred=False,
                                            with_pos=True,
                                            dynamic_dict=False)

    # Sort batch by decreasing lengths of sentence required by pytorch.
    # sort=False means "Use dataset's sortkey instead of iterator's".
    data_iter = onmt.io.OrderedIterator(dataset=data,
                                        device=opt.gpu,
                                        batch_size=opt.batch_size,
                                        train=False,
                                        sort=False,
                                        sort_within_batch=True,
                                        shuffle=False)

    count = 0
    # Batches are reordered by the iterator, so scores are keyed by the
    # dataset index and written out in index order afterwards.
    score_dict = {}
    for batch in data_iter:
        src = onmt.io.make_features(batch, 'src', 'text')
        predict_score = model.predict_rouge(src, batch.src[1],
                                            batch.spliter_pos)
        for index, score in zip(batch.indices.data, predict_score.data):
            score_dict[int(index)] = float(score)
        count += 1
        if count % 100 == 0:
            print('score {} batches'.format(count))

    # File to write scores to.
    score_file = opt.output + '.score'
    print('score_file is ' + score_file)
    # format() instead of "+" so a missing target (None) does not crash
    # this purely informational message.
    print('opt.tgt is {}'.format(opt.tgt))
    print(len(score_dict))
    # The context manager closes the file even if an index is missing.
    with open(score_file, 'w', encoding='utf-8') as out_file:
        for index in range(len(score_dict)):
            print(score_dict[index], file=out_file)
    select_templates(src_path, tmp_path, score_file, opt.output, opt.tgt)
Beispiel #9
0
import torch

from itertools import count

import onmt.io
import onmt.translate
import onmt
import onmt.ModelConstructor
import onmt.modules
import opts

# Build the translation command-line interface and parse it at import time.
parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    description='translate.py')

# Register the markdown-help flag first, then the translation options.
opts.add_md_help_argument(parser)
opts.translate_opts(parser)

opt = parser.parse_args()


def _report_score(name, score_total, words_total):
    """Print the average per-word score and the perplexity derived from it.

    Args:
        name: Label printed in front of both figures.
        score_total: Summed score.
        words_total: Number of words the score was summed over.
    """
    avg_score = score_total / words_total
    perplexity = math.exp(-avg_score)
    report = "%s AVG SCORE: %.4f, %s PPL: %.4f" % (
        name, avg_score, name, perplexity)
    print(report)


def _report_bleu():
    import subprocess
    path = os.path.split(os.path.realpath(__file__))[0]
    print()
    res = subprocess.check_output(