Ejemplo n.º 1
0
def test_training(self, engine_id):
    """Score an engine on its test set with sacreBLEU and store the result.

    The first 2000 lines of the engine's ``test.<source code>`` file are
    translated with JoeyNMT, the hypotheses are decoded with ``spm_decode``
    using the engine's ``train.model``, and the decoded output is scored with
    ``sacrebleu -b`` against the first 2000 lines of the target-side test
    file. The score is saved on ``engine.test_score`` and
    ``engine.test_task_id`` is cleared.

    Args:
        self: Unused here (presumably the bound task instance -- confirm
            against the decorator, which is outside this view).
        engine_id: Primary key of the ``Engine`` to evaluate.

    Returns:
        dict: ``{"bleu": <float score>}``.

    Raises:
        Exception: Any error is re-raised after the DB session is rolled back.
    """
    try:
        engine = Engine.query.filter_by(id=engine_id).first()
        test_dec_file = Corpus_File.query.filter_by(role = "target") \
                        .filter(Corpus_File.corpus_id.in_(db.session.query(Corpus_Engine.corpus_id) \
                        .filter_by(engine_id=engine_id, phase = "test", is_info=False))).first().file.path

        _, hyps_tmp_file = utils.tmpfile()
        _, test_crop_file = utils.tmpfile()

        # Translate the (cropped) source side of the test set.
        joey_translate = subprocess.Popen("cat {} | head -n 2000 | python3 -m joeynmt translate {} > {}" \
                                            .format(os.path.join(engine.path, 'test.' + engine.source.code), os.path.join(engine.path, 'config.yaml'), hyps_tmp_file),
                                            cwd=app.config['JOEYNMT_FOLDER'], shell=True)
        joey_translate.wait()

        # Turn sentencepiece pieces back into plain text (written to "<hyps>.dec").
        decode_hyps = subprocess.Popen("cat {} | head -n 2000 | spm_decode --model={} --input_format=piece > {}.dec" \
                                            .format(hyps_tmp_file, os.path.join(engine.path, 'train.model'), hyps_tmp_file),
                                            cwd=app.config['MUTNMT_FOLDER'], shell=True)
        decode_hyps.wait()

        # Crop the reference to the same 2000 lines that were translated.
        crop_test = subprocess.Popen("cat {} | head -n 2000 > {}".format(
            test_dec_file, test_crop_file),
                                     cwd=app.config['MUTNMT_FOLDER'],
                                     shell=True)
        crop_test.wait()

        sacrebleu_proc = subprocess.Popen("cat {}.dec | sacrebleu -b {}".format(
            hyps_tmp_file, test_crop_file),
                                          cwd=app.config['MUTNMT_FOLDER'],
                                          shell=True,
                                          stdout=subprocess.PIPE)
        # communicate() drains and closes the pipe while waiting; calling
        # wait() on a process with stdout=PIPE can deadlock once the pipe
        # buffer fills, and leaves the pipe open afterwards.
        output, _ = sacrebleu_proc.communicate()

        # Only the first output line carries the score.
        score = float(output.split(b"\n", 1)[0].decode("utf-8"))

        engine.test_task_id = None
        engine.test_score = score
        db.session.commit()

        return {"bleu": score}
    except Exception:
        db.session.rollback()
        raise
Ejemplo n.º 2
0
def process_upload_request(user_id, bitext_file, src_file, trg_file, src_lang,
                           trg_lang, corpus_name, corpus_desc, corpus_topic):
    """Save the uploaded corpus file(s) and queue the processing task.

    When ``bitext_file`` is truthy only that file is saved; otherwise both
    ``src_file`` and ``trg_file`` are saved. Each upload is written to the
    path returned by ``utils.tmpfile(filename=<upload filename>)``. Paths that
    do not apply are passed to the task as ``None``.

    Returns:
        The id of the ``tasks.process_upload_request`` task that was queued.
    """

    def stash(upload):
        # Persist one upload and hand back where it was written.
        destination = utils.tmpfile(filename=upload.filename)
        upload.save(destination)
        return destination

    bitext_path = src_path = trg_path = None

    if bitext_file:
        bitext_path = stash(bitext_file)
    else:
        src_path = stash(src_file)
        trg_path = stash(trg_file)

    task_args = [
        user_id, bitext_path, src_path, trg_path, src_lang, trg_lang,
        corpus_name, corpus_desc, corpus_topic
    ]
    return tasks.process_upload_request.apply_async(args=task_args).id
Ejemplo n.º 3
0
def generate_xlsx(user_id, rows, ht_path_index):
    """Write evaluation rows to an XLSX file and return its path.

    Each entry of ``rows`` is indexed as follows by this function:
    ``row[1]`` is written under the "Reference" column, ``row[5]`` is a
    sequence of dicts with ``'text'``, ``'bleu'`` and ``'ter'`` keys (one per
    machine translation), and ``row[6]``, when present, is written under
    "Source sentence".

    The header layout (whether a source column exists and how many MT columns
    there are) is taken from the last row, as before. With no rows at all the
    sheet contains only the "Line" and "Reference" headers instead of failing
    on an undefined loop variable.

    Args:
        user_id: Used to build the output file name via ``utils.normname``.
        rows: Iterable of row sequences as described above.
        ht_path_index: Zero-based index of the reference; shown one-based in
            the "Reference N" header.

    Returns:
        Path of the written workbook, as returned by ``utils.tmpfile``.
    """
    file_name = utils.normname(user_id, "evaluation") + ".xlsx"
    file_path = utils.tmpfile(file_name)

    workbook = xlsxwriter.Workbook(file_path)
    worksheet = workbook.add_worksheet()

    x_rows = []
    last_row = None  # remembered explicitly so empty input is well-defined
    for line_number, row in enumerate(rows, start=1):
        last_row = row
        x_row = [line_number]

        if len(row) > 6:
            x_row.append(row[6])

        x_row.extend(mt_data['text'] for mt_data in row[5])
        x_row.append(row[1])
        x_row.extend(mt_data['bleu'] for mt_data in row[5])
        x_row.extend(mt_data['ter'] for mt_data in row[5])

        x_rows.append(x_row)

    has_source = last_row is not None and len(last_row) > 6
    mt_count = len(last_row[5]) if last_row is not None else 0
    mt_numbers = range(1, mt_count + 1)

    headers = ["Line"]
    if has_source:
        headers.append("Source sentence")
    headers.extend("Machine translation {}".format(n) for n in mt_numbers)
    headers.append("Reference {}".format(ht_path_index + 1))
    headers.extend("Bleu MT{}".format(n) for n in mt_numbers)
    headers.extend("TER MT{}".format(n) for n in mt_numbers)

    for row_cursor, sheet_row in enumerate([headers] + x_rows):
        for col_cursor, col in enumerate(sheet_row):
            worksheet.write(row_cursor, col_cursor, col)

    workbook.close()

    return file_path
Ejemplo n.º 4
0
    def tmx_builder(self, user_id, sentences):
        """Build a TMX file from translated sentences and return its path.

        The language codes come from the engine of the first ``RunningEngines``
        row for ``user_id``. ``base.tmx`` from ``BASE_CONFIG_FOLDER`` is used as
        the template; one ``<tu>`` is appended to its ``<body>`` per entry in
        ``sentences``, holding one source ``<tuv>`` (``sentence['source']``)
        followed by one target ``<tuv>`` per item in ``sentence['target']``.
        The result is written to a path from ``utils.tmpfile`` and then
        re-indented in place with ``xmllint --format``.
        """
        engine = RunningEngines.query.filter_by(user_id=user_id).first().engine
        source_lang = engine.source.code
        target_lang = engine.target.code

        def make_tuv(lang, text):
            # <tuv xml:lang="..."><seg>text</seg></tuv>
            lang_attr = etree.QName("http://www.w3.org/XML/1998/namespace",
                                    "lang")
            tuv = etree.Element("tuv", {lang_attr: lang})
            seg = etree.Element("seg")
            seg.text = text
            tuv.append(seg)
            return tuv

        template_path = os.path.join(app.config['BASE_CONFIG_FOLDER'],
                                     'base.tmx')
        with open(template_path, 'r') as tmx_file:
            tmx = etree.parse(tmx_file, etree.XMLParser())

        body = tmx.getroot().find("body")
        for sentence in sentences:
            tu = etree.Element("tu")
            tu.append(make_tuv(source_lang, sentence.get('source')))
            for target_sentence in sentence.get('target'):
                tu.append(make_tuv(target_lang, target_sentence))
            body.append(tu)

        tmx_path = utils.tmpfile('{}.{}-{}.tmx'.format(user_id, source_lang,
                                                       target_lang))
        tmx.write(tmx_path, encoding="UTF-8", xml_declaration=True)

        # xmllint cannot rewrite its input in place, so format into a sibling
        # file and move that over the original afterwards.
        command = "xmllint --format {} > {}.format".format(tmx_path, tmx_path)
        subprocess.Popen(command, shell=True).wait()

        shutil.move("{}.format".format(tmx_path), tmx_path)

        return tmx_path
Ejemplo n.º 5
0
def train_tokenizer(engine, corpus, vocabularySize=32000):
    """Return the engine's SentencePiece model/vocab, training them if needed.

    If both ``train.model`` and ``train.vocab`` already exist under
    ``engine.path`` nothing is trained. Otherwise all files of ``corpus`` are
    concatenated, shuffled and cropped to 10,000,000 lines, ``spm_train`` is
    run on that sample inside ``TMP_FOLDER``, the resulting model and vocab
    are moved into ``engine.path``, and the vocab file is reduced to its first
    tab-separated column.

    Args:
        engine: Object exposing ``path`` (directory for the model files).
        corpus: Object exposing ``id`` and ``corpus_files`` (each entry with a
            ``file.path``). Not touched when the model already exists.
        vocabularySize: Value passed to ``spm_train --vocab_size``.

    Returns:
        tuple: ``(model_path, vocab_path)``.
    """
    import shlex

    model_path = os.path.join(engine.path, 'train.model')
    vocab_path = os.path.join(engine.path, 'train.vocab')

    # Explicit existence check instead of a bare ``except:`` around os.stat,
    # which also swallowed KeyboardInterrupt/SystemExit.
    if os.path.exists(model_path) and os.path.exists(vocab_path):
        return model_path, vocab_path

    def quote(path):
        # Paths are interpolated into shell command lines below; quoting keeps
        # spaces and shell metacharacters in them from breaking the command.
        return shlex.quote(str(path))

    files = " ".join(
        quote(file_entry.file.path) for file_entry in corpus.corpus_files)
    random_sample_path = utils.tmpfile(
        filename="{}.mut.10m".format(corpus.id))
    cat_proc = subprocess.Popen(
        "cat {} | shuf | head -n 10000000 > {}".format(
            files, quote(random_sample_path)),
        shell=True)
    cat_proc.wait()

    # spm_train writes mut.<id>.model / mut.<id>.vocab into its working dir.
    train_proc = subprocess.Popen("spm_train --input={} --model_prefix=mut.{} --vocab_size={} --hard_vocab_limit=false" \
                    .format(quote(random_sample_path), corpus.id, vocabularySize),
                    cwd=utils.filepath('TMP_FOLDER'), shell=True)
    train_proc.wait()

    shutil.move(
        utils.filepath('TMP_FOLDER', "mut.{}.model".format(corpus.id)),
        model_path)
    shutil.move(
        utils.filepath('TMP_FOLDER', "mut.{}.vocab".format(corpus.id)),
        vocab_path)
    os.remove(random_sample_path)

    # Keep only the first tab-separated column of every vocab line.
    purged_path = "{}.purged".format(vocab_path)
    purge_vocab = subprocess.Popen(
        "cat {} | awk -F '\\t' '{{ print $1 }}' > {}".format(
            quote(vocab_path), quote(purged_path)),
        shell=True)
    purge_vocab.wait()

    os.remove(vocab_path)
    shutil.move(purged_path, vocab_path)

    return model_path, vocab_path
Ejemplo n.º 6
0
    def process_bitext(file):
        """Split an uploaded bitext into parallel source and target files.

        The upload is saved to a temporary path and then read either as TMX
        (extension ``.tmx``) or, for any other extension, as TSV with the
        source in the first column and the target in the second. Output goes
        to ``<norm_name>-src`` and ``<norm_name>-trg`` under ``FILES_FOLDER``,
        one segment per line, UTF-8 encoded. ``user_id`` is taken from the
        enclosing scope.

        Only complete pairs are written so both files always have the same
        number of lines: a ``<tu>`` with fewer than two ``<tuv>`` children and
        a TSV line with fewer than two columns are skipped. Only the first two
        ``<tuv>`` of a ``<tu>`` are used, and a missing or empty ``<seg>``
        yields an empty line.

        Returns:
            tuple: two ``FileStorage`` objects wrapping the source and target
            files, opened for binary reading.
        """
        file_name, file_extension = os.path.splitext(file.filename)
        norm_name = utils.normname(user_id=user_id, filename=file_name)
        _, tmp_path = utils.tmpfile()
        file.save(tmp_path)

        src_path = utils.filepath('FILES_FOLDER', norm_name + "-src")
        trg_path = utils.filepath('FILES_FOLDER', norm_name + "-trg")

        if file_extension == ".tmx":
            with open(src_path, 'wb') as src_file, \
                    open(trg_path, 'wb') as trg_file, \
                    open(tmp_path, 'r') as tmx_file:
                tmx = etree.parse(tmx_file, etree.XMLParser())
                body = tmx.getroot().find("body")

                for tu in body.findall('.//tu'):
                    segments = []
                    for tuv in tu.findall('.//tuv')[:2]:
                        seg = tuv.find("seg")
                        # <seg/> has text None; a <tuv> may lack <seg>.
                        text = (seg.text or "") if seg is not None else ""
                        segments.append(
                            re.sub(r'[\r\n\t\f\v]', " ", text.strip()))

                    if len(segments) < 2:
                        # Writing a lone source line would misalign the files.
                        continue

                    for dest_file, line in zip((src_file, trg_file), segments):
                        dest_file.write(line.encode('utf-8'))
                        dest_file.write(os.linesep.encode('utf-8'))
        else:
            # We assume it is a TSV
            with open(src_path, 'wb') as src_file, \
                    open(trg_path, 'wb') as trg_file, \
                    open(tmp_path, 'r') as tmp_file:
                for line in tmp_file:
                    cols = line.strip().split('\t')
                    if len(cols) < 2:
                        # Blank line or line without a tab: no pair to write.
                        continue
                    src_file.write((cols[0] + '\n').encode('utf-8'))
                    trg_file.write((cols[1] + '\n').encode('utf-8'))

        # These handles are left open on purpose: they are handed to the
        # returned FileStorage objects.
        src_file = open(src_path, 'rb')
        trg_file = open(trg_path, 'rb')

        return FileStorage(src_file, filename=file.filename + "-src"), \
                FileStorage(trg_file, filename=file.filename + "-trg")
Ejemplo n.º 7
0
    def process_bitext(file):
        """Split an uploaded bitext into parallel source and target files.

        The upload is saved to a temporary path, passed through
        ``data_utils.convert_file_to_utf8`` and ``data_utils.fix_file``, and
        then read either as TMX (extension ``.tmx``, streamed with expat) or,
        for any other extension, as TSV with the source in the first column
        and the target in the second. Output goes to ``<norm_name>-src`` and
        ``<norm_name>-trg`` under ``FILES_FOLDER``, one segment per line,
        UTF-8 encoded. ``user_id`` is taken from the enclosing scope.

        Pairing is done per translation unit: the collected segments are reset
        at every ``<tu>`` start and only the first two ``<seg>`` of a unit are
        written, so a unit with one or three segments no longer shifts the
        pairing of every unit after it. TSV lines with fewer than two columns
        are skipped.

        Returns:
            tuple: two ``FileStorage`` objects wrapping the source and target
            files, opened for binary reading.
        """
        file_name, file_extension = os.path.splitext(file.filename)
        norm_name = utils.normname(user_id=user_id, filename=file_name)
        _, tmp_path = utils.tmpfile()
        file.save(tmp_path)

        data_utils.convert_file_to_utf8(tmp_path)
        data_utils.fix_file(tmp_path)

        src_path = utils.filepath('FILES_FOLDER', norm_name + "-src")
        trg_path = utils.filepath('FILES_FOLDER', norm_name + "-trg")

        if file_extension == ".tmx":
            # Explicit UTF-8 so the output does not depend on the locale and
            # matches what the TSV branch writes.
            with open(src_path, 'w', encoding='utf-8') as src_file, \
                    open(trg_path, 'w', encoding='utf-8') as trg_file, \
                    open(tmp_path, 'rb') as tmx_file:
                inside_seg = False
                seg_text = []
                tu = []  # segments collected for the current <tu>

                def lp(line):
                    return re.sub(r'[\r\n\t\f\v]', " ", line.strip())

                def se(name, _):
                    nonlocal inside_seg
                    if name == "tu":
                        # New translation unit: drop leftovers from the
                        # previous one so pairs never span two units.
                        tu.clear()
                    elif name == "seg":
                        inside_seg = True

                def ee(name):
                    nonlocal inside_seg, seg_text
                    if name != "seg":
                        return

                    inside_seg = False
                    tu.append("".join(seg_text))
                    seg_text = []

                    # Fires once per unit, when the second segment closes;
                    # any further segments of the same unit are ignored.
                    if len(tu) == 2:
                        print(lp(tu[0]), file=src_file)
                        print(lp(tu[1]), file=trg_file)

                def cd(data):
                    # expat may deliver one segment's text in several chunks.
                    if inside_seg:
                        seg_text.append(data)

                parser = xml.parsers.expat.ParserCreate()
                parser.StartElementHandler = se
                parser.EndElementHandler = ee
                parser.CharacterDataHandler = cd
                parser.ParseFile(tmx_file)

        else:
            # We assume it is a TSV
            # NOTE(review): read as UTF-8 because the file has just gone
            # through convert_file_to_utf8 -- confirm that helper's contract.
            with open(src_path, 'wb') as src_file, \
                    open(trg_path, 'wb') as trg_file, \
                    open(tmp_path, 'r', encoding='utf-8') as tmp_file:
                for line in tmp_file:
                    cols = line.strip().split('\t')
                    if len(cols) < 2:
                        # Blank line or line without a tab: no pair to write.
                        continue
                    src_file.write((cols[0] + '\n').encode('utf-8'))
                    trg_file.write((cols[1] + '\n').encode('utf-8'))

        # These handles are left open on purpose: they are handed to the
        # returned FileStorage objects.
        src_file = open(src_path, 'rb')
        trg_file = open(trg_path, 'rb')

        return FileStorage(src_file, filename=file.filename + "-src"), \
                FileStorage(trg_file, filename=file.filename + "-trg")