def test_training(self, engine_id):
    try:
        engine = Engine.query.filter_by(id=engine_id).first()

        # Path of the detokenized target-side test file of the corpus
        # this engine was trained on.
        test_dec_file = Corpus_File.query.filter_by(role="target") \
                            .filter(Corpus_File.corpus_id.in_(
                                db.session.query(Corpus_Engine.corpus_id)
                                    .filter_by(engine_id=engine_id, phase="test", is_info=False))) \
                            .first().file.path

        _, hyps_tmp_file = utils.tmpfile()
        _, test_crop_file = utils.tmpfile()

        # Translate (at most) the first 2000 test sentences with JoeyNMT.
        joey_translate = subprocess.Popen("cat {} | head -n 2000 | python3 -m joeynmt translate {} > {}"
                                          .format(os.path.join(engine.path, 'test.' + engine.source.code),
                                                  os.path.join(engine.path, 'config.yaml'),
                                                  hyps_tmp_file),
                                          cwd=app.config['JOEYNMT_FOLDER'], shell=True)
        joey_translate.wait()

        # Undo the SentencePiece segmentation of the hypotheses.
        decode_hyps = subprocess.Popen("cat {} | head -n 2000 | spm_decode --model={} --input_format=piece > {}.dec"
                                       .format(hyps_tmp_file,
                                               os.path.join(engine.path, 'train.model'),
                                               hyps_tmp_file),
                                       cwd=app.config['MUTNMT_FOLDER'], shell=True)
        decode_hyps.wait()

        # Crop the reference to the same 2000 lines.
        crop_test = subprocess.Popen("cat {} | head -n 2000 > {}".format(test_dec_file, test_crop_file),
                                     cwd=app.config['MUTNMT_FOLDER'], shell=True)
        crop_test.wait()

        # Score the decoded hypotheses against the cropped reference;
        # `-b` makes sacrebleu print the bare score only.
        sacre_bleu = subprocess.Popen("cat {}.dec | sacrebleu -b {}".format(hyps_tmp_file, test_crop_file),
                                      cwd=app.config['MUTNMT_FOLDER'], shell=True, stdout=subprocess.PIPE)
        sacre_bleu.wait()
        score = float(sacre_bleu.stdout.readline().decode("utf-8"))

        engine.test_task_id = None
        engine.test_score = score
        db.session.commit()

        return {"bleu": score}
    except Exception:
        db.session.rollback()
        raise
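# A self-contained sketch of just the scoring step above, for reference.
# Assumes `sacrebleu` is on PATH and that the two paths point at line-aligned
# plain-text files; the function name and arguments are illustrative, not part
# of the original module.
def score_bleu(hyps_path, ref_path):
    import subprocess
    result = subprocess.run("cat {} | sacrebleu -b {}".format(hyps_path, ref_path),
                            shell=True, capture_output=True, text=True)
    # With -b, sacrebleu prints nothing but the score, e.g. "34.1\n".
    return float(result.stdout.strip())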
def process_upload_request(user_id, bitext_file, src_file, trg_file, src_lang,
                           trg_lang, corpus_name, corpus_desc, corpus_topic):
    # A single bitext file takes precedence over a pair of monolingual files.
    upload_type = "bitext" if bitext_file else "bilingual"

    bitext_path = None
    src_path = None
    trg_path = None

    if upload_type == "bitext":
        bitext_path = utils.tmpfile(filename=bitext_file.filename)
        bitext_file.save(bitext_path)
    else:
        src_path = utils.tmpfile(filename=src_file.filename)
        src_file.save(src_path)

        trg_path = utils.tmpfile(filename=trg_file.filename)
        trg_file.save(trg_path)

    # Hand the heavy lifting off to a background task and return its id
    # so the caller can poll for progress.
    task = tasks.process_upload_request.apply_async(args=[
        user_id, bitext_path, src_path, trg_path, src_lang, trg_lang,
        corpus_name, corpus_desc, corpus_topic
    ])

    return task.id
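# Hypothetical caller, assuming this runs inside a Flask view where the
# uploads arrive as werkzeug FileStorage objects; the form field names below
# are illustrative only:
#
#   task_id = process_upload_request(
#       current_user.id,
#       request.files.get('bitext_file'),
#       request.files.get('src_file'), request.files.get('trg_file'),
#       request.form.get('src_lang'), request.form.get('trg_lang'),
#       request.form.get('corpus_name'), request.form.get('corpus_desc'),
#       request.form.get('corpus_topic'))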
def generate_xlsx(user_id, rows, ht_path_index):
    file_name = utils.normname(user_id, "evaluation") + ".xlsx"
    file_path = utils.tmpfile(file_name)

    workbook = xlsxwriter.Workbook(file_path)
    worksheet = workbook.add_worksheet()

    x_rows = []
    for i, row in enumerate(rows):
        # Rows with more than six columns carry the source sentence at index 6.
        x_row = [i + 1, row[6]] if len(row) > 6 else [i + 1]

        for mt_data in row[5]:
            x_row.append(mt_data['text'])

        x_row.append(row[1])  # reference sentence

        for mt_data in row[5]:
            x_row.append(mt_data['bleu'])
        for mt_data in row[5]:
            x_row.append(mt_data['ter'])

        x_rows.append(x_row)

    # Build the header row from the shape of the last data row
    # (assumes `rows` is non-empty and homogeneous).
    headers = ["Line"]
    headers += ["Source sentence"] if len(row) > 6 else []
    headers += ["Machine translation {}".format(i + 1) for i in range(len(row[5]))]
    headers += ["Reference {}".format(ht_path_index + 1)]
    headers += ["Bleu MT{}".format(i + 1) for i in range(len(row[5]))]
    headers += ["TER MT{}".format(i + 1) for i in range(len(row[5]))]
    x_rows = [headers] + x_rows

    for row_cursor, x_row in enumerate(x_rows):
        for col_cursor, col in enumerate(x_row):
            worksheet.write(row_cursor, col_cursor, col)

    workbook.close()

    return file_path
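# The indexing above implies each entry of `rows` looks roughly like this;
# the concrete values are illustrative (only indices 1, 5 and 6 are read):
#
#   row = [
#       ...,                        # 0: unused here
#       "the reference sentence",   # 1: human reference
#       ..., ..., ...,              # 2-4: unused here
#       [                           # 5: one dict per MT system
#           {"text": "an mt output", "bleu": 34.1, "ter": 52.7},
#       ],
#       "the source sentence",      # 6: optional source column
#   ]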
def tmx_builder(self, user_id, sentences):
    engine = RunningEngines.query.filter_by(user_id=user_id).first().engine
    source_lang = engine.source.code
    target_lang = engine.target.code

    xml_lang = etree.QName("http://www.w3.org/XML/1998/namespace", "lang")

    with open(os.path.join(app.config['BASE_CONFIG_FOLDER'], 'base.tmx'), 'r') as tmx_file:
        tmx = etree.parse(tmx_file, etree.XMLParser())
        body = tmx.getroot().find("body")

        for sentence in sentences:
            tu = etree.Element("tu")

            tuv_source = etree.Element("tuv", {xml_lang: source_lang})
            seg_source = etree.Element("seg")
            seg_source.text = sentence.get('source')
            tuv_source.append(seg_source)
            tu.append(tuv_source)

            for target_sentence in sentence.get('target'):
                tuv_target = etree.Element("tuv", {xml_lang: target_lang})
                seg_target = etree.Element("seg")
                seg_target.text = target_sentence
                tuv_target.append(seg_target)
                tu.append(tuv_target)

            body.append(tu)

        tmx_path = utils.tmpfile('{}.{}-{}.tmx'.format(user_id, source_lang, target_lang))
        tmx.write(tmx_path, encoding="UTF-8", xml_declaration=True)

        # Pretty-print the result in place with xmllint.
        format_proc = subprocess.Popen("xmllint --format {} > {}.format".format(tmx_path, tmx_path),
                                       shell=True)
        format_proc.wait()
        shutil.move("{}.format".format(tmx_path), tmx_path)

        return tmx_path
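# The `sentences` argument is expected to look like this, judging from the
# .get() calls above (values illustrative); each target variant becomes its
# own <tuv> inside the same <tu>:
#
#   sentences = [
#       {"source": "Hello world",
#        "target": ["Hola mundo", "Hola, mundo"]},
#   ]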
def train_tokenizer(engine, corpus, vocabularySize=32000):
    model_path = os.path.join(engine.path, 'train.model')
    vocab_path = os.path.join(engine.path, 'train.vocab')

    # Only train if the model or the vocab is missing.
    try:
        os.stat(model_path)
        os.stat(vocab_path)
    except OSError:
        files = " ".join(file_entry.file.path for file_entry in corpus.corpus_files)

        # SentencePiece is trained on a shuffled sample of at most 10M lines.
        random_sample_path = utils.tmpfile(filename="{}.mut.10m".format(corpus.id))
        cat_proc = subprocess.Popen("cat {} | shuf | head -n 10000000 > {}".format(files, random_sample_path),
                                    shell=True)
        cat_proc.wait()

        train_proc = subprocess.Popen("spm_train --input={} --model_prefix=mut.{} --vocab_size={} --hard_vocab_limit=false"
                                      .format(random_sample_path, corpus.id, vocabularySize),
                                      cwd=utils.filepath('TMP_FOLDER'), shell=True)
        train_proc.wait()

        shutil.move(utils.filepath('TMP_FOLDER', "mut.{}.model".format(corpus.id)), model_path)
        shutil.move(utils.filepath('TMP_FOLDER', "mut.{}.vocab".format(corpus.id)), vocab_path)
        os.remove(random_sample_path)

        # spm_train writes "<piece>\t<score>" lines; keep only the pieces.
        purge_vocab = subprocess.Popen("cat {} | awk -F '\\t' '{{ print $1 }}' > {}.purged".format(vocab_path, vocab_path),
                                       shell=True)
        purge_vocab.wait()
        os.remove(vocab_path)
        shutil.move("{}.purged".format(vocab_path), vocab_path)

    return model_path, vocab_path
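# A minimal sketch of loading and applying the model trained above via the
# SentencePiece Python bindings (assumes `pip install sentencepiece`; the
# function name is illustrative, not part of the original module):
def encode_with_tokenizer(model_path, text):
    import sentencepiece as spm
    sp = spm.SentencePieceProcessor(model_file=model_path)
    return sp.encode(text, out_type=str)  # e.g. ['▁Hel', 'lo', '▁world']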
def process_bitext(file):
    file_name, file_extension = os.path.splitext(file.filename)
    norm_name = utils.normname(user_id=user_id, filename=file_name)

    tmp_file_fd, tmp_path = utils.tmpfile()
    file.save(tmp_path)

    if file_extension == ".tmx":
        with open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'wb') as src_file, \
                open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'wb') as trg_file, \
                open(tmp_path, 'r') as tmx_file:
            tmx = etree.parse(tmx_file, etree.XMLParser())
            body = tmx.getroot().find("body")
            for tu in body.findall('.//tu'):
                for i, tuv in enumerate(tu.findall('.//tuv')):
                    if i > 1:
                        break
                    line = tuv.find("seg").text.strip()
                    line = re.sub(r'[\r\n\t\f\v]', " ", line)
                    dest_file = src_file if i == 0 else trg_file
                    dest_file.write(line.encode('utf-8'))
                    dest_file.write(os.linesep.encode('utf-8'))
    else:
        # We assume it is a TSV
        with open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'wb') as src_file, \
                open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'wb') as trg_file, \
                open(tmp_path, 'r') as tmp_file:
            for line in tmp_file:
                cols = line.strip().split('\t')
                src_file.write((cols[0] + '\n').encode('utf-8'))
                trg_file.write((cols[1] + '\n').encode('utf-8'))

    src_file = open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'rb')
    trg_file = open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'rb')

    return FileStorage(src_file, filename=file.filename + "-src"), \
        FileStorage(trg_file, filename=file.filename + "-trg")
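# Note: the lxml branch above materialises the whole TMX as a DOM, so memory
# grows with file size; the revised version of this function below swaps it
# for expat's streaming callbacks. The TMX input both variants expect is
# shaped like this (minimal, illustrative):
#
#   <tmx version="1.4"><body>
#     <tu>
#       <tuv xml:lang="en"><seg>Hello</seg></tuv>
#       <tuv xml:lang="es"><seg>Hola</seg></tuv>
#     </tu>
#   </body></tmx>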
def process_bitext(file):
    file_name, file_extension = os.path.splitext(file.filename)
    norm_name = utils.normname(user_id=user_id, filename=file_name)

    tmp_file_fd, tmp_path = utils.tmpfile()
    file.save(tmp_path)
    data_utils.convert_file_to_utf8(tmp_path)
    data_utils.fix_file(tmp_path)

    if file_extension == ".tmx":
        with open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'w') as src_file, \
                open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'w') as trg_file, \
                open(tmp_path, 'rb') as tmx_file:
            # Stream the TMX with expat instead of building a DOM: collect the
            # text of each <seg>, and flush a source/target pair to disk every
            # time two segments have been gathered.
            inside_seg = False
            seg_text = []
            tu = []

            def clean(line):
                return re.sub(r'[\r\n\t\f\v]', " ", line.strip())

            def start_element(name, _):
                nonlocal inside_seg
                if name == "seg":
                    inside_seg = True

            def end_element(name):
                nonlocal inside_seg, seg_text, tu
                if name == "seg":
                    inside_seg = False
                    tu.append("".join(seg_text))
                    seg_text = []
                    if len(tu) == 2:
                        print(clean(tu[0]), file=src_file)
                        print(clean(tu[1]), file=trg_file)
                        tu = []

            def char_data(data):
                if inside_seg:
                    seg_text.append(data)

            parser = xml.parsers.expat.ParserCreate()
            parser.StartElementHandler = start_element
            parser.EndElementHandler = end_element
            parser.CharacterDataHandler = char_data
            parser.ParseFile(tmx_file)
    else:
        # We assume it is a TSV
        with open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'wb') as src_file, \
                open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'wb') as trg_file, \
                open(tmp_path, 'r') as tmp_file:
            for line in tmp_file:
                cols = line.strip().split('\t')
                if len(cols) < 2:
                    continue  # skip malformed lines instead of crashing
                src_file.write((cols[0] + '\n').encode('utf-8'))
                trg_file.write((cols[1] + '\n').encode('utf-8'))

    src_file = open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'rb')
    trg_file = open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'rb')

    return FileStorage(src_file, filename=file.filename + "-src"), \
        FileStorage(trg_file, filename=file.filename + "-trg")
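# A self-contained sanity check of the expat wiring used above, runnable on
# its own; the sample snippet and function name are illustrative:
def _expat_demo():
    import xml.parsers.expat
    sample = (b"<tu><tuv><seg>Hello</seg></tuv>"
              b"<tuv><seg>Hola</seg></tuv></tu>")
    segs, buf, in_seg = [], [], False

    def start(name, attrs):
        nonlocal in_seg
        if name == "seg":
            in_seg = True

    def end(name):
        nonlocal in_seg
        if name == "seg":
            in_seg = False
            segs.append("".join(buf))
            buf.clear()

    def chars(data):
        if in_seg:
            buf.append(data)

    parser = xml.parsers.expat.ParserCreate()
    parser.StartElementHandler = start
    parser.EndElementHandler = end
    parser.CharacterDataHandler = chars
    parser.Parse(sample, True)
    return segs  # -> ['Hello', 'Hola']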