# Shared imports for the functions below. Application-local names (utils,
# user_utils, tasks, translators, data_utils, convert_file_to_utf8, fix_file
# and the File model) are assumed to come from the surrounding project.
import datetime
import os
import re
import shutil
import subprocess
import xml.parsers.expat

import xlsxwriter
from flask import jsonify, request, url_for
from lxml import etree  # or: import xml.etree.ElementTree as etree
from sqlalchemy.orm.exc import NoResultFound
from werkzeug.datastructures import FileStorage
from werkzeug.utils import secure_filename


def evaluate_files():
    mt_files = request.files.getlist('mt_files[]')
    ht_files = request.files.getlist('ht_files[]')
    source_file = request.files.get('source_file')

    line_length = None

    def save_file(file, path, limit=500):
        # Keep at most `limit` lines of the uploaded file
        with open(path, 'w') as output_file:
            for i, line in enumerate(file):
                if i < limit:
                    print(line.decode('utf-8').strip(), file=output_file)

    mt_paths = []
    for mt_file in mt_files:
        mt_path = utils.filepath(
            'FILES_FOLDER',
            utils.normname(user_utils.get_uid(), mt_file.filename))
        save_file(mt_file, mt_path)

        # All uploaded files must share the same number of lines
        if not line_length:
            line_length = utils.file_length(mt_path)
        elif utils.file_length(mt_path) != line_length:
            return {"result": "-1"}

        mt_paths.append(mt_path)

    ht_paths = []
    for ht_file in ht_files:
        ht_path = utils.filepath(
            'FILES_FOLDER',
            utils.normname(user_utils.get_uid(), ht_file.filename))
        save_file(ht_file, ht_path)

        if not line_length:
            line_length = utils.file_length(ht_path)
        elif utils.file_length(ht_path) != line_length:
            return {"result": "-1"}

        ht_paths.append(ht_path)

    source_path = None
    if source_file:
        source_path = utils.filepath(
            'FILES_FOLDER',
            utils.normname(user_utils.get_uid(), source_file.filename))
        save_file(source_file, source_path)

        # Compare against the common line count; line_length also covers the
        # case where no reference files were sent
        if utils.file_length(source_path) != line_length:
            return {"result": "-1"}

    task = tasks.evaluate_files.apply_async(
        args=[user_utils.get_uid(), mt_paths, ht_paths],
        kwargs={'source_path': source_path})

    return {"result": 200, "task_id": task.id}

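
# A hypothetical client-side sketch for the view above: the /evaluate route and
# the local file names are assumptions, while the multipart field names mirror
# the request.files keys the view reads.
def _example_evaluate_request(base_url="http://localhost:5000"):
    import requests

    with open('mt_output.txt', 'rb') as mt, \
            open('reference.txt', 'rb') as ht, \
            open('source.txt', 'rb') as src:
        files = [('mt_files[]', mt), ('ht_files[]', ht), ('source_file', src)]
        return requests.post(base_url + "/evaluate", files=files).json()
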
def generate_xlsx(user_id, rows, ht_path_index):
    file_name = utils.normname(user_id, "evaluation") + ".xlsx"
    file_path = utils.tmpfile(file_name)

    workbook = xlsxwriter.Workbook(file_path)
    worksheet = workbook.add_worksheet()

    x_rows = []
    for i, row in enumerate(rows):
        # Line number, plus the source sentence (row[6]) when one was provided
        x_row = [i + 1]
        if len(row) > 6:
            x_row = [i + 1, row[6]]

        # One column per MT system, then the reference, then the scores
        for mt_data in row[5]:
            x_row.append(mt_data['text'])
        x_row.append(row[1])
        for mt_data in row[5]:
            x_row.append(mt_data['bleu'])
        for mt_data in row[5]:
            x_row.append(mt_data['ter'])

        x_rows.append(x_row)

    # Header layout mirrors the shape of the last data row
    headers = ["Line"]
    headers += ["Source sentence"] if len(row) > 6 else []
    headers += ["Machine translation {}".format(i + 1) for i in range(len(row[5]))]
    headers += ["Reference {}".format(ht_path_index + 1)]
    headers += ["Bleu MT{}".format(i + 1) for i in range(len(row[5]))]
    headers += ["TER MT{}".format(i + 1) for i in range(len(row[5]))]
    x_rows = [headers] + x_rows

    for row_cursor, row in enumerate(x_rows):
        for col_cursor, col in enumerate(row):
            worksheet.write(row_cursor, col_cursor, col)

    workbook.close()

    return file_path

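
# A minimal usage sketch with hypothetical data: each row carries the reference
# at index 1, a list of per-system dicts with 'text'/'bleu'/'ter' keys at
# index 5, and optionally the source sentence at index 6, matching what
# generate_xlsx consumes above. The sentences and scores are illustrative only.
def _example_generate_xlsx():
    sample_rows = [
        (0, "the house is blue", None, None, None,
         [{'text': "the home is blue", 'bleu': 65.8, 'ter': 25.0}],
         "la casa es azul"),
    ]
    return generate_xlsx(user_id=1, rows=sample_rows, ht_path_index=0)
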
def train_start():
    if user_utils.is_normal():
        return url_for('index')

    engine_path = os.path.join(
        user_utils.get_user_folder("engines"),
        utils.normname(user_utils.get_user().username, request.form['nameText']))

    task = tasks.launch_training.apply_async(args=[
        user_utils.get_uid(), engine_path,
        # Keep list-valued fields (names ending in '[]') as lists and collapse
        # everything else to its first value
        {i[0]: i[1] if i[0].endswith('[]') else i[1][0]
         for i in request.form.lists()}
    ])

    return jsonify({
        "result": 200,
        "launching_url": url_for('train.train_launching', task_id=task.id)
    })

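
# A small sketch of the form-flattening rule used above, applied to a plain
# werkzeug MultiDict; the field names are made up for illustration.
def _example_flatten_form():
    from werkzeug.datastructures import MultiDict

    form = MultiDict([('nameText', 'my-engine'), ('gpus[]', '0'), ('gpus[]', '1')])
    return {k: v if k.endswith('[]') else v[0] for k, v in form.lists()}
    # -> {'nameText': 'my-engine', 'gpus[]': ['0', '1']}
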
def process_bitext(file):
    file_name, file_extension = os.path.splitext(file.filename)
    # user_id is expected to be bound in the enclosing scope
    norm_name = utils.normname(user_id=user_id, filename=file_name)
    tmp_file_fd, tmp_path = utils.tmpfile()

    file.save(tmp_path)

    if file_extension == ".tmx":
        with open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'wb') as src_file, \
                open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'wb') as trg_file, \
                open(tmp_path, 'r') as tmx_file:
            tmx = etree.parse(tmx_file, etree.XMLParser())
            body = tmx.getroot().find("body")
            for tu in body.findall('.//tu'):
                for i, tuv in enumerate(tu.findall('.//tuv')):
                    if i > 1:
                        break  # Only the first two variants: source and target

                    line = tuv.find("seg").text.strip()
                    line = re.sub(r'[\r\n\t\f\v]', " ", line)

                    dest_file = src_file if i == 0 else trg_file
                    dest_file.write(line.encode('utf-8'))
                    dest_file.write(os.linesep.encode('utf-8'))
    else:
        # We assume it is a TSV
        with open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'wb') as src_file, \
                open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'wb') as trg_file, \
                open(tmp_path, 'r') as tmp_file:
            for line in tmp_file:
                cols = line.strip().split('\t')
                src_file.write((cols[0] + '\n').encode('utf-8'))
                trg_file.write((cols[1] + '\n').encode('utf-8'))

    src_file = open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'rb')
    trg_file = open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'rb')

    return FileStorage(src_file, filename=file.filename + "-src"), \
        FileStorage(trg_file, filename=file.filename + "-trg")

def upload_file():
    engine_id = request.form.get('engine_id')
    user_file = request.files.get('user_file')
    as_tmx = request.form.get('as_tmx') == 'true'
    tmx_mode = request.form.get('tmx_mode')

    key = utils.normname(user_utils.get_uid(), user_file.filename)
    this_upload = user_utils.get_user_folder(key)

    # Start from a clean per-upload folder
    try:
        os.mkdir(this_upload)
    except FileExistsError:
        shutil.rmtree(this_upload)
        os.mkdir(this_upload)

    user_file_path = os.path.join(this_upload, secure_filename(user_file.filename))
    user_file.save(user_file_path)

    translators.set_admin(user_utils.is_admin())
    translation_task_id = translators.translate_file(
        user_utils.get_uid(), engine_id, user_file_path, as_tmx, tmx_mode)

    return jsonify({
        "result": 200,
        "task_id": translation_task_id
    })

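
# Why secure_filename matters in the view above: it strips path tricks from
# user-supplied names before they reach os.path.join. The expected values below
# come from the werkzeug documentation.
def _example_secure_filename():
    from werkzeug.utils import secure_filename

    assert secure_filename("../../../etc/passwd") == "etc_passwd"
    assert secure_filename("My cool movie.mov") == "My_cool_movie.mov"
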
def process_bitext(file):
    file_name, file_extension = os.path.splitext(file.filename)
    # user_id is expected to be bound in the enclosing scope
    norm_name = utils.normname(user_id=user_id, filename=file_name)
    tmp_file_fd, tmp_path = utils.tmpfile()

    file.save(tmp_path)
    data_utils.convert_file_to_utf8(tmp_path)
    data_utils.fix_file(tmp_path)

    if file_extension == ".tmx":
        with open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'w') as src_file, \
                open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'w') as trg_file, \
                open(tmp_path, 'rb') as tmx_file:
            # Stream the TMX with expat: collect character data inside <seg>
            # elements and flush a (source, target) pair on every second segment
            inside_tuv = False
            seg_text = []
            tu = []

            def se(name, _):
                nonlocal inside_tuv
                if name == "seg":
                    inside_tuv = True

            def lp(line):
                return re.sub(r'[\r\n\t\f\v]', " ", line.strip())

            def ee(name):
                nonlocal inside_tuv, seg_text, tu
                if name == "seg":
                    inside_tuv = False
                    tu.append("".join(seg_text))
                    seg_text = []
                    if len(tu) == 2:
                        print(lp(tu[0]), file=src_file)
                        print(lp(tu[1]), file=trg_file)
                        tu = []

            def cd(data):
                if inside_tuv:
                    seg_text.append(data)

            parser = xml.parsers.expat.ParserCreate()
            parser.StartElementHandler = se
            parser.EndElementHandler = ee
            parser.CharacterDataHandler = cd
            parser.ParseFile(tmx_file)
    else:
        # We assume it is a TSV
        with open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'wb') as src_file, \
                open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'wb') as trg_file, \
                open(tmp_path, 'r') as tmp_file:
            for line in tmp_file:
                cols = line.strip().split('\t')
                src_file.write((cols[0] + '\n').encode('utf-8'))
                trg_file.write((cols[1] + '\n').encode('utf-8'))

    src_file = open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'rb')
    trg_file = open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'rb')

    return FileStorage(src_file, filename=file.filename + "-src"), \
        FileStorage(trg_file, filename=file.filename + "-trg")

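
# A self-contained sketch of the expat streaming pattern used above, run on an
# in-memory TMX fragment instead of a file; the sample markup is illustrative.
def _example_expat_tmx():
    sample = (b'<tmx><body><tu>'
              b'<tuv><seg>Hello</seg></tuv>'
              b'<tuv><seg>Hola</seg></tuv>'
              b'</tu></body></tmx>')

    inside_seg = False
    seg_text = []
    pairs, tu = [], []

    def start(name, _attrs):
        nonlocal inside_seg
        if name == "seg":
            inside_seg = True

    def end(name):
        nonlocal inside_seg, seg_text, tu
        if name == "seg":
            inside_seg = False
            tu.append("".join(seg_text))
            seg_text = []
            if len(tu) == 2:
                pairs.append(tuple(tu))
                tu = []

    def chars(data):
        if inside_seg:
            seg_text.append(data)

    parser = xml.parsers.expat.ParserCreate()
    parser.StartElementHandler = start
    parser.EndElementHandler = end
    parser.CharacterDataHandler = chars
    parser.Parse(sample, True)

    return pairs  # [('Hello', 'Hola')]
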
def upload_file(file, language, format="text", selected_size=None, offset=None, user_id=None):
    user_id = user_id if user_id else user_utils.get_uid()
    norm_name = utils.normname(user_id=user_id, filename=file.filename)
    path = utils.filepath('FILES_FOLDER', norm_name)

    def new_file(file, path, selected_size=None):
        # We save it
        file.seek(0)
        file.save(path)

        # Convert whatever format this has to UTF-8
        convert_file_to_utf8(path)
        fix_file(path)

        hash = utils.hash(file)

        if selected_size is not None:
            # We keep only the requested number of sentences
            crop_path = "{}.crop".format(path)
            if offset:
                crop_process = subprocess.Popen(
                    "cat {} | head -n {} | tail -n {} > {}".format(
                        path, int(offset) + int(selected_size), selected_size, crop_path),
                    shell=True)
                crop_process.wait()
            else:
                crop_process = subprocess.Popen(
                    "cat {} | head -n {} > {}".format(path, selected_size, crop_path),
                    shell=True)
                crop_process.wait()

            os.remove(path)
            shutil.move(crop_path, path)

            with open(path, 'r') as crop_file:
                hash = utils.hash(crop_file)

        # Get file stats
        wc_output = subprocess.check_output('wc -lwc {}'.format(path), shell=True)
        wc_output_search = re.search(r'^(\s*)(\d+)(\s+)(\d+)(\s+)(\d+)(.*)$',
                                     wc_output.decode("utf-8"))
        lines, words, chars = (wc_output_search.group(2),
                               wc_output_search.group(4),
                               wc_output_search.group(6))

        # Save in DB
        db_file = File(path=path, name=file.filename, user_language_id=language,
                       hash=hash, uploader_id=user_id,
                       lines=lines, words=words, chars=chars,
                       uploaded=datetime.datetime.utcnow())

        return db_file

    if selected_size is not None:
        return new_file(file, path, selected_size)

    # Could we already have it stored?
    hash = utils.hash(file)
    query = File.query.filter_by(hash=hash)

    try:
        db_file = query.first()
        if db_file is None:
            raise NoResultFound

        # We did have it: hard-link the new path to the existing file
        # instead of re-uploading
        os.link(db_file.path, path)
        db_file = File(path=path, name=file.filename, uploaded=db_file.uploaded,
                       hash=hash, uploader_id=user_id,
                       language_id=db_file.language_id,
                       lines=db_file.lines, words=db_file.words, chars=db_file.chars)
    except NoResultFound:
        db_file = new_file(file, path)

    return db_file

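
# A minimal, self-contained sketch of the dedup idea used above: identical
# content is detected by hash and hard-linked instead of being stored twice.
# The hashing scheme, paths and names here are hypothetical.
def _example_dedup_link(upload_bytes, store_dir, upload_name):
    import hashlib

    digest = hashlib.sha256(upload_bytes).hexdigest()
    canonical = os.path.join(store_dir, digest)

    if not os.path.exists(canonical):
        with open(canonical, 'wb') as f:
            f.write(upload_bytes)

    alias = os.path.join(store_dir, upload_name)
    if not os.path.exists(alias):
        # Same inode as the canonical copy: no duplicate bytes on disk
        os.link(canonical, alias)

    return alias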