# Standard-library and third-party imports used across this module; the
# app-local helpers (app, utils, user_utils, data_utils, tasks, File) are
# assumed to be importable from the surrounding package.
import datetime
import os
import re
import shutil
import subprocess
import xml.parsers.expat

from flask import request
from lxml import etree
from sqlalchemy.orm.exc import NoResultFound
from werkzeug.datastructures import FileStorage


def evaluate_files():
    mt_files = request.files.getlist('mt_files[]')
    ht_files = request.files.getlist('ht_files[]')
    source_file = request.files.get('source_file')

    line_length = None

    def save_file(file, path, limit=500):
        # Keep at most `limit` lines of the uploaded file
        with open(path, 'w') as output_file:
            for i, line in enumerate(file):
                if i < limit:
                    print(line.decode('utf-8').strip(), file=output_file)

    # Save the MT hypotheses, checking that all files have the same number of lines
    mt_paths = []
    for mt_file in mt_files:
        mt_path = utils.filepath(
            'FILES_FOLDER',
            utils.normname(user_utils.get_uid(), mt_file.filename))
        save_file(mt_file, mt_path)

        if not line_length:
            line_length = utils.file_length(mt_path)
        elif utils.file_length(mt_path) != line_length:
            return {"result": "-1"}

        mt_paths.append(mt_path)

    # Save the human reference translations, with the same length check
    ht_paths = []
    for ht_file in ht_files:
        ht_path = utils.filepath(
            'FILES_FOLDER',
            utils.normname(user_utils.get_uid(), ht_file.filename))
        save_file(ht_file, ht_path)

        if not line_length:
            line_length = utils.file_length(ht_path)
        elif utils.file_length(ht_path) != line_length:
            return {"result": "-1"}

        ht_paths.append(ht_path)

    # Optional source file; it must match the length of the other files
    if source_file:
        source_path = utils.filepath(
            'FILES_FOLDER',
            utils.normname(user_utils.get_uid(), source_file.filename))
        save_file(source_file, source_path)

        if utils.file_length(source_path) != line_length:
            return {"result": "-1"}

    # Launch the evaluation asynchronously and hand the task id back to the client
    task = tasks.evaluate_files.apply_async(
        args=[user_utils.get_uid(), mt_paths, ht_paths],
        kwargs={'source_path': source_path if source_file else None})

    return {"result": 200, "task_id": task.id}
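# --- Illustrative sketch, not part of the original module ---
# Shows one way a status endpoint could poll the Celery task launched by
# evaluate_files(); the function name and response shape are assumptions,
# only tasks.evaluate_files comes from the code above.
def evaluate_files_status(task_id):
    result = tasks.evaluate_files.AsyncResult(task_id)  # look the task up by its id
    if result.ready():
        # .get() returns whatever the task produced (e.g. the computed metrics)
        return {"result": 200, "evaluation": result.get()}
    return {"result": "pending"}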
def shuffle_sentences(corpus):
    source_files = [f.file for f in corpus.corpus_files if f.role == "source"]
    target_files = [f.file for f in corpus.corpus_files if f.role == "target"]

    # Only shuffle single-file corpora
    if len(source_files) == 1 and len(target_files) == 1:
        source_file, target_file = source_files[0], target_files[0]

        # Paste source and target side by side so one shuffle keeps them aligned
        shuff_proc = subprocess.Popen(
            "paste {} {} | shuf > mut.{}.shuf".format(source_file.path,
                                                      target_file.path,
                                                      corpus.id),
            shell=True,
            cwd=app.config['TMP_FOLDER'])
        shuff_proc.wait()

        # Split the shuffled file back into its source and target columns
        extract_source = subprocess.Popen(
            "cat mut.{}.shuf | awk -F '\\t' '{{ print $1 }}' > {}".format(
                corpus.id, source_file.path),
            shell=True,
            cwd=app.config['TMP_FOLDER'])
        extract_source.wait()

        extract_target = subprocess.Popen(
            "cat mut.{}.shuf | awk -F '\\t' '{{ print $2 }}' > {}".format(
                corpus.id, target_file.path),
            shell=True,
            cwd=app.config['TMP_FOLDER'])
        extract_target.wait()

        os.remove(
            utils.filepath('TMP_FOLDER',
                           filename='mut.{}.shuf'.format(corpus.id)))
    else:
        raise Exception("Corpora with multiple files cannot be shuffled")
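# --- Illustrative sketch, not part of the original module ---
# Pure-Python equivalent of the paste | shuf pipeline above, included only to
# make the alignment guarantee explicit: both sides are shuffled with the same
# permutation. Takes plain paths and uses no app helpers.
def shuffle_parallel(source_path, target_path):
    import random
    with open(source_path) as src, open(target_path) as trg:
        pairs = list(zip(src.readlines(), trg.readlines()))  # aligned sentence pairs
    random.shuffle(pairs)                                     # one permutation for both files
    with open(source_path, 'w') as src, open(target_path, 'w') as trg:
        for src_line, trg_line in pairs:
            src.write(src_line)
            trg.write(trg_line)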
def train_tokenizer(engine, corpus, vocabularySize=32000):
    model_path = os.path.join(engine.path, 'train.model')
    vocab_path = os.path.join(engine.path, 'train.vocab')

    try:
        # Skip training if the engine already has a tokenizer model and vocabulary
        os.stat(model_path)
        os.stat(vocab_path)
    except OSError:
        files_list = [file_entry.file.path for file_entry in corpus.corpus_files]
        files = " ".join(files_list)

        # Train on a random sample of at most 10M sentences from the corpus
        random_sample_path = utils.tmpfile(
            filename="{}.mut.10m".format(corpus.id))
        cat_proc = subprocess.Popen(
            "cat {} | shuf | head -n 10000000 > {}".format(
                files, random_sample_path),
            shell=True)
        cat_proc.wait()

        train_proc = subprocess.Popen(
            "spm_train --input={} --model_prefix=mut.{} --vocab_size={} "
            "--hard_vocab_limit=false".format(random_sample_path, corpus.id,
                                              vocabularySize),
            cwd=utils.filepath('TMP_FOLDER'),
            shell=True)
        train_proc.wait()

        shutil.move(
            utils.filepath('TMP_FOLDER', "mut.{}.model".format(corpus.id)),
            model_path)
        shutil.move(
            utils.filepath('TMP_FOLDER', "mut.{}.vocab".format(corpus.id)),
            vocab_path)

        os.remove(random_sample_path)

        # Keep only the token column of the vocabulary (drop the scores column)
        purge_vocab = subprocess.Popen(
            "cat {} | awk -F '\\t' '{{ print $1 }}' > {}.purged".format(
                vocab_path, vocab_path),
            shell=True)
        purge_vocab.wait()

        os.remove(vocab_path)
        shutil.move("{}.purged".format(vocab_path), vocab_path)

    return model_path, vocab_path
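# --- Illustrative sketch, not part of the original module ---
# How the model produced by train_tokenizer() could be used from Python,
# assuming the `sentencepiece` bindings are installed; the original pipeline
# shells out to the spm_* command line tools instead.
def tokenize_line(model_path, line):
    import sentencepiece as spm
    sp = spm.SentencePieceProcessor(model_file=model_path)  # load train.model
    return sp.encode(line, out_type=str)                    # list of subword pieces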
def process_bitext(file):
    # `user_id` is expected to be available from the enclosing scope
    # (e.g. the upload request handler that defines this helper).
    file_name, file_extension = os.path.splitext(file.filename)
    norm_name = utils.normname(user_id=user_id, filename=file_name)
    tmp_file_fd, tmp_path = utils.tmpfile()
    file.save(tmp_path)

    if file_extension == ".tmx":
        # Split the TMX translation units into a source file and a target file
        with open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'wb') as src_file, \
             open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'wb') as trg_file, \
             open(tmp_path, 'r') as tmx_file:

            tmx = etree.parse(tmx_file, etree.XMLParser())
            body = tmx.getroot().find("body")

            for tu in body.findall('.//tu'):
                for i, tuv in enumerate(tu.findall('.//tuv')):
                    if i > 1:
                        break

                    line = tuv.find("seg").text.strip()
                    line = re.sub(r'[\r\n\t\f\v]', " ", line)

                    dest_file = src_file if i == 0 else trg_file
                    dest_file.write(line.encode('utf-8'))
                    dest_file.write(os.linesep.encode('utf-8'))
    else:
        # We assume it is a TSV: first column is the source, second is the target
        with open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'wb') as src_file, \
             open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'wb') as trg_file, \
             open(tmp_path, 'r') as tmp_file:

            for line in tmp_file:
                cols = line.strip().split('\t')
                src_file.write((cols[0] + '\n').encode('utf-8'))
                trg_file.write((cols[1] + '\n').encode('utf-8'))

    src_file = open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'rb')
    trg_file = open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'rb')

    return FileStorage(src_file, filename=file.filename + "-src"), \
           FileStorage(trg_file, filename=file.filename + "-trg")
def process_bitext(file):
    # Streaming variant of process_bitext(): the TMX is parsed with expat
    # instead of being loaded whole with lxml, so large files do not have to
    # fit in memory. `user_id` again comes from the enclosing scope.
    file_name, file_extension = os.path.splitext(file.filename)
    norm_name = utils.normname(user_id=user_id, filename=file_name)
    tmp_file_fd, tmp_path = utils.tmpfile()
    file.save(tmp_path)

    data_utils.convert_file_to_utf8(tmp_path)
    data_utils.fix_file(tmp_path)

    if file_extension == ".tmx":
        with open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'w') as src_file, \
             open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'w') as trg_file, \
             open(tmp_path, 'rb') as tmx_file:

            inside_tuv = False
            seg_text = []
            tu = []

            def se(name, _):
                # Start of a <seg> element: start collecting character data
                nonlocal inside_tuv
                if name == "seg":
                    inside_tuv = True

            def lp(line):
                # Collapse line breaks and tabs inside a segment into spaces
                return re.sub(r'[\r\n\t\f\v]', " ", line.strip())

            def ee(name):
                # End of a <seg>: close the segment; once a translation unit
                # has two segments, write them to the source and target files
                nonlocal inside_tuv, seg_text, tu, src_file
                if name == "seg":
                    inside_tuv = False
                    tu.append("".join(seg_text))
                    seg_text = []
                    if len(tu) == 2:
                        print(lp(tu[0]), file=src_file)
                        print(lp(tu[1]), file=trg_file)
                        tu = []

            def cd(data):
                nonlocal inside_tuv, seg_text
                if inside_tuv:
                    seg_text.append(data)

            parser = xml.parsers.expat.ParserCreate()
            parser.StartElementHandler = se
            parser.EndElementHandler = ee
            parser.CharacterDataHandler = cd
            parser.ParseFile(tmx_file)
    else:
        # We assume it is a TSV: first column is the source, second is the target
        with open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'wb') as src_file, \
             open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'wb') as trg_file, \
             open(tmp_path, 'r') as tmp_file:

            for line in tmp_file:
                cols = line.strip().split('\t')
                src_file.write((cols[0] + '\n').encode('utf-8'))
                trg_file.write((cols[1] + '\n').encode('utf-8'))

    src_file = open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'rb')
    trg_file = open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'rb')

    return FileStorage(src_file, filename=file.filename + "-src"), \
           FileStorage(trg_file, filename=file.filename + "-trg")
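# --- Illustrative sketch, not part of the original module ---
# The same streaming-expat pattern as process_bitext() on a tiny in-memory TMX,
# shown only to make the handler flow concrete; the sample markup is made up.
def expat_demo():
    import io
    import xml.parsers.expat

    tmx = (b'<tmx><body><tu>'
           b'<tuv><seg>Hello world</seg></tuv>'
           b'<tuv><seg>Hola mundo</seg></tuv>'
           b'</tu></body></tmx>')

    pairs, segs, inside = [], [], False

    def start(name, attrs):
        nonlocal inside
        if name == "seg":
            inside = True
            segs.append("")

    def chars(data):
        if inside:
            segs[-1] += data

    def end(name):
        nonlocal inside
        if name == "seg":
            inside = False
        elif name == "tu" and len(segs) >= 2:
            pairs.append((segs[0], segs[1]))  # one aligned pair per translation unit
            segs.clear()

    parser = xml.parsers.expat.ParserCreate()
    parser.StartElementHandler = start
    parser.CharacterDataHandler = chars
    parser.EndElementHandler = end
    parser.ParseFile(io.BytesIO(tmx))
    return pairs  # [('Hello world', 'Hola mundo')]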
def upload_file(file, language, format="text", selected_size=None, offset=None,
                user_id=None):
    user_id = user_id if user_id else user_utils.get_uid()

    norm_name = utils.normname(user_id=user_id, filename=file.filename)
    path = utils.filepath('FILES_FOLDER', norm_name)

    def new_file(file, path, selected_size=None):
        # We save it
        file.seek(0)
        file.save(path)

        # Convert whatever format this has to UTF-8
        convert_file_to_utf8(path)
        fix_file(path)

        hash = utils.hash(file)

        if selected_size is not None:
            # We keep only the requested number of sentences
            crop_path = "{}.crop".format(path)

            if offset:
                crop_process = subprocess.Popen(
                    "cat {} "
                    "| head -n {} "
                    "| tail -n {} > {}".format(path,
                                               int(offset) + int(selected_size),
                                               selected_size, crop_path),
                    shell=True)
                crop_process.wait()
            else:
                crop_process = subprocess.Popen(
                    "cat {} | head -n {} > {}".format(path, selected_size,
                                                      crop_path),
                    shell=True)
                crop_process.wait()

            os.remove(path)
            shutil.move(crop_path, path)

            with open(path, 'r') as crop_file:
                hash = utils.hash(crop_file)

        # Get file stats (lines, words, characters)
        wc_output = subprocess.check_output('wc -lwc {}'.format(path),
                                            shell=True)
        wc_output_search = re.search(r'^(\s*)(\d+)(\s+)(\d+)(\s+)(\d+)(.*)$',
                                     wc_output.decode("utf-8"))
        lines, words, chars = (wc_output_search.group(2),
                               wc_output_search.group(4),
                               wc_output_search.group(6))

        # Save in DB
        db_file = File(path=path, name=file.filename, user_language_id=language,
                       hash=hash, uploader_id=user_id, lines=lines, words=words,
                       chars=chars, uploaded=datetime.datetime.utcnow())

        return db_file

    if selected_size is not None:
        return new_file(file, path, selected_size)
    else:
        # Could we already have it stored?
        hash = utils.hash(file)
        query = File.query.filter_by(hash=hash)

        db_file = None
        try:
            db_file = query.first()
            if db_file is None:
                raise NoResultFound

            # We did have it: link a new path to the existing file instead of re-uploading
            os.link(db_file.path, path)

            db_file = File(path=path, name=file.filename,
                           uploaded=db_file.uploaded, hash=hash,
                           uploader_id=user_id,
                           language_id=db_file.language_id,
                           lines=db_file.lines, words=db_file.words,
                           chars=db_file.chars)
        except NoResultFound:
            db_file = new_file(file, path)

        return db_file
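# --- Illustrative sketch, not part of the original module ---
# How a request handler might call upload_file(); the function name, form field
# names and the `db` session handle are assumptions about the surrounding app,
# only upload_file() itself comes from the code above.
def handle_upload():
    uploaded = request.files.get('file')     # incoming file from the form
    language = request.form.get('language')  # user language identifier
    db_file = upload_file(uploaded, language)
    db.session.add(db_file)                  # `db` assumed to be the app's SQLAlchemy handle
    db.session.commit()
    return {"result": 200, "file_id": db_file.id}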