def evaluate_files():
    """Save uploaded MT / HT / source files (truncated to 500 lines each),
    check that all of them have the same number of lines, and enqueue an
    async evaluation task.

    Returns {"result": "-1"} on any line-count mismatch, otherwise
    {"result": 200, "task_id": <celery task id>}.
    """
    mt_files = request.files.getlist('mt_files[]')
    ht_files = request.files.getlist('ht_files[]')
    source_file = request.files.get('source_file')
    line_length = None

    def save_file(file, path, limit=500):
        # Persist at most `limit` lines of the uploaded stream as UTF-8 text.
        with open(path, 'w') as output_file:
            for i, line in enumerate(file):
                if i < limit:
                    print(line.decode('utf-8').strip(), file=output_file)

    def save_and_check(file):
        # Save one uploaded file and validate it against the common line
        # count. Returns the saved path, or None on a mismatch.
        nonlocal line_length
        path = utils.filepath(
            'FILES_FOLDER',
            utils.normname(user_utils.get_uid(), file.filename))
        save_file(file, path)
        length = utils.file_length(path)
        if line_length is None:
            line_length = length
        elif length != line_length:
            return None
        return path

    mt_paths = []
    for mt_file in mt_files:
        mt_path = save_and_check(mt_file)
        if mt_path is None:
            return {"result": "-1"}
        mt_paths.append(mt_path)

    ht_paths = []
    for ht_file in ht_files:
        ht_path = save_and_check(ht_file)
        if ht_path is None:
            return {"result": "-1"}
        ht_paths.append(ht_path)

    source_path = None
    if source_file:
        source_path = utils.filepath(
            'FILES_FOLDER',
            utils.normname(user_utils.get_uid(), source_file.filename))
        save_file(source_file, source_path)
        # BUG FIX: the original compared against `ht_path`, which is unbound
        # (NameError) when no HT files were uploaded; compare against the
        # shared line count established by the MT/HT files instead.
        if line_length is not None and utils.file_length(source_path) != line_length:
            return {"result": "-1"}

    task = tasks.evaluate_files.apply_async(
        args=[user_utils.get_uid(), mt_paths, ht_paths],
        kwargs={'source_path': source_path})
    return {"result": 200, "task_id": task.id}
def library_share_toggle(type, id):
    """Flip the `public` flag of one of the current user's library resources.

    `type` selects the resource kind: "library_corpora" targets a Corpus
    owned by the user; any other value targets an Engine they uploaded.
    Redirects back to the referring page either way.
    """
    if type == "library_corpora":
        db_resource = Corpus.query.filter_by(
            owner_id=user_utils.get_uid(), id=id).first()
    else:
        db_resource = Engine.query.filter_by(
            uploader_id=user_utils.get_uid(), id=id).first()
    # Guard: .first() returns None when the resource does not exist or is not
    # owned by this user — the original crashed with AttributeError here.
    if db_resource is not None:
        db_resource.public = not db_resource.public
        db.session.commit()
    return redirect(request.referrer)
def library_ungrab(type, id):
    """Remove a grabbed corpus or engine from the current user's library.

    `type` "library_corpora" removes a LibraryCorpora entry; any other value
    removes a LibraryEngine entry. Redirects back to the referrer.
    """
    user = User.query.filter_by(id=user_utils.get_uid()).first()
    if type == "library_corpora":
        library = LibraryCorpora.query.filter_by(
            corpus_id=id, user_id=user_utils.get_uid()).first()
        # Guard: .first() may return None (entry already removed); the
        # original raised ValueError from list.remove(None).
        if library is not None:
            user.user_corpora.remove(library)
    else:
        library = LibraryEngine.query.filter_by(
            engine_id=id, user_id=user_utils.get_uid()).first()
        if library is not None:
            user.user_engines.remove(library)
    db.session.commit()
    return redirect(request.referrer)
def inspect_compare_text():
    """Launch a comparison translation of one line across several engines.

    Returns the translation task id as the raw response body.
    """
    text_line = request.form.get('line')
    selected_engines = request.form.getlist('engines[]')
    translators.set_admin(user_utils.is_admin())
    return translators.get_compare(
        user_utils.get_uid(), text_line, selected_engines)
def translate_index():
    """Render the text-translation page with the user's usable engines.

    An engine is usable when its training has stopped or finished.
    """
    usable_states = ("stopped", "finished", "stopped_admin")
    engines = (LibraryEngine.query
               .filter_by(user_id=user_utils.get_uid())
               .join(Engine, LibraryEngine.engine)
               .filter(Engine.status.in_(usable_states))
               .order_by(Engine.uploaded.desc())
               .all())
    return render_template('translate.html.jinja2',
                           page_name='translate_text',
                           page_title='Translate',
                           engines=engines)
def inspect_access():
    """Render the inspect/access page with the user's usable engines.

    Mirrors translate_index: only stopped or finished engines are listed.
    """
    usable_states = ("stopped", "finished", "stopped_admin")
    engines = (LibraryEngine.query
               .filter_by(user_id=user_utils.get_uid())
               .join(Engine, LibraryEngine.engine)
               .filter(Engine.status.in_(usable_states))
               .order_by(Engine.uploaded.desc())
               .all())
    return render_template('access.inspect.html.jinja2',
                           page_name='inspect_access',
                           page_title='Access',
                           engines=engines)
def translate_text():
    """Enqueue an async translation of a list of text lines.

    Reads `engine_id` and `text[]` from the POSTed form and returns a JSON
    payload with the task id so the client can poll for the result.
    """
    engine_id = request.form.get('engine_id')
    lines = request.form.getlist('text[]')
    # NOTE: removed the original's unused local `detached = True`.
    translators.set_admin(user_utils.is_admin())
    translation_task_id = translators.text(
        user_utils.get_uid(), engine_id, lines)
    return jsonify({"result": 200, "task_id": translation_task_id})
def inspect_details():
    """Launch a detailed inspection translation for a single line.

    Returns the translation task id as the raw response body.
    """
    text_line = request.form.get('line')
    primary_engine = request.form.get('engine_id')
    selected_engines = request.form.getlist('engines[]')
    translators.set_admin(user_utils.is_admin())
    return translators.get_inspect(
        user_utils.get_uid(), primary_engine, text_line, selected_engines)
def as_tmx():
    """Enqueue generation of a TMX file from translated text.

    `chain_engine_id` is optional; the client sends the string "false" when
    no chain engine is selected, which is normalized to None here.
    """
    engine_id = request.form.get('engine_id')
    chain_engine_id = request.form.get('chain_engine_id')
    if not chain_engine_id or chain_engine_id == "false":
        chain_engine_id = None
    text = request.form.getlist('text[]')
    translators.set_admin(user_utils.is_admin())
    translation_task_id = translators.generate_tmx(
        user_utils.get_uid(), engine_id, chain_engine_id, text)
    return jsonify({"result": 200, "task_id": translation_task_id})
def train_index():
    """Render the training setup page, or redirect to an in-progress job.

    Non-privileged users are sent back to the index. If the user already has
    an engine training or launching, they are redirected to its console /
    launch screen instead of the setup form.
    """
    if user_utils.is_normal():
        return redirect(url_for('index'))

    uid = user_utils.get_uid()

    # Already training? Jump straight to that engine's console.
    training = Engine.query.filter_by(uploader_id=uid) \
        .filter(Engine.status.like("training")).all()
    if training:
        return redirect(url_for('train.train_console', id=training[0].id))

    # Still launching? Jump to the launching screen for that task.
    launching = Engine.query.filter_by(uploader_id=uid) \
        .filter(Engine.status.like("launching")).all()
    if launching:
        return redirect(url_for('train.train_launching',
                                task_id=launching[0].bg_task_id))

    # Propose a random engine name that is not taken yet (max 5 attempts,
    # falling back to an empty name).
    random_name = namegenerator.gen()
    attempts = 0
    while len(Engine.query.filter_by(name=random_name).all()):
        random_name = namegenerator.gen()
        attempts += 1
        if attempts >= 5:
            random_name = ""
            break
    random_name = " ".join(random_name.split("-")[:2])

    # Only bilingual corpora can be used for training.
    bilingual_entries = user_utils.get_user_corpora().filter(
        LibraryCorpora.corpus.has(Corpus.type == "bilingual")).all()
    corpora = [entry.corpus for entry in bilingual_entries]
    languages = UserLanguage.query.filter_by(user_id=current_user.id) \
        .order_by(UserLanguage.name).all()

    return render_template('train.html.jinja2',
                           page_name='train',
                           page_title='Train',
                           corpora=corpora,
                           random_name=random_name,
                           languages=languages)
def upload_file():
    """Accept a document upload and enqueue an async file-translation task.

    The file is stored in a per-upload folder (recreated if it already
    exists) and handed to the translators service; returns JSON with the
    translation task id.
    """
    engine_id = request.form.get('engine_id')
    user_file = request.files.get('user_file')
    as_tmx = request.form.get('as_tmx') == 'true'
    tmx_mode = request.form.get('tmx_mode')

    key = utils.normname(user_utils.get_uid(), user_file.filename)
    this_upload = user_utils.get_user_folder(key)

    # Recreate the per-upload folder if it already exists. Narrowed from the
    # original bare `except:` so unrelated failures (permissions, missing
    # parent, KeyboardInterrupt) are no longer silently retried/swallowed.
    try:
        os.mkdir(this_upload)
    except FileExistsError:
        shutil.rmtree(this_upload)
        os.mkdir(this_upload)

    user_file_path = os.path.join(this_upload,
                                  secure_filename(user_file.filename))
    user_file.save(user_file_path)

    translators.set_admin(user_utils.is_admin())
    translation_task_id = translators.translate_file(
        user_utils.get_uid(), engine_id, user_file_path, as_tmx, tmx_mode)
    return jsonify({"result": 200, "task_id": translation_task_id})
def library_grab(type, id):
    """Add a public corpus or engine to the current user's library.

    `type` "library_corpora" grabs a Corpus; any other value grabs an
    Engine. Redirects back to the referrer.
    """
    user = User.query.filter_by(id=user_utils.get_uid()).first()
    if type == "library_corpora":
        grabbed_corpus = Corpus.query.filter_by(id=id).first()
        entry = LibraryCorpora(corpus=grabbed_corpus, user=user)
        user.user_corpora.append(entry)
    else:
        grabbed_engine = Engine.query.filter_by(id=id).first()
        entry = LibraryEngine(engine=grabbed_engine, user=user)
        user.user_engines.append(entry)
    db.session.commit()
    return redirect(request.referrer)
def library_engines():
    """Render the engines library page.

    Shows the user's own library entries plus all public engines; each
    public engine is annotated with a `grabbed` flag indicating whether it
    is already in the user's library.
    """
    current = User.query.filter_by(id=user_utils.get_uid()).first()
    user_library = current.user_engines
    public_engines = Engine.query.filter_by(public=True)
    grabbed = [entry.engine for entry in user_library]
    for engine in public_engines:
        engine.grabbed = engine in grabbed
    return render_template('library_engines.html.jinja2',
                           page_name='library_engines',
                           page_title='Engines',
                           user_library=user_library,
                           public_engines=public_engines)
def data_upload_perform():
    """Handle a corpus upload POST and enqueue the processing task.

    Non-privileged users are redirected to the index. On success returns
    JSON with the task id; on any failure the error is flashed and
    {"result": -1} is returned.
    """
    if user_utils.is_normal():
        return redirect(url_for('index'))
    try:
        if request.method != 'POST':
            raise Exception("Sorry, but we couldn't handle your request.")
        task_id = data_utils.process_upload_request(
            user_utils.get_uid(),
            request.files.get('bitext_file'),
            request.files.get('source_file'),
            request.files.get('target_file'),
            request.form.get('source_lang'),
            request.form.get('target_lang'),
            request.form.get('name'),
            request.form.get('description'),
            request.form.get('topic'))
        return jsonify({"result": 200, "task_id": task_id})
    except Exception as e:
        Flash.issue(e, Flash.ERROR)
        return jsonify({"result": -1})
def join_corpus_files(corpus, shuffle=False, user_id=None):
    # If a corpus has several source and target files, we need to put their
    # contents in a single file. This method prints the contents to a new file
    # pair (mut.<id>.single.src / .trg), deletes the originals from disk and
    # from the DB, registers the merged files on the corpus, and optionally
    # shuffles the result. Returns the (mutated) corpus.
    user_id = user_id if user_id else user_utils.get_uid()
    # DB records for the merged source/target files-to-be.
    source_single_file = File(path=os.path.join(
        app.config['FILES_FOLDER'], 'mut.{}.single.src'.format(corpus.id)),
        name='mut.{}.single.src'.format(corpus.id),
        uploader_id=user_id,
        uploaded=datetime.datetime.utcnow())
    target_single_file = File(path=os.path.join(
        app.config['FILES_FOLDER'], 'mut.{}.single.trg'.format(corpus.id)),
        name='mut.{}.single.trg'.format(corpus.id),
        uploader_id=user_id,
        uploaded=datetime.datetime.utcnow())

    def dump_files(files, single_file_db):
        # Concatenate every file in `files` into single_file_db.path, then
        # remove each original from disk, from the DB session, and from the
        # corpus relationship. Commits once at the end.
        with open(single_file_db.path, 'w') as single_file:
            for file_entry in files:
                with open(file_entry.file.path, 'r') as corpus_file:
                    for line in corpus_file:
                        single_file.write(line)
                os.remove(file_entry.file.path)
                db.session.delete(file_entry.file)
                corpus.corpus_files.remove(file_entry)
        db.session.commit()

    # Iterate over list-comprehension copies so removing entries from
    # corpus.corpus_files inside dump_files is safe.
    dump_files([f for f in corpus.corpus_files if f.role == "source"],
               source_single_file)
    dump_files([f for f in corpus.corpus_files if f.role == "target"],
               target_single_file)
    corpus.corpus_files.append(Corpus_File(source_single_file, role="source"))
    corpus.corpus_files.append(Corpus_File(target_single_file, role="target"))
    db.session.commit()
    if shuffle:
        shuffle_sentences(corpus)
    return corpus
def train_start():
    """Enqueue an engine-training background task for the posted form.

    Non-privileged users get the index URL back. Returns JSON containing
    the URL of the launching screen for the new task.
    """
    if user_utils.is_normal():
        return url_for('index')

    engine_path = os.path.join(
        user_utils.get_user_folder("engines"),
        utils.normname(user_utils.get_user().username,
                       request.form['nameText']))

    # Multi-valued form fields (names ending in "[]") keep their full list
    # of values; every other field collapses to its first value.
    form_params = {key: values if key.endswith('[]') else values[0]
                   for key, values in request.form.lists()}

    task = tasks.launch_training.apply_async(
        args=[user_utils.get_uid(), engine_path, form_params])

    return jsonify({
        "result": 200,
        "launching_url": url_for('train.train_launching', task_id=task.id)
    })
def delete_user():
    """Delete the user given by the `id` query argument, together with their
    corpora, engine library entries and on-disk folder.

    Self-deletion is refused. Best-effort: any failure aborts the deletion
    and simply redirects back, as in the original.
    """
    id = request.args.get('id')
    try:
        # Explicit check instead of `assert`: asserts are stripped under
        # `python -O`, which would have allowed self-deletion.
        if int(id) == user_utils.get_uid():
            raise ValueError("users cannot delete themselves")
        user = User.query.filter_by(id=id).first()
        for corpus in Corpus.query.filter_by(owner_id=id).all():
            user_utils.library_delete("library_corpora", corpus.id, id)
        for engine_entry in user.user_engines:
            user_utils.library_delete("library_engines",
                                      engine_entry.engine.id, id)
        shutil.rmtree(user_utils.get_user_folder(user_id=id))
        db.session.delete(user)
        db.session.commit()
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # propagate; deletion remains deliberately best-effort.
        pass
    return redirect(request.referrer)
def library_engines_feed():
    # Server-side DataTables feed for the engines library.
    #
    # When `public` is true, lists public engines the user has NOT grabbed;
    # otherwise lists the engines already in the user's library. Each row is
    # augmented with the best BLEU score scraped from the engine's training
    # log and a dict of per-engine action URLs.
    public = request.form.get('public') == "true"
    # Column order must match the client-side DataTables definition; the
    # trailing None is the non-sortable actions column.
    columns = [
        Engine.id, Engine.name, Engine.description, Engine.source_id,
        Engine.uploaded, Engine.uploader_id, None
    ]
    dt = datatables.Datatables()
    rows, rows_filtered, search = dt.parse(
        Engine, columns, request,
        and_(
            Engine.public == True,
            not_(
                Engine.engine_users.any(
                    LibraryEngine.user_id == user_utils.get_uid())))
        if public else Engine.engine_users.any(
            LibraryEngine.user_id == user_utils.get_uid()))
    engine_data = []
    for engine in (rows_filtered if search else rows):
        # We try to get the best BLEU score for this engine by scanning its
        # training log; missing logs are treated as "no score".
        score = None
        try:
            with open(os.path.join(engine.path, "model/train.log"),
                      'r') as log_file:
                for line in log_file:
                    groups = re.search(training_log.validation_regex,
                                       line,
                                       flags=training_log.re_flags)
                    if groups:
                        # NOTE(review): group 6 is presumably the BLEU value
                        # in validation_regex — confirm against training_log.
                        bleu_score = float(groups[6])
                        # Keep the maximum score seen so far.
                        score = bleu_score if score is None or bleu_score > score else score
        except IOError:
            pass
        uploaded_date = datetime.fromtimestamp(
            datetime.timestamp(engine.uploaded)).strftime("%d/%m/%Y")
        engine_data.append([
            engine.id, engine.name, engine.description,
            "{} — {}".format(engine.source.name, engine.target.name),
            uploaded_date,
            engine.uploader.username if engine.uploader else "MutNMT",
            score, "",
            {
                "engine_owner": engine.uploader.id == user_utils.get_uid() if engine.uploader else False,
                "engine_public": engine.public,
                "engine_share": url_for('library.library_share_toggle', type="library_engines", id=engine.id),
                "engine_summary": url_for('train.train_console', id=engine.id),
                "engine_delete": url_for('library.library_delete', id=engine.id, type="library_engines"),
                "engine_grab": url_for('library.library_grab', id=engine.id, type="library_engines"),
                "engine_ungrab": url_for('library.library_ungrab', id=engine.id, type="library_engines"),
                "engine_export": url_for('library.library_export', id=engine.id, type="library_engines"),
                "engine_corpora_export": url_for('library.library_corpora_export', id=engine.id)
            }
        ])
    order = int(request.form.get('order[0][column]'))
    direction = request.form.get('order[0][dir]')
    if order == 6:
        # Order by bleu — done in Python because the score is scraped from
        # the log, not stored in the DB.
        # NOTE(review): reverse on 'asc' looks inverted — confirm the client
        # expects descending scores for an "asc" request.
        engine_data.sort(key=lambda c: c[order] if c[order] else 0,
                         reverse=(direction == 'asc'))
    return dt.response(rows, rows_filtered, engine_data)
def data_upload_perform():
    """Handle a corpus upload POST, resolving (possibly custom) languages,
    and enqueue the processing task.

    Custom source/target languages are upserted per-user before the upload
    is processed. On success returns JSON with the task id; on failure the
    error is flashed and {"result": -1} is returned.
    """
    if user_utils.is_normal():
        return redirect(url_for('index'))
    try:
        if request.method == 'POST':
            # Handle possible custom languages
            def add_custom_language(code, name):
                # Upsert a per-user custom language and return the DB row.
                custom_language = UserLanguage.query.filter_by(
                    code=code, user_id=current_user.id).first()
                if custom_language:
                    # BUG FIX: the original assigned `custom_src_lang_name`
                    # (a closure variable) here, which renamed a target
                    # custom language to the SOURCE one's name and raised
                    # NameError when only a target custom language was sent.
                    custom_language.name = name
                    db.session.commit()
                else:
                    custom_language = UserLanguage(code=code,
                                                   name=name,
                                                   user_id=current_user.id)
                    db.session.add(custom_language)
                    db.session.commit()
                return UserLanguage.query.filter_by(
                    code=code, user_id=current_user.id).first()

            source_lang = request.form.get('source_lang')
            target_lang = request.form.get('target_lang')
            custom_src_lang_code = request.form.get('sourceCustomLangCode')
            custom_trg_lang_code = request.form.get('targetCustomLangCode')

            # Resolve each side either to a freshly upserted custom
            # language or to an existing per-user language row.
            if custom_src_lang_code:
                custom_src_lang_name = request.form.get('sourceCustomLangName')
                custom_lang = add_custom_language(custom_src_lang_code,
                                                  custom_src_lang_name)
                source_lang = custom_lang.id
            else:
                source_lang = UserLanguage.query.filter_by(
                    code=source_lang, user_id=current_user.id).one().id

            if custom_trg_lang_code:
                custom_trg_lang_name = request.form.get('targetCustomLangName')
                custom_lang = add_custom_language(custom_trg_lang_code,
                                                  custom_trg_lang_name)
                target_lang = custom_lang.id
            else:
                target_lang = UserLanguage.query.filter_by(
                    code=target_lang, user_id=current_user.id).one().id

            task_id = data_utils.process_upload_request(
                user_utils.get_uid(),
                request.files.get('bitext_file'),
                request.files.get('source_file'),
                request.files.get('target_file'),
                source_lang, target_lang,
                request.form.get('name'),
                request.form.get('description'),
                request.form.get('topic'))
            return jsonify({"result": 200, "task_id": task_id})
        else:
            raise Exception("Sorry, but we couldn't handle your request.")
    except Exception as e:
        Flash.issue(e, Flash.ERROR)
        return jsonify({"result": -1})
def translate_leave():
    """Detach the current user from their live translation session."""
    uid = user_utils.get_uid()
    translators.deattach(uid)
    return "0"
def library_corpora_feed():
    # Hand-rolled server-side DataTables feed for the corpora library.
    #
    # Builds one row per corpus file, grouped by corpus, with per-file and
    # per-corpus action URLs. `public` selects public corpora not yet in the
    # user's library vs. the user's own library.
    public = request.form.get('public') == "true"
    if public:
        library_objects = user_utils.get_user_corpora(public=True).all()
    else:
        library_objects = user_utils.get_user_corpora().all()
    user_library = [lc.corpus for lc in library_objects]
    # We are not using the datatables helper since this is a specific case
    # and we need more control to group corpora
    draw = int(request.form.get('draw'))
    search = request.form.get('search[value]')
    start = int(request.form.get('start'))
    length = int(request.form.get('length'))
    order = int(request.form.get('order[0][column]'))
    dir = request.form.get('order[0][dir]')
    # One summary row per corpus; used for sorting, paging and searching.
    corpus_rows = []
    for corpus in user_library:
        corpus_rows.append([
            corpus.id, corpus.name,
            corpus.source.name + (corpus.target.name if corpus.target else ""),
            corpus.lines(), corpus.words(), corpus.chars(), corpus.uploaded()
        ])
    recordsTotal = len(corpus_rows)
    recordsFiltered = 0
    # NOTE(review): reverse on 'asc' looks inverted, and column 0 can never
    # be sorted (order == 0 is falsy) — confirm against the client table.
    if order:
        corpus_rows.sort(key=lambda c: c[order], reverse=(dir == 'asc'))
    if start is not None and length is not None:
        corpus_rows = corpus_rows[start:(start + length)]
    corpus_data = []
    for row in corpus_rows:
        corpus = Corpus.query.filter_by(id=row[0]).first()
        # Sort file entries so "source" precedes "target" (alphabetical role).
        file_entries = corpus.corpus_files
        file_entries.sort(key=lambda f: f.role)
        file_data = []
        for file_entry in file_entries:
            file = file_entry.file
            uploaded_date = datetime.fromtimestamp(
                datetime.timestamp(file.uploaded)).strftime("%d/%m/%Y")
            file_data.append([
                file.id, file.name, file.language.name,
                utils.format_number(file.lines),
                utils.format_number(file.words),
                corpus.topic.name if corpus.topic else "", uploaded_date,
                {
                    "corpus_owner": file.uploader.id == user_utils.get_uid() if file.uploader else False,
                    "corpus_uploader": file.uploader.username if file.uploader else "MutNMT",
                    "corpus_id": corpus.id,
                    "corpus_name": corpus.name,
                    "corpus_description": corpus.description,
                    "corpus_source": corpus.source.name,
                    "corpus_target": corpus.target.name if corpus.target else "",
                    "corpus_public": corpus.public,
                    "corpus_size": corpus.corpus_files[0].file.lines,
                    "corpus_preview": url_for('library.corpora_preview', id=corpus.id),
                    "corpus_share": url_for('library.library_share_toggle', type='library_corpora', id=corpus.id),
                    "corpus_delete": url_for('library.library_delete', id=corpus.id, type='library_corpora'),
                    "corpus_grab": url_for('library.library_grab', id=corpus.id, type='library_corpora'),
                    "corpus_ungrab": url_for('library.library_ungrab', id=corpus.id, type='library_corpora'),
                    "corpus_export": url_for('library.library_export', id=corpus.id, type="library_corpora"),
                    "file_preview": url_for('data.data_preview', file_id=file.id)
                }
            ])
        # Case-insensitive substring search across the corpus summary row
        # and all of its file rows; a match includes the whole group.
        if search:
            found = False
            for col in row + file_data:
                found = found or (search.lower() in str(col).lower())
            if found:
                corpus_data = corpus_data + file_data
                recordsFiltered += 1
        else:
            corpus_data = corpus_data + file_data
    # NOTE(review): DataTables normally expects `draw` echoed unchanged;
    # confirm the client relies on the `draw + 1` here.
    return jsonify({
        "draw": draw + 1,
        "recordsTotal": recordsTotal,
        "recordsFiltered": recordsFiltered if search else recordsTotal,
        "data": corpus_data
    })
def upload_file(file, language, format="text", selected_size=None,
                offset=None, user_id=None):
    # Store an uploaded corpus file on disk and return its (uncommitted)
    # File DB record.
    #
    # When `selected_size` is given, the file is cropped to that many lines
    # (starting at `offset` when provided). Otherwise, if a file with the
    # same hash already exists, a hard link to it is created instead of
    # storing a second copy.
    #
    # NOTE(review): the `format` parameter is unused in this block — confirm
    # whether callers rely on it elsewhere.
    user_id = user_id if user_id else user_utils.get_uid()
    norm_name = utils.normname(user_id=user_id, filename=file.filename)
    path = utils.filepath('FILES_FOLDER', norm_name)

    def new_file(file, path, selected_size=None):
        # We save it
        file.seek(0)
        file.save(path)
        # Convert whatever format this has to UTF-8
        convert_file_to_utf8(path)
        fix_file(path)
        hash = utils.hash(file)
        if selected_size is not None:
            # We get the amount of sentences we want by cropping with
            # head/tail into a temp file that then replaces the original.
            # SECURITY NOTE(review): these commands run with shell=True and
            # interpolate `path` — verify utils.normname fully sanitizes
            # filenames, otherwise this is command-injectable.
            crop_path = "{}.crop".format(path)
            if offset:
                crop_proccess = subprocess.Popen(
                    "cat {} "
                    "| head -n {} "
                    "| tail -n {} > {}".format(
                        path, int(offset) + int(selected_size),
                        selected_size, crop_path),
                    shell=True)
                crop_proccess.wait()
            else:
                crop_proccess = subprocess.Popen(
                    "cat {} | head -n {} > {}".format(path, selected_size,
                                                      crop_path),
                    shell=True)
                crop_proccess.wait()
            os.remove(path)
            shutil.move(crop_path, path)
            # Re-hash: the cropped content differs from the upload.
            with open(path, 'r') as crop_file:
                hash = utils.hash(crop_file)
        # Get file stats (line/word/char counts) via `wc -lwc`.
        wc_output = subprocess.check_output('wc -lwc {}'.format(path),
                                            shell=True)
        wc_output_search = re.search(r'^(\s*)(\d+)(\s+)(\d+)(\s+)(\d+)(.*)$',
                                     wc_output.decode("utf-8"))
        lines, words, chars = wc_output_search.group(
            2), wc_output_search.group(4), wc_output_search.group(6)
        # Save in DB (record is returned uncommitted).
        db_file = File(path=path, name=file.filename,
                       user_language_id=language, hash=hash,
                       uploader_id=user_id, lines=lines, words=words,
                       chars=chars, uploaded=datetime.datetime.utcnow())
        return db_file

    if selected_size is not None:
        return new_file(file, path, selected_size)
    else:
        # Could we already have it stored?
        hash = utils.hash(file)
        query = File.query.filter_by(hash=hash)
        db_file = None
        try:
            db_file = query.first()
            if db_file is None:
                raise NoResultFound
            # We did have it, we link a new one to the existing one instead
            # of re-uploading
            # NOTE(review): this branch sets `language_id` while new_file
            # sets `user_language_id` — confirm both are valid File columns.
            os.link(db_file.path, path)
            db_file = File(path=path, name=file.filename,
                           uploaded=db_file.uploaded, hash=hash,
                           uploader_id=user_id,
                           language_id=db_file.language_id,
                           lines=db_file.lines, words=db_file.words,
                           chars=db_file.chars)
        except NoResultFound:
            db_file = new_file(file, path)
        return db_file