Example #1
def evaluate_files():
    mt_files = request.files.getlist('mt_files[]')
    ht_files = request.files.getlist('ht_files[]')
    source_file = request.files.get('source_file')

    line_length = None

    def save_file(file, path, limit=500):
        with open(path, 'w') as output_file:
            for i, line in enumerate(file):
                if i < limit:
                    print(line.decode('utf-8').strip(), file=output_file)

    mt_paths = []
    for mt_file in mt_files:
        mt_path = utils.filepath(
            'FILES_FOLDER',
            utils.normname(user_utils.get_uid(), mt_file.filename))
        save_file(mt_file, mt_path)

        if not line_length:
            line_length = utils.file_length(mt_path)
        elif utils.file_length(mt_path) != line_length:
            return ({"result": "-1"})

        mt_paths.append(mt_path)

    ht_paths = []
    for ht_file in ht_files:
        ht_path = utils.filepath(
            'FILES_FOLDER',
            utils.normname(user_utils.get_uid(), ht_file.filename))
        save_file(ht_file, ht_path)

        if not line_length:
            line_length = utils.file_length(ht_path)
        elif utils.file_length(ht_path) != line_length:
            return ({"result": "-1"})

        ht_paths.append(ht_path)

    if source_file:
        source_path = utils.filepath(
            'FILES_FOLDER',
            utils.normname(user_utils.get_uid(), source_file.filename))
        save_file(source_file, source_path)

        if line_length and utils.file_length(source_path) != line_length:
            return ({"result": "-1"})

    task = tasks.evaluate_files.apply_async(
        args=[user_utils.get_uid(), mt_paths, ht_paths],
        kwargs={'source_path': source_path if source_file else None})
    return ({"result": 200, "task_id": task.id})
Example #2
def library_share_toggle(type, id):
    if type == "library_corpora":
        db_resource = Corpus.query.filter_by(owner_id=user_utils.get_uid(),
                                             id=id).first()
    else:
        db_resource = Engine.query.filter_by(uploader_id=user_utils.get_uid(),
                                             id=id).first()

    db_resource.public = not db_resource.public
    db.session.commit()

    return redirect(request.referrer)
Example #3
def library_ungrab(type, id):
    user = User.query.filter_by(id=user_utils.get_uid()).first()

    if type == "library_corpora":
        library = LibraryCorpora.query.filter_by(
            corpus_id=id, user_id=user_utils.get_uid()).first()
        user.user_corpora.remove(library)
    else:
        library = LibraryEngine.query.filter_by(
            engine_id=id, user_id=user_utils.get_uid()).first()
        user.user_engines.remove(library)

    db.session.commit()

    return redirect(request.referrer)
Example #4
def inspect_compare_text():
    line = request.form.get('line')
    engines = request.form.getlist('engines[]')
    translators.set_admin(user_utils.is_admin())
    translation_task_id = translators.get_compare(user_utils.get_uid(), line, engines)

    return translation_task_id
Example #5
def translate_index():
    engines = LibraryEngine.query.filter_by(user_id = user_utils.get_uid()) \
                .join(Engine, LibraryEngine.engine) \
                .filter(or_(Engine.status == "stopped", Engine.status == "finished", Engine.status == "stopped_admin")) \
                .order_by(Engine.uploaded.desc()) \
                .all()
    return render_template('translate.html.jinja2', page_name='translate_text', page_title='Translate', engines = engines)
Example #6
def inspect_access():
    engines = LibraryEngine.query.filter_by(user_id = user_utils.get_uid()) \
            .join(Engine, LibraryEngine.engine) \
            .filter(or_(Engine.status == "stopped", Engine.status == "finished", Engine.status == "stopped_admin")) \
            .order_by(Engine.uploaded.desc()) \
            .all()
    return render_template('access.inspect.html.jinja2', page_name='inspect_access', page_title='Access', engines=engines)
Example #7
def translate_text():
    engine_id = request.form.get('engine_id')
    lines = request.form.getlist('text[]')
    translators.set_admin(user_utils.is_admin())
    translation_task_id = translators.text(user_utils.get_uid(), engine_id, lines)

    return jsonify({ "result": 200, "task_id": translation_task_id })
Example #8
def inspect_details():
    line = request.form.get('line')
    engine_id = request.form.get('engine_id')
    engines = request.form.getlist('engines[]')
    translators.set_admin(user_utils.is_admin())
    translation_task_id = translators.get_inspect(user_utils.get_uid(), engine_id, line, engines)

    return translation_task_id
Example #9
def as_tmx():
    engine_id = request.form.get('engine_id')
    chain_engine_id = request.form.get('chain_engine_id')
    chain_engine_id = chain_engine_id if chain_engine_id and chain_engine_id != "false" else None
    text = request.form.getlist('text[]')

    translators.set_admin(user_utils.is_admin())
    translation_task_id = translators.generate_tmx(user_utils.get_uid(), engine_id, chain_engine_id, text)
    return jsonify({ "result": 200, "task_id": translation_task_id })
Example #10
def train_index():
    if user_utils.is_normal(): return redirect(url_for('index'))

    currently_training = Engine.query.filter_by(uploader_id = user_utils.get_uid()) \
                            .filter(Engine.status.like("training")).all()

    if (len(currently_training) > 0):
        return redirect(
            url_for('train.train_console', id=currently_training[0].id))

    currently_launching = Engine.query.filter_by(uploader_id = user_utils.get_uid()) \
                            .filter(Engine.status.like("launching")).all()

    if (len(currently_launching) > 0):
        return redirect(
            url_for('train.train_launching',
                    task_id=currently_launching[0].bg_task_id))

    random_name = namegenerator.gen()
    tryout = 0
    while len(Engine.query.filter_by(name=random_name).all()):
        random_name = namegenerator.gen()
        tryout += 1

        if tryout >= 5:
            random_name = ""
            break

    random_name = " ".join(random_name.split("-")[:2])

    library_corpora = user_utils.get_user_corpora().filter(
        LibraryCorpora.corpus.has(Corpus.type == "bilingual")).all()
    corpora = [c.corpus for c in library_corpora]
    languages = UserLanguage.query.filter_by(user_id=current_user.id).order_by(
        UserLanguage.name).all()

    return render_template('train.html.jinja2',
                           page_name='train',
                           page_title='Train',
                           corpora=corpora,
                           random_name=random_name,
                           languages=languages)
Example #11
def upload_file():
    engine_id = request.form.get('engine_id')
    user_file = request.files.get('user_file')
    as_tmx = request.form.get('as_tmx') == 'true'
    tmx_mode = request.form.get('tmx_mode')
    
    key = utils.normname(user_utils.get_uid(), user_file.filename)
    this_upload = user_utils.get_user_folder(key)

    try:
        os.mkdir(this_upload)
    except FileExistsError:
        # The folder is left over from a previous upload; recreate it from scratch
        shutil.rmtree(this_upload)
        os.mkdir(this_upload)
    
    user_file_path = os.path.join(this_upload, secure_filename(user_file.filename))
    user_file.save(user_file_path)

    translators.set_admin(user_utils.is_admin())
    translation_task_id = translators.translate_file(user_utils.get_uid(), engine_id, user_file_path, as_tmx, tmx_mode)

    return jsonify({ "result": 200, "task_id": translation_task_id })
Example #12
def library_grab(type, id):
    user = User.query.filter_by(id=user_utils.get_uid()).first()

    if type == "library_corpora":
        corpus = Corpus.query.filter_by(id=id).first()
        user.user_corpora.append(LibraryCorpora(corpus=corpus, user=user))
    else:
        engine = Engine.query.filter_by(id=id).first()
        user.user_engines.append(LibraryEngine(engine=engine, user=user))

    db.session.commit()

    return redirect(request.referrer)
Example #13
def library_engines():
    user_library = User.query.filter_by(
        id=user_utils.get_uid()).first().user_engines
    public_engines = Engine.query.filter_by(public=True)

    user_engines = list(map(lambda l: l.engine, user_library))
    for engine in public_engines:
        engine.grabbed = engine in user_engines

    return render_template('library_engines.html.jinja2',
                           page_name='library_engines',
                           page_title='Engines',
                           user_library=user_library,
                           public_engines=public_engines)
Example #14
def data_upload_perform():
    if user_utils.is_normal(): return redirect(url_for('index'))

    try:
        if request.method == 'POST':
            task_id = data_utils.process_upload_request(user_utils.get_uid(), request.files.get('bitext_file'), request.files.get('source_file'),
                    request.files.get('target_file'), request.form.get('source_lang'), request.form.get('target_lang'),
                    request.form.get('name'), request.form.get('description'), request.form.get('topic'))

            return jsonify({ "result": 200, "task_id": task_id })
        else:
            raise Exception("Sorry, but we couldn't handle your request.")
    except Exception as e:
        Flash.issue(e, Flash.ERROR)

    return jsonify({ "result": -1 })
Example #15
def join_corpus_files(corpus, shuffle=False, user_id=None):
    # If a corpus has several source and target files, we need to merge their contents
    # into a single file per side. This function concatenates them and, optionally,
    # shuffles the resulting corpus.
    user_id = user_id if user_id else user_utils.get_uid()

    source_single_file = File(path=os.path.join(
        app.config['FILES_FOLDER'], 'mut.{}.single.src'.format(corpus.id)),
                              name='mut.{}.single.src'.format(corpus.id),
                              uploader_id=user_id,
                              uploaded=datetime.datetime.utcnow())

    target_single_file = File(path=os.path.join(
        app.config['FILES_FOLDER'], 'mut.{}.single.trg'.format(corpus.id)),
                              name='mut.{}.single.trg'.format(corpus.id),
                              uploader_id=user_id,
                              uploaded=datetime.datetime.utcnow())

    def dump_files(files, single_file_db):
        with open(single_file_db.path, 'w') as single_file:
            for file_entry in files:
                with open(file_entry.file.path, 'r') as corpus_file:
                    for line in corpus_file:
                        single_file.write(line)

                os.remove(file_entry.file.path)

                db.session.delete(file_entry.file)
                corpus.corpus_files.remove(file_entry)
                db.session.commit()

    dump_files([f for f in corpus.corpus_files if f.role == "source"],
               source_single_file)
    dump_files([f for f in corpus.corpus_files if f.role == "target"],
               target_single_file)

    corpus.corpus_files.append(Corpus_File(source_single_file, role="source"))
    corpus.corpus_files.append(Corpus_File(target_single_file, role="target"))
    db.session.commit()

    if shuffle: shuffle_sentences(corpus)

    return corpus
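The shuffle step above is delegated to shuffle_sentences, which is not part of this listing. As a hedged sketch of the idea (hypothetical helper; the actual signature may differ), a synchronized shuffle keeps source and target lines aligned by permuting both files with the same ordering:

import random

def shuffle_parallel(src_path, trg_path, seed=None):
    # Read both sides, shuffle the aligned pairs with a single permutation,
    # then write them back so line i of src still matches line i of trg
    with open(src_path) as src, open(trg_path) as trg:
        pairs = list(zip(src, trg))
    random.Random(seed).shuffle(pairs)
    with open(src_path, 'w') as src, open(trg_path, 'w') as trg:
        for s, t in pairs:
            src.write(s)
            trg.write(t)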
Example #16
def train_start():
    if user_utils.is_normal(): return url_for('index')
    engine_path = os.path.join(
        user_utils.get_user_folder("engines"),
        utils.normname(user_utils.get_user().username,
                       request.form['nameText']))
    task = tasks.launch_training.apply_async(args=[
        user_utils.get_uid(), engine_path,
        {
            i[0]: i[1] if i[0].endswith('[]') else i[1][0]
            for i in request.form.lists()
        }
    ])

    return jsonify({
        "result": 200,
        "launching_url": url_for('train.train_launching', task_id=task.id)
    })
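The dict comprehension passed to launch_training flattens the submitted form: keys ending in '[]' keep their full list of values, while scalar fields are reduced to their first value. A small illustration using werkzeug's MultiDict, the structure behind request.form (field names here are made up for the example):

from werkzeug.datastructures import MultiDict

form = MultiDict([('nameText', 'my-engine'), ('corpora[]', '1'), ('corpora[]', '2')])
params = {k: v if k.endswith('[]') else v[0] for k, v in form.lists()}
# params == {'nameText': 'my-engine', 'corpora[]': ['1', '2']}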
Example #17
def delete_user():
    id = request.args.get('id')

    try:
        assert int(id) != user_utils.get_uid()
        user = User.query.filter_by(id=id).first()
        for corpus in Corpus.query.filter_by(owner_id=id).all():
            user_utils.library_delete("library_corpora", corpus.id, id)

        for engine_entry in user.user_engines:
            user_utils.library_delete("library_engines",
                                      engine_entry.engine.id, id)

        shutil.rmtree(user_utils.get_user_folder(user_id=id))
        db.session.delete(user)
        db.session.commit()
    except Exception:
        # Ignore failures (e.g. attempting to delete the current user) and just redirect back
        pass

    return redirect(request.referrer)
Example #18
def library_engines_feed():
    public = request.form.get('public') == "true"
    columns = [
        Engine.id, Engine.name, Engine.description, Engine.source_id,
        Engine.uploaded, Engine.uploader_id, None
    ]
    dt = datatables.Datatables()

    rows, rows_filtered, search = dt.parse(
        Engine, columns, request,
        and_(
            Engine.public == True,
            not_(
                Engine.engine_users.any(
                    LibraryEngine.user_id == user_utils.get_uid())))
        if public else Engine.engine_users.any(
            LibraryEngine.user_id == user_utils.get_uid()))

    engine_data = []
    for engine in (rows_filtered if search else rows):
        # We try to get BLEU score for this engine
        score = None
        try:
            with open(os.path.join(engine.path, "model/train.log"),
                      'r') as log_file:
                for line in log_file:
                    groups = re.search(training_log.validation_regex,
                                       line,
                                       flags=training_log.re_flags)
                    if groups:
                        bleu_score = float(groups[6])
                        score = bleu_score if score is None or bleu_score > score else score
        except IOError:
            pass

        uploaded_date = datetime.fromtimestamp(
            datetime.timestamp(engine.uploaded)).strftime("%d/%m/%Y")
        engine_data.append([
            engine.id, engine.name, engine.description,
            "{} — {}".format(engine.source.name,
                             engine.target.name), uploaded_date,
            engine.uploader.username if engine.uploader else "MutNMT", score,
            "", {
                "engine_owner":
                engine.uploader.id == user_utils.get_uid()
                if engine.uploader else False,
                "engine_public":
                engine.public,
                "engine_share":
                url_for('library.library_share_toggle',
                        type="library_engines",
                        id=engine.id),
                "engine_summary":
                url_for('train.train_console', id=engine.id),
                "engine_delete":
                url_for('library.library_delete',
                        id=engine.id,
                        type="library_engines"),
                "engine_grab":
                url_for('library.library_grab',
                        id=engine.id,
                        type="library_engines"),
                "engine_ungrab":
                url_for('library.library_ungrab',
                        id=engine.id,
                        type="library_engines"),
                "engine_export":
                url_for('library.library_export',
                        id=engine.id,
                        type="library_engines"),
                "engine_corpora_export":
                url_for('library.library_corpora_export', id=engine.id)
            }
        ])

    order = int(request.form.get('order[0][column]'))
    direction = request.form.get('order[0][dir]')
    if order == 6:
        # Order by BLEU
        engine_data.sort(key=lambda c: c[order] if c[order] else 0,
                         reverse=(direction == 'asc'))

    return dt.response(rows, rows_filtered, engine_data)
Example #19
def data_upload_perform():
    if user_utils.is_normal(): return redirect(url_for('index'))

    try:
        if request.method == 'POST':

            # Handle possible custom languages
            def add_custom_language(code, name):
                custom_language = UserLanguage.query.filter_by(
                    code=code, user_id=current_user.id).first()

                if custom_language:
                    custom_language.name = name
                    db.session.commit()
                else:
                    custom_language = UserLanguage(code=code,
                                                   name=name,
                                                   user_id=current_user.id)
                    db.session.add(custom_language)
                    db.session.commit()

                return UserLanguage.query.filter_by(
                    code=code, user_id=current_user.id).first()

            source_lang = request.form.get('source_lang')
            target_lang = request.form.get('target_lang')

            custom_src_lang_code = request.form.get('sourceCustomLangCode')
            custom_trg_lang_code = request.form.get('targetCustomLangCode')

            if custom_src_lang_code:
                custom_src_lang_name = request.form.get('sourceCustomLangName')
                custom_lang = add_custom_language(custom_src_lang_code,
                                                  custom_src_lang_name)

                source_lang = custom_lang.id
            else:
                source_lang = UserLanguage.query.filter_by(
                    code=source_lang, user_id=current_user.id).one().id

            if custom_trg_lang_code:
                custom_trg_lang_name = request.form.get('targetCustomLangName')
                custom_lang = add_custom_language(custom_trg_lang_code,
                                                  custom_trg_lang_name)

                target_lang = custom_lang.id
            else:
                target_lang = UserLanguage.query.filter_by(
                    code=target_lang, user_id=current_user.id).one().id

            task_id = data_utils.process_upload_request(
                user_utils.get_uid(), request.files.get('bitext_file'),
                request.files.get('source_file'),
                request.files.get('target_file'), source_lang, target_lang,
                request.form.get('name'), request.form.get('description'),
                request.form.get('topic'))

            return jsonify({"result": 200, "task_id": task_id})
        else:
            raise Exception("Sorry, but we couldn't handle your request.")
    except Exception as e:
        Flash.issue(e, Flash.ERROR)

    return jsonify({"result": -1})
Example #20
def translate_leave():
    translators.deattach(user_utils.get_uid())
    return "0"
Example #21
def library_corpora_feed():
    public = request.form.get('public') == "true"

    if public:
        library_objects = user_utils.get_user_corpora(public=True).all()
    else:
        library_objects = user_utils.get_user_corpora().all()

    user_library = [lc.corpus for lc in library_objects]

    # We are not using the datatables helper since this is a specific case
    # and we need more control to group corpora

    draw = int(request.form.get('draw'))
    search = request.form.get('search[value]')
    start = int(request.form.get('start'))
    length = int(request.form.get('length'))
    order = int(request.form.get('order[0][column]'))
    direction = request.form.get('order[0][dir]')

    corpus_rows = []
    for corpus in user_library:
        corpus_rows.append([
            corpus.id, corpus.name,
            corpus.source.name + (corpus.target.name if corpus.target else ""),
            corpus.lines(),
            corpus.words(),
            corpus.chars(),
            corpus.uploaded()
        ])

    recordsTotal = len(corpus_rows)
    recordsFiltered = 0

    if order:
        corpus_rows.sort(key=lambda c: c[order], reverse=(direction == 'asc'))

    if start is not None and length is not None:
        corpus_rows = corpus_rows[start:(start + length)]

    corpus_data = []
    for row in corpus_rows:
        corpus = Corpus.query.filter_by(id=row[0]).first()

        file_entries = corpus.corpus_files
        file_entries.sort(key=lambda f: f.role)

        file_data = []
        for file_entry in file_entries:
            file = file_entry.file

            uploaded_date = datetime.fromtimestamp(
                datetime.timestamp(file.uploaded)).strftime("%d/%m/%Y")
            file_data.append([
                file.id, file.name, file.language.name,
                utils.format_number(file.lines),
                utils.format_number(file.words),
                corpus.topic.name if corpus.topic else "", uploaded_date, {
                    "corpus_owner":
                    file.uploader.id == user_utils.get_uid()
                    if file.uploader else False,
                    "corpus_uploader":
                    file.uploader.username if file.uploader else "MutNMT",
                    "corpus_id":
                    corpus.id,
                    "corpus_name":
                    corpus.name,
                    "corpus_description":
                    corpus.description,
                    "corpus_source":
                    corpus.source.name,
                    "corpus_target":
                    corpus.target.name if corpus.target else "",
                    "corpus_public":
                    corpus.public,
                    "corpus_size":
                    corpus.corpus_files[0].file.lines,
                    "corpus_preview":
                    url_for('library.corpora_preview', id=corpus.id),
                    "corpus_share":
                    url_for('library.library_share_toggle',
                            type='library_corpora',
                            id=corpus.id),
                    "corpus_delete":
                    url_for('library.library_delete',
                            id=corpus.id,
                            type='library_corpora'),
                    "corpus_grab":
                    url_for('library.library_grab',
                            id=corpus.id,
                            type='library_corpora'),
                    "corpus_ungrab":
                    url_for('library.library_ungrab',
                            id=corpus.id,
                            type='library_corpora'),
                    "corpus_export":
                    url_for('library.library_export',
                            id=corpus.id,
                            type="library_corpora"),
                    "file_preview":
                    url_for('data.data_preview', file_id=file.id)
                }
            ])

        if search:
            found = False
            for col in row + file_data:
                found = found or (search.lower() in str(col).lower())

            if found:
                corpus_data = corpus_data + file_data
                recordsFiltered += 1
        else:
            corpus_data = corpus_data + file_data

    return jsonify({
        "draw": draw + 1,
        "recordsTotal": recordsTotal,
        "recordsFiltered": recordsFiltered if search else recordsTotal,
        "data": corpus_data
    })
Example #22
def upload_file(file,
                language,
                format="text",
                selected_size=None,
                offset=None,
                user_id=None):
    user_id = user_id if user_id else user_utils.get_uid()
    norm_name = utils.normname(user_id=user_id, filename=file.filename)
    path = utils.filepath('FILES_FOLDER', norm_name)

    def new_file(file, path, selected_size=None):
        # We save it
        file.seek(0)
        file.save(path)

        # Convert whatever format this has to UTF-8
        convert_file_to_utf8(path)
        fix_file(path)

        hash = utils.hash(file)

        if selected_size is not None:
            # We keep only the number of sentences requested
            crop_path = "{}.crop".format(path)

            if offset:
                crop_process = subprocess.Popen(
                    "cat {} "
                    "| head -n {} "
                    "| tail -n {} > {}".format(
                        path,
                        int(offset) + int(selected_size), selected_size,
                        crop_path),
                    shell=True)
                crop_process.wait()
            else:
                crop_process = subprocess.Popen(
                    "cat {} | head -n {} > {}".format(path, selected_size,
                                                      crop_path),
                    shell=True)
                crop_process.wait()

            os.remove(path)
            shutil.move(crop_path, path)

            with open(path, 'r') as crop_file:
                hash = utils.hash(crop_file)

        # Get file stats
        wc_output = subprocess.check_output('wc -lwc {}'.format(path),
                                            shell=True)
        wc_output_search = re.search(r'^(\s*)(\d+)(\s+)(\d+)(\s+)(\d+)(.*)$',
                                     wc_output.decode("utf-8"))
        lines, words, chars = wc_output_search.group(
            2), wc_output_search.group(4), wc_output_search.group(6)

        # Save in DB
        db_file = File(path=path,
                       name=file.filename,
                       user_language_id=language,
                       hash=hash,
                       uploader_id=user_id,
                       lines=lines,
                       words=words,
                       chars=chars,
                       uploaded=datetime.datetime.utcnow())

        return db_file

    if selected_size is not None:
        return new_file(file, path, selected_size)
    else:
        # Could we already have it stored?
        hash = utils.hash(file)

        query = File.query.filter_by(hash=hash)
        db_file = None

        try:
            db_file = query.first()
            if db_file is None: raise NoResultFound

            # We already have it; link the new path to the existing file instead of storing it again
            os.link(db_file.path, path)

            db_file = File(path=path,
                           name=file.filename,
                           uploaded=db_file.uploaded,
                           hash=hash,
                           uploader_id=user_id,
                           language_id=db_file.language_id,
                           lines=db_file.lines,
                           words=db_file.words,
                           chars=db_file.chars)

        except NoResultFound:
            db_file = new_file(file, path)

        return db_file
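The fallback branch above deduplicates uploads by content hash and hard-links the new path to the file already on disk instead of storing a second copy. utils.hash itself is not shown here, so the sketch below assumes a plain SHA-256 digest purely to illustrate the idea:

import hashlib
import os

def content_hash(path, chunk_size=8192):
    # Stream the file so large uploads do not have to fit in memory
    digest = hashlib.sha256()
    with open(path, 'rb') as f:
        for block in iter(lambda: f.read(chunk_size), b''):
            digest.update(block)
    return digest.hexdigest()

def link_if_duplicate(existing_path, new_path):
    # If both files have the same content, keep a single copy on disk
    # and make new_path a hard link to it
    if content_hash(existing_path) == content_hash(new_path):
        os.remove(new_path)
        os.link(existing_path, new_path)
        return True
    return False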