예제 #1
0
def evaluate_files():
    """Save uploaded MT/HT/source files and launch an async evaluation task.

    Reads the uploads from the Flask request, saves each file (truncated to
    500 lines), and verifies that every file has the same number of lines.

    Returns:
        {"result": "-1"} on any line-count mismatch, otherwise
        {"result": 200, "task_id": <celery task id>}.
    """
    mt_files = request.files.getlist('mt_files[]')
    ht_files = request.files.getlist('ht_files[]')
    source_file = request.files.get('source_file')

    # Line count of the first saved file; every later file must match it.
    line_length = None

    def save_file(file, path, limit=500):
        # Persist at most `limit` lines of the upload, decoded as UTF-8.
        with open(path, 'w') as output_file:
            for i, line in enumerate(file):
                if i < limit:
                    print(line.decode('utf-8').strip(), file=output_file)

    def save_and_check(file):
        # Save one upload and verify its line count matches the others.
        # Returns the saved path, or None on a mismatch.
        nonlocal line_length
        path = utils.filepath(
            'FILES_FOLDER',
            utils.normname(user_utils.get_uid(), file.filename))
        save_file(file, path)

        length = utils.file_length(path)
        if line_length is None:
            # Explicit None check: the original `if not line_length` would
            # wrongly re-set the expected length after a zero-line file.
            line_length = length
        elif length != line_length:
            return None
        return path

    mt_paths = []
    for mt_file in mt_files:
        mt_path = save_and_check(mt_file)
        if mt_path is None:
            return ({"result": "-1"})
        mt_paths.append(mt_path)

    ht_paths = []
    for ht_file in ht_files:
        ht_path = save_and_check(ht_file)
        if ht_path is None:
            return ({"result": "-1"})
        ht_paths.append(ht_path)

    source_path = None
    if source_file:
        # Bug fix: the original compared against `ht_path`, which is unbound
        # (NameError) when no HT files were uploaded.  Checking against the
        # shared expected length is equivalent when HT files exist and also
        # works when only MT files were uploaded.
        source_path = save_and_check(source_file)
        if source_path is None:
            return ({"result": "-1"})

    task = tasks.evaluate_files.apply_async(
        args=[user_utils.get_uid(), mt_paths, ht_paths],
        kwargs={'source_path': source_path})
    return ({"result": 200, "task_id": task.id})
예제 #2
0
def generate_xlsx(user_id, rows, ht_path_index):
    """Write evaluation `rows` to a temporary .xlsx file and return its path.

    Each row is expected to contain the reference sentence at index 1, a
    list of per-MT dicts (keys 'text', 'bleu', 'ter') at index 5, and an
    optional source sentence at index 6.

    Args:
        user_id: used to build a user-scoped temp file name.
        rows: evaluation rows as described above; may be empty.
        ht_path_index: 0-based index used to label the reference column.

    Returns:
        Path of the generated workbook.
    """
    file_name = utils.normname(user_id, "evaluation") + ".xlsx"
    file_path = utils.tmpfile(file_name)

    workbook = xlsxwriter.Workbook(file_path)
    worksheet = workbook.add_worksheet()

    x_rows = []
    for i, row in enumerate(rows):
        x_row = [i + 1]

        # A row longer than 6 entries carries the source sentence at [6].
        if len(row) > 6:
            x_row = [i + 1, row[6]]

        for mt_data in row[5]:
            x_row.append(mt_data['text'])

        x_row.append(row[1])

        for mt_data in row[5]:
            x_row.append(mt_data['bleu'])

        for mt_data in row[5]:
            x_row.append(mt_data['ter'])

        x_rows.append(x_row)

    if rows:
        # Build headers from the first row.  (The original used the loop
        # variable left over from the last iteration, which raised a
        # NameError for empty `rows` and could disagree with earlier rows.)
        first_row = rows[0]
        n_mt = len(first_row[5])

        headers = ["Line"]
        headers = headers + (["Source sentence"] if len(first_row) > 6 else [])
        headers = headers + [
            "Machine translation {}".format(i + 1) for i in range(n_mt)
        ]
        headers = headers + ["Reference {}".format(ht_path_index + 1)]

        headers = headers + ["Bleu MT{}".format(i + 1) for i in range(n_mt)]
        headers = headers + ["TER MT{}".format(i + 1) for i in range(n_mt)]

        x_rows = [headers] + x_rows

    for row_cursor, x_row in enumerate(x_rows):
        for col_cursor, col in enumerate(x_row):
            worksheet.write(row_cursor, col_cursor, col)

    workbook.close()

    return file_path
예제 #3
0
파일: views.py 프로젝트: Prompsit/mutnmt
def train_start():
    """Launch a training task for the engine named in the submitted form.

    Normal (non-privileged) users are bounced back to the index URL.
    Multi-valued form fields (keys ending in '[]') are passed through as
    lists; single-valued fields are collapsed to their first value.
    """
    if user_utils.is_normal():
        return url_for('index')

    engine_name = utils.normname(user_utils.get_user().username,
                                 request.form['nameText'])
    engine_path = os.path.join(user_utils.get_user_folder("engines"),
                               engine_name)

    # Collapse single-valued fields; keep '[]' fields as lists.
    form_params = {}
    for key, values in request.form.lists():
        form_params[key] = values if key.endswith('[]') else values[0]

    task = tasks.launch_training.apply_async(
        args=[user_utils.get_uid(), engine_path, form_params])

    return jsonify({
        "result": 200,
        "launching_url": url_for('train.train_launching', task_id=task.id),
    })
예제 #4
0
    def process_bitext(file):
        """Split a bilingual upload (TMX or TSV) into parallel -src/-trg files.

        Saves `file` to a temporary path, then writes one source and one
        target plain-text file under FILES_FOLDER.  Returns a pair of
        FileStorage objects wrapping the generated files, opened for binary
        reading (intentionally left open for the caller).
        """
        file_name, file_extension = os.path.splitext(file.filename)
        norm_name = utils.normname(user_id=user_id, filename=file_name)
        tmp_file_fd, tmp_path = utils.tmpfile()
        file.save(tmp_path)
        # Close the descriptor from tmpfile(); the original leaked it.
        # (Assumes utils.tmpfile returns a mkstemp-style (fd, path) pair,
        # consistent with the tuple unpacking above — TODO confirm.)
        os.close(tmp_file_fd)

        if file_extension == ".tmx":
            with open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'wb') as src_file, \
            open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'wb') as trg_file, \
            open(tmp_path, 'r') as tmx_file:
                tmx = etree.parse(tmx_file, etree.XMLParser())
                body = tmx.getroot().find("body")

                for tu in body.findall('.//tu'):
                    # Only the first two <tuv> entries (source, target) are used.
                    for i, tuv in enumerate(tu.findall('.//tuv')):
                        if i > 1: break
                        # `.text` is None when <seg> holds only child markup;
                        # the original crashed with AttributeError on .strip().
                        seg = tuv.find("seg")
                        line = (seg.text or "").strip()
                        line = re.sub(r'[\r\n\t\f\v]', " ", line)
                        dest_file = src_file if i == 0 else trg_file

                        dest_file.write(line.encode('utf-8'))
                        dest_file.write(os.linesep.encode('utf-8'))
        else:
            # We assume it is a TSV: column 0 is source, column 1 is target.
            with open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'wb') as src_file, \
            open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'wb') as trg_file, \
            open(tmp_path, 'r') as tmp_file:
                for line in tmp_file:
                    cols = line.strip().split('\t')
                    src_file.write((cols[0] + '\n').encode('utf-8'))
                    trg_file.write((cols[1] + '\n').encode('utf-8'))

        # Reopen the generated files for reading; the FileStorage wrappers
        # hand them to the caller, so they stay open here on purpose.
        src_file = open(utils.filepath('FILES_FOLDER', norm_name + "-src"),
                        'rb')
        trg_file = open(utils.filepath('FILES_FOLDER', norm_name + "-trg"),
                        'rb')

        return FileStorage(src_file, filename=file.filename + "-src"), \
                FileStorage(trg_file, filename=file.filename + "-trg")
예제 #5
0
def upload_file():
    """Receive a file from the request and queue its translation.

    Creates (or recreates) a per-upload folder named from the user id and
    the uploaded file name, saves the file there, and hands it to the
    translators service.

    Returns:
        JSON with {"result": 200, "task_id": <translation task id>}.
    """
    engine_id = request.form.get('engine_id')
    user_file = request.files.get('user_file')
    as_tmx = request.form.get('as_tmx') == 'true'
    tmx_mode = request.form.get('tmx_mode')

    key = utils.normname(user_utils.get_uid(), user_file.filename)
    this_upload = user_utils.get_user_folder(key)

    # Start from a clean upload directory.  The original used a bare
    # `except:` (which also swallows KeyboardInterrupt/SystemExit and hides
    # unrelated failures); only the "directory already exists" case should
    # trigger the wipe-and-recreate path.
    try:
        os.mkdir(this_upload)
    except FileExistsError:
        shutil.rmtree(this_upload)
        os.mkdir(this_upload)

    user_file_path = os.path.join(this_upload, secure_filename(user_file.filename))
    user_file.save(user_file_path)

    translators.set_admin(user_utils.is_admin())
    translation_task_id = translators.translate_file(user_utils.get_uid(), engine_id, user_file_path, as_tmx, tmx_mode)

    return jsonify({ "result": 200, "task_id": translation_task_id })
예제 #6
0
    def process_bitext(file):
        """Split a bilingual upload (TMX or TSV) into parallel -src/-trg files.

        Saves `file` to a temp path, normalises it to UTF-8, then writes one
        source and one target plain-text file under FILES_FOLDER.  Returns a
        pair of FileStorage objects wrapping the generated files, opened for
        binary reading (intentionally left open for the caller).
        """
        file_name, file_extension = os.path.splitext(file.filename)
        norm_name = utils.normname(user_id=user_id, filename=file_name)
        # NOTE(review): tmp_file_fd is never closed — presumably a
        # mkstemp-style descriptor (it is unpacked like one); verify and
        # close it to avoid a file-descriptor leak.
        tmp_file_fd, tmp_path = utils.tmpfile()
        file.save(tmp_path)

        data_utils.convert_file_to_utf8(tmp_path)
        data_utils.fix_file(tmp_path)

        if file_extension == ".tmx":
            with open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'w') as src_file, \
            open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'w') as trg_file, \
            open(tmp_path, 'rb') as tmx_file:
                # Stream-parse the TMX with expat: collect the character data
                # of each <seg>; every two completed segments form one
                # translation unit (source first, then target).
                inside_tuv = False  # True while between <seg> and </seg>
                seg_text = []       # character-data chunks of the current seg
                tu = []             # finished segment texts of the current TU

                def se(name, _):
                    # StartElementHandler: entering a <seg> starts capture.
                    nonlocal inside_tuv
                    if name == "seg":
                        inside_tuv = True

                def lp(line):
                    # Collapse internal line breaks/tabs so each segment
                    # stays on a single output line.
                    return re.sub(r'[\r\n\t\f\v]', " ", line.strip())

                def ee(name):
                    # EndElementHandler: closing a <seg> finalises its text;
                    # once two segments are buffered, emit a src/trg pair.
                    nonlocal inside_tuv, seg_text, tu, src_file
                    if name == "seg":
                        inside_tuv = False
                        tu.append("".join(seg_text))
                        seg_text = []

                        if len(tu) == 2:
                            print(lp(tu[0]), file=src_file)
                            print(lp(tu[1]), file=trg_file)
                            tu = []

                def cd(data):
                    # CharacterDataHandler: buffer text only while inside <seg>.
                    nonlocal inside_tuv, seg_text
                    if inside_tuv:
                        seg_text.append(data)

                parser = xml.parsers.expat.ParserCreate()
                parser.StartElementHandler = se
                parser.EndElementHandler = ee
                parser.CharacterDataHandler = cd
                parser.ParseFile(tmx_file)

        else:
            # We assume it is a TSV: column 0 is source, column 1 is target.
            with open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'wb') as src_file, \
            open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'wb') as trg_file, \
            open(tmp_path, 'r') as tmp_file:
                for line in tmp_file:
                    cols = line.strip().split('\t')
                    src_file.write((cols[0] + '\n').encode('utf-8'))
                    trg_file.write((cols[1] + '\n').encode('utf-8'))

        # Reopen the generated files for reading; the FileStorage wrappers
        # below hand them to the caller, so they stay open here on purpose.
        src_file = open(utils.filepath('FILES_FOLDER', norm_name + "-src"),
                        'rb')
        trg_file = open(utils.filepath('FILES_FOLDER', norm_name + "-trg"),
                        'rb')

        return FileStorage(src_file, filename=file.filename + "-src"), \
                FileStorage(trg_file, filename=file.filename + "-trg")
예제 #7
0
def upload_file(file,
                language,
                format="text",
                selected_size=None,
                offset=None,
                user_id=None):
    """Store an uploaded corpus file and register it as a File DB row.

    If `selected_size` is given, the saved file is cropped to that many
    lines (starting after `offset` lines, when provided) and always stored
    as a new file.  Otherwise the file's hash is looked up first: a known
    hash is hard-linked to the existing copy instead of re-stored.

    Args:
        file: uploaded file object (saved via `.save(path)`).
        language: value stored as the File's user_language_id.
        format: unused in this body — presumably consumed by callers or a
            future branch; TODO confirm before removing.
        selected_size: optional number of lines to keep.
        offset: optional number of leading lines to skip before cropping.
        user_id: uploader id; defaults to the current session's uid.

    Returns:
        An (unsaved) File model instance describing the stored file.
    """
    user_id = user_id if user_id else user_utils.get_uid()
    norm_name = utils.normname(user_id=user_id, filename=file.filename)
    path = utils.filepath('FILES_FOLDER', norm_name)

    def new_file(file, path, selected_size=None):
        # Save the upload, normalise encoding, optionally crop, then build
        # a File row from the file's stats.
        # We save it
        file.seek(0)
        file.save(path)

        # Convert whatever format this has to UTF-8
        convert_file_to_utf8(path)
        fix_file(path)

        # NOTE: `hash` shadows the builtin of the same name within this scope.
        hash = utils.hash(file)

        if selected_size is not None:
            # We get the amount of sentences we want
            crop_path = "{}.crop".format(path)

            # NOTE(review): these commands interpolate `path` into a
            # shell=True string; path comes from normname/filepath, but if a
            # raw filename can ever reach here this is shell-injectable —
            # prefer a list argv or pure-Python cropping.
            if offset:
                # Keep lines (offset, offset+selected_size]: head takes the
                # first offset+selected_size lines, tail keeps the last
                # selected_size of those.
                crop_proccess = subprocess.Popen(
                    "cat {} "
                    "| head -n {} "
                    "| tail -n {} > {}".format(
                        path,
                        int(offset) + int(selected_size), selected_size,
                        crop_path),
                    shell=True)
                crop_proccess.wait()
            else:
                crop_proccess = subprocess.Popen(
                    "cat {} | head -n {} > {}".format(path, selected_size,
                                                      crop_path),
                    shell=True)
                crop_proccess.wait()

            os.remove(path)
            shutil.move(crop_path, path)

            # Re-hash after cropping so the stored hash matches the cropped
            # content, not the original upload.
            with open(path, 'r') as crop_file:
                hash = utils.hash(crop_file)

        # Get file stats via wc: output is "<lines> <words> <chars> <name>".
        wc_output = subprocess.check_output('wc -lwc {}'.format(path),
                                            shell=True)
        wc_output_search = re.search(r'^(\s*)(\d+)(\s+)(\d+)(\s+)(\d+)(.*)$',
                                     wc_output.decode("utf-8"))
        lines, words, chars = wc_output_search.group(
            2), wc_output_search.group(4), wc_output_search.group(6)

        # Save in DB
        db_file = File(path=path,
                       name=file.filename,
                       user_language_id=language,
                       hash=hash,
                       uploader_id=user_id,
                       lines=lines,
                       words=words,
                       chars=chars,
                       uploaded=datetime.datetime.utcnow())

        return db_file

    if selected_size is not None:
        # Cropped uploads are always stored fresh; no dedup by hash.
        return new_file(file, path, selected_size)
    else:
        # Could we already have it stored?
        hash = utils.hash(file)

        query = File.query.filter_by(hash=hash)
        db_file = None

        try:
            db_file = query.first()
            if db_file is None: raise NoResultFound

            # We did have it, we link a new one to the existing one instead of re-uploading
            os.link(db_file.path, path)

            # NOTE(review): this branch sets `language_id` while new_file()
            # sets `user_language_id`, and it ignores the `language`
            # argument — confirm which field the File model expects.
            db_file = File(path=path,
                           name=file.filename,
                           uploaded=db_file.uploaded,
                           hash=hash,
                           uploader_id=user_id,
                           language_id=db_file.language_id,
                           lines=db_file.lines,
                           words=db_file.words,
                           chars=db_file.chars)

        except NoResultFound:
            # Unknown hash (or empty table): store it as a brand-new file.
            db_file = new_file(file, path)

        return db_file