Example #1
def evaluate_files():
    mt_files = request.files.getlist('mt_files[]')
    ht_files = request.files.getlist('ht_files[]')
    source_file = request.files.get('source_file')

    line_length = None

    def save_file(file, path, limit=500):
        # Persist at most `limit` lines of the upload, decoded as UTF-8
        with open(path, 'w') as output_file:
            for i, line in enumerate(file):
                if i >= limit:
                    break
                print(line.decode('utf-8').strip(), file=output_file)

    mt_paths = []
    for mt_file in mt_files:
        mt_path = utils.filepath(
            'FILES_FOLDER',
            utils.normname(user_utils.get_uid(), mt_file.filename))
        save_file(mt_file, mt_path)

        # Every uploaded file must share the same line count
        if line_length is None:
            line_length = utils.file_length(mt_path)
        elif utils.file_length(mt_path) != line_length:
            return {"result": "-1"}

        mt_paths.append(mt_path)

    ht_paths = []
    for ht_file in ht_files:
        ht_path = utils.filepath(
            'FILES_FOLDER',
            utils.normname(user_utils.get_uid(), ht_file.filename))
        save_file(ht_file, ht_path)

        if line_length is None:
            line_length = utils.file_length(ht_path)
        elif utils.file_length(ht_path) != line_length:
            return {"result": "-1"}

        ht_paths.append(ht_path)

    if source_file:
        source_path = utils.filepath(
            'FILES_FOLDER',
            utils.normname(user_utils.get_uid(), source_file.filename))
        save_file(source_file, source_path)

        # Compare against the shared line count rather than ht_path, which
        # is unbound when no HT files were uploaded
        if utils.file_length(source_path) != line_length:
            return {"result": "-1"}

    task = tasks.evaluate_files.apply_async(
        args=[user_utils.get_uid(), mt_paths, ht_paths],
        kwargs={'source_path': source_path if source_file else None})
    return ({"result": 200, "task_id": task.id})
Example #2
def shuffle_sentences(corpus):
    source_files = [f.file for f in corpus.corpus_files if f.role == "source"]
    target_files = [f.file for f in corpus.corpus_files if f.role == "target"]

    # Only shuffle single-file corpora
    if len(source_files) == 1 and len(target_files) == 1:
        source_file, target_file = source_files[0], target_files[0]

        # Join source and target line by line, shuffle the pairs, and write
        # them to a temporary file
        shuff_proc = subprocess.Popen(
            "paste {} {} | shuf > mut.{}.shuf".format(source_file.path,
                                                      target_file.path,
                                                      corpus.id),
            shell=True,
            cwd=app.config['TMP_FOLDER'])
        shuff_proc.wait()

        # Split the shuffled pairs back into the original source and target
        # files, one column each
        extract_source = subprocess.Popen(
            "cat mut.{}.shuf | awk -F '\\t' '{{ print $1 }}' > {}".format(
                corpus.id, source_file.path),
            shell=True,
            cwd=app.config['TMP_FOLDER'])
        extract_source.wait()

        extract_target = subprocess.Popen(
            "cat mut.{}.shuf | awk -F '\\t' '{{ print $2 }}' > {}".format(
                corpus.id, target_file.path),
            shell=True,
            cwd=app.config['TMP_FOLDER'])
        extract_target.wait()

        os.remove(
            utils.filepath('TMP_FOLDER',
                           filename='mut.{}.shuf'.format(corpus.id)))
    else:
        raise ValueError("Corpora with multiple files cannot be shuffled")
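
For corpora that fit in memory, the same shuffle can be done without shelling out to paste, shuf and awk. A sketch under that assumption (shuffle_pair is hypothetical, not part of this codebase):

import random

def shuffle_pair(src_path, trg_path):
    # In-memory equivalent of "paste src trg | shuf" plus the two awk
    # column extractions above; loads both files into RAM
    with open(src_path) as src, open(trg_path) as trg:
        pairs = list(zip(src, trg))
    random.shuffle(pairs)
    with open(src_path, 'w') as src, open(trg_path, 'w') as trg:
        for src_line, trg_line in pairs:
            src.write(src_line)
            trg.write(trg_line)

The shell pipeline streams and therefore copes with corpora larger than memory, which is presumably why it was chosen here.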
Example #3
def train_tokenizer(engine, corpus, vocabularySize=32000):
    model_path = os.path.join(engine.path, 'train.model')
    vocab_path = os.path.join(engine.path, 'train.vocab')

    # Skip training if the model and vocab already exist
    try:
        os.stat(model_path)
        os.stat(vocab_path)
    except OSError:
        # Draw a random sample of at most 10M lines from all corpus files
        files_list = [file_entry.file.path for file_entry in corpus.corpus_files]
        files = " ".join(files_list)
        random_sample_path = utils.tmpfile(
            filename="{}.mut.10m".format(corpus.id))
        cat_proc = subprocess.Popen(
            "cat {} | shuf | head -n 10000000 > {}".format(
                files, random_sample_path),
            shell=True)
        cat_proc.wait()

        train_proc = subprocess.Popen("spm_train --input={} --model_prefix=mut.{} --vocab_size={} --hard_vocab_limit=false" \
                        .format(random_sample_path, corpus.id, vocabularySize),
                        cwd=utils.filepath('TMP_FOLDER'), shell=True)
        train_proc.wait()

        shutil.move(
            utils.filepath('TMP_FOLDER', "mut.{}.model".format(corpus.id)),
            model_path)
        shutil.move(
            utils.filepath('TMP_FOLDER', "mut.{}.vocab".format(corpus.id)),
            vocab_path)
        os.remove(random_sample_path)

        # spm_train writes "token<TAB>log-probability" lines; keep only the
        # token column
        purge_vocab = subprocess.Popen(
            "cat {} | awk -F '\\t' '{{ print $1 }}' > {}.purged".format(
                vocab_path, vocab_path),
            shell=True)
        purge_vocab.wait()

        os.remove(vocab_path)
        shutil.move("{}.purged".format(vocab_path), vocab_path)

    return model_path, vocab_path
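
The returned model can then be used with the sentencepiece Python bindings (assuming the package that ships spm_train is installed; the path is illustrative):

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("train.model")  # the model_path returned by train_tokenizer
print(sp.EncodeAsPieces("This is a test sentence."))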
Example #4
    def process_bitext(file):
        file_name, file_extension = os.path.splitext(file.filename)
        norm_name = utils.normname(user_id=user_id, filename=file_name)
        tmp_file_fd, tmp_path = utils.tmpfile()
        file.save(tmp_path)

        if file_extension == ".tmx":
            with open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'wb') as src_file, \
            open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'wb') as trg_file, \
            open(tmp_path, 'r') as tmx_file:
                tmx = etree.parse(tmx_file, etree.XMLParser())
                body = tmx.getroot().find("body")

                for tu in body.findall('.//tu'):
                    for i, tuv in enumerate(tu.findall('.//tuv')):
                        if i > 1:
                            break
                        seg = tuv.find("seg")
                        # Guard against missing or empty <seg> elements
                        line = seg.text.strip() if seg is not None and seg.text else ""
                        line = re.sub(r'[\r\n\t\f\v]', " ", line)
                        dest_file = src_file if i == 0 else trg_file

                        dest_file.write(line.encode('utf-8'))
                        dest_file.write(os.linesep.encode('utf-8'))
        else:
            # We assume it is a TSV
            with open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'wb') as src_file, \
            open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'wb') as trg_file, \
            open(tmp_path, 'r') as tmp_file:
                for line in tmp_file:
                    cols = line.strip().split('\t')
                    if len(cols) < 2:
                        continue  # skip lines without both columns
                    src_file.write((cols[0] + '\n').encode('utf-8'))
                    trg_file.write((cols[1] + '\n').encode('utf-8'))

        # Reopen the split files so the caller receives readable streams
        src_file = open(utils.filepath('FILES_FOLDER', norm_name + "-src"),
                        'rb')
        trg_file = open(utils.filepath('FILES_FOLDER', norm_name + "-trg"),
                        'rb')

        return FileStorage(src_file, filename=file.filename + "-src"), \
                FileStorage(trg_file, filename=file.filename + "-trg")
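
For reference, a minimal TMX document of the shape this parser expects (the sample data is invented; stdlib ElementTree mirrors the etree calls above):

import xml.etree.ElementTree as etree

sample = b"""<tmx version="1.4"><body>
  <tu>
    <tuv xml:lang="en"><seg>Hello</seg></tuv>
    <tuv xml:lang="es"><seg>Hola</seg></tuv>
  </tu>
</body></tmx>"""

body = etree.fromstring(sample).find("body")
for tu in body.findall('.//tu'):
    print([tuv.find("seg").text for tuv in tu.findall('.//tuv')])
# -> ['Hello', 'Hola']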
Example #5
    def process_bitext(file):
        file_name, file_extension = os.path.splitext(file.filename)
        norm_name = utils.normname(user_id=user_id, filename=file_name)
        tmp_file_fd, tmp_path = utils.tmpfile()
        file.save(tmp_path)

        data_utils.convert_file_to_utf8(tmp_path)
        data_utils.fix_file(tmp_path)

        if file_extension == ".tmx":
            with open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'w') as src_file, \
            open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'w') as trg_file, \
            open(tmp_path, 'rb') as tmx_file:
                # Stream-parse with expat so large TMX files are never fully
                # loaded into memory
                inside_tuv = False
                seg_text = []
                tu = []

                def se(name, _):
                    # Start-element handler: track when we enter a <seg>
                    nonlocal inside_tuv
                    if name == "seg":
                        inside_tuv = True

                def lp(line):
                    # Collapse control whitespace into single spaces
                    return re.sub(r'[\r\n\t\f\v]', " ", line.strip())

                def ee(name):
                    # End-element handler: a closed </seg> completes one side
                    # of the translation unit
                    nonlocal inside_tuv, seg_text, tu
                    if name == "seg":
                        inside_tuv = False
                        tu.append("".join(seg_text))
                        seg_text = []

                        if len(tu) == 2:
                            print(lp(tu[0]), file=src_file)
                            print(lp(tu[1]), file=trg_file)
                            tu = []

                def cd(data):
                    # Character-data handler: buffer text while inside a <seg>
                    nonlocal seg_text
                    if inside_tuv:
                        seg_text.append(data)

                parser = xml.parsers.expat.ParserCreate()
                parser.StartElementHandler = se
                parser.EndElementHandler = ee
                parser.CharacterDataHandler = cd
                parser.ParseFile(tmx_file)

        else:
            # We assume it is a TSV
            with open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'wb') as src_file, \
            open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'wb') as trg_file, \
            open(tmp_path, 'r') as tmp_file:
                for line in tmp_file:
                    cols = line.strip().split('\t')
                    if len(cols) < 2:
                        continue  # skip lines without both columns
                    src_file.write((cols[0] + '\n').encode('utf-8'))
                    trg_file.write((cols[1] + '\n').encode('utf-8'))

        src_file = open(utils.filepath('FILES_FOLDER', norm_name + "-src"),
                        'rb')
        trg_file = open(utils.filepath('FILES_FOLDER', norm_name + "-trg"),
                        'rb')

        return FileStorage(src_file, filename=file.filename + "-src"), \
                FileStorage(trg_file, filename=file.filename + "-trg")
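
An alternative to hand-written expat handlers with the same constant-memory property is ElementTree's iterparse; a sketch (iter_tus is hypothetical, not part of this codebase):

import xml.etree.ElementTree as ET

def iter_tus(tmx_path):
    # Yield (source, target) segment pairs, clearing each <tu> once
    # processed so memory stays flat on large files
    for _, elem in ET.iterparse(tmx_path, events=("end",)):
        if elem.tag == "tu":
            segs = [tuv.findtext("seg", default="")
                    for tuv in elem.findall("tuv")]
            if len(segs) >= 2:
                yield segs[0], segs[1]
            elem.clear()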
Example #6
def upload_file(file,
                language,
                format="text",
                selected_size=None,
                offset=None,
                user_id=None):
    user_id = user_id if user_id else user_utils.get_uid()
    norm_name = utils.normname(user_id=user_id, filename=file.filename)
    path = utils.filepath('FILES_FOLDER', norm_name)

    def new_file(file, path, selected_size=None):
        # Save the upload to its destination path
        file.seek(0)
        file.save(path)

        # Convert whatever format this has to UTF-8
        convert_file_to_utf8(path)
        fix_file(path)

        hash = utils.hash(file)

        if selected_size is not None:
            # Keep only the requested number of sentences
            crop_path = "{}.crop".format(path)

            if offset:
                # Keep selected_size lines starting at the given offset
                crop_process = subprocess.Popen(
                    "cat {} "
                    "| head -n {} "
                    "| tail -n {} > {}".format(
                        path,
                        int(offset) + int(selected_size), selected_size,
                        crop_path),
                    shell=True)
                crop_process.wait()
            else:
                crop_process = subprocess.Popen(
                    "cat {} | head -n {} > {}".format(path, selected_size,
                                                      crop_path),
                    shell=True)
                crop_process.wait()

            os.remove(path)
            shutil.move(crop_path, path)

            # Re-hash, since the cropped content differs from the upload
            with open(path, 'r') as crop_file:
                hash = utils.hash(crop_file)

        # Get file stats (lines, words, chars) from wc
        wc_output = subprocess.check_output('wc -lwc {}'.format(path),
                                            shell=True)
        wc_output_search = re.search(r'^(\s*)(\d+)(\s+)(\d+)(\s+)(\d+)(.*)$',
                                     wc_output.decode("utf-8"))
        lines = wc_output_search.group(2)
        words = wc_output_search.group(4)
        chars = wc_output_search.group(6)

        # Save in DB
        db_file = File(path=path,
                       name=file.filename,
                       user_language_id=language,
                       hash=hash,
                       uploader_id=user_id,
                       lines=lines,
                       words=words,
                       chars=chars,
                       uploaded=datetime.datetime.utcnow())

        return db_file

    if selected_size is not None:
        return new_file(file, path, selected_size)
    else:
        # Could we already have it stored?
        hash = utils.hash(file)

        query = File.query.filter_by(hash=hash)
        db_file = None

        try:
            db_file = query.first()
            if db_file is None:
                raise NoResultFound

            # We already have it: hard-link to the existing file instead of
            # storing the contents again
            os.link(db_file.path, path)

            db_file = File(path=path,
                           name=file.filename,
                           uploaded=db_file.uploaded,
                           hash=hash,
                           uploader_id=user_id,
                           language_id=db_file.language_id,
                           lines=db_file.lines,
                           words=db_file.words,
                           chars=db_file.chars)

        except NoResultFound:
            db_file = new_file(file, path)

        return db_file
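
The head/tail crop in new_file can also be written without a shell; a rough sketch (crop_file is hypothetical, and unlike head | tail it returns nothing extra when the file is shorter than offset + selected_size):

import itertools

def crop_file(path, selected_size, offset=0):
    # Keep selected_size lines starting at offset, like
    # "head -n offset+size | tail -n size" on a sufficiently long file
    crop_path = "{}.crop".format(path)
    with open(path) as src, open(crop_path, 'w') as dst:
        for line in itertools.islice(src, int(offset),
                                     int(offset) + int(selected_size)):
            dst.write(line)
    return crop_path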