Example #1
0
    def join_corpora(list_name, phase, source_lang, target_lang):
        """Build one hidden Corpus from the corpora listed in params[list_name].

        Each entry in params[list_name] is a JSON string with an 'id' and a
        'size'; a slice of that size is copied from the original corpus into
        the new one, and the original is linked to the engine (is_info=True)
        for traceability. Returns the new, committed Corpus.

        Raises Exception (with the original error chained as __cause__) if
        copying, committing, or joining the files fails.
        """
        corpus = Corpus(owner_id=user_id, visible=False)
        source_lang_id = UserLanguage.query.filter_by(
            user_id=user_id, code=source_lang).one().id
        for train_corpus in params[list_name]:
            corpus_data = json.loads(train_corpus)
            corpus_id = corpus_data['id']
            corpus_size = corpus_data['size']

            # Track how many sentences of each original corpus were already
            # consumed, so successive slices start at the right offset.
            used_corpora.setdefault(corpus_id, 0)

            try:
                og_corpus = Corpus.query.filter_by(id=corpus_id).first()

                # We relate the original corpus with this engine in the database,
                # for informational purposes. This way the user will be able to know
                # which corpora were used to train the engine
                engine.engine_corpora.append(
                    Corpus_Engine(corpus=og_corpus,
                                  engine=engine,
                                  phase=phase,
                                  is_info=True,
                                  selected_size=corpus_size))

                corpus.user_source_id = og_corpus.user_source_id
                corpus.user_target_id = og_corpus.user_target_id
                for file_entry in og_corpus.corpus_files:
                    with open(file_entry.file.path, 'rb') as file_d:
                        db_file = data_utils.upload_file(
                            FileStorage(stream=file_d,
                                        filename=file_entry.file.name),
                            file_entry.file.user_language_id,
                            selected_size=corpus_size,
                            offset=used_corpora[corpus_id],
                            user_id=user_id)
                    corpus.corpus_files.append(
                        Corpus_File(
                            db_file,
                            role="source" if file_entry.file.user_language_id
                            == source_lang_id else "target"))
                used_corpora[corpus_id] += corpus_size
            except Exception as err:
                # Keep the Exception type callers expect, but chain the real
                # cause instead of discarding it (the old bare `except:` also
                # trapped SystemExit/KeyboardInterrupt).
                raise Exception from err

        try:
            db.session.add(corpus)
            db.session.commit()
        except Exception as err:
            db.session.rollback()
            raise Exception from err

        # We put the contents of the several files in a new single one, and we shuffle the sentences
        try:
            data_utils.join_corpus_files(corpus, shuffle=True, user_id=user_id)
        except Exception as err:
            # The corpus is useless without its joined file: remove it.
            db.session.delete(corpus)
            db.session.commit()
            raise Exception from err

        return corpus
Example #2
0
def test_find(tmpdir):
    """A corpus locates a document by its title."""
    corpus = Corpus(str(tmpdir))
    page = tmpdir.join("Andy_Warhol.rst")
    page.write(
        "***********\n"
        "Andy Warhol\n"
        "***********\n"
        "**Andy Warhol** was an artist."
    )
    doc = corpus.find("Andy Warhol")
    assert doc.filename == "Andy_Warhol"
Example #3
0
def test_find(tmpdir):
    """Looking up a title returns the document of the matching .rst file."""
    corpus = Corpus(str(tmpdir))
    rst_lines = ("***********", "Andy Warhol", "***********",
                 "**Andy Warhol** was an artist.")
    tmpdir.join("Andy_Warhol.rst").write("\n".join(rst_lines))
    result = corpus.find("Andy Warhol")
    assert result.filename == "Andy_Warhol"
Example #4
0
def dashboard():
    """admin dashboard page."""
    if current_user.is_admin():
        # Admins get the full inventory.
        corpora = db.session.query(Corpus).all()
        lists = db.session.query(ControlLists).all()
    else:
        # Regular users only see what they have access to.
        corpora = Corpus.for_user(current_user)
        lists = ControlLists.for_user(current_user)
    return render_template_with_nav_info(
        'main/dashboard.html',
        current_user=current_user,
        dashboard_corpora=corpora,
        dashboard_control_lists=lists,
    )
Example #5
0
    def test_db_create(self):
        """ Test that db is created """

        result = self.invoke(["db-create"])
        self.assertIn("Created the database", result.output)
        with self.app.app_context():
            control_list = ControlLists(name="Corpus1")
            db.session.add(control_list)
            # Flush so control_list.id is assigned before we reference it.
            db.session.flush()
            corpus = Corpus(name="Corpus1", control_lists_id=control_list.id)
            db.session.add(corpus)
            db.session.commit()

            self.assertEqual(len(Corpus.query.all()), 1,
                             "There should have been an insert")
Example #6
0
    def add_n_corpora(self, n_corpus: int, **kwargs):
        """Create n_corpus corpora owned by the auto-logged-in admin user."""
        if not self.AUTO_LOG_IN:
            raise Exception("This function only works with autologin")

        admin = User.query.filter(
            User.email == self.app.config['ADMIN_EMAIL']).first()
        headings = ("Lemma", "POS", "Morph", "Similar")
        for index in range(n_corpus):
            # Names are "a" repeated index times (the first one is empty).
            new_corpus = Corpus(
                name="a" * index,
                control_lists_id=1,
                columns=[Column(heading=h) for h in headings],
            )
            ownership = CorpusUser(corpus=new_corpus, user=admin, is_owner=True)
            db.session.add(new_corpus)
            db.session.add(ownership)
            db.session.flush()
        db.session.commit()
Example #7
0
def process_upload_request(self,
                           user_id,
                           bitext_path,
                           src_path,
                           trg_path,
                           src_lang,
                           trg_lang,
                           corpus_name,
                           corpus_desc="",
                           corpus_topic=None):
    """Create and persist a corpus for *user_id* from uploaded files.

    Exactly one upload shape is expected: a single bitext file
    (``bitext_path``, TMX or assumed-TSV), a source/target pair
    (``src_path`` + ``trg_path``), or a monolingual source (``src_path``
    only). Returns True on success; raises Exception (after rolling the
    DB session back) on failure or when source/target line counts differ.
    """
    # Infer the corpus kind from which paths were supplied.
    type = "bitext" if bitext_path else "bilingual" if trg_path else "monolingual"

    def process_file(file, language, corpus, role):
        # Store the uploaded file and attach it to the corpus under the
        # given role ("source" or "target"), recording the language id.
        db_file = data_utils.upload_file(file, language, user_id=user_id)

        if role == "source":
            corpus.user_source_id = language
        else:
            corpus.user_target_id = language

        db.session.add(db_file)
        corpus.corpus_files.append(Corpus_File(db_file, role=role))

        return db_file

    def process_bitext(file):
        # Split a bitext upload into two plain-text files,
        # "<norm_name>-src" and "<norm_name>-trg", one segment per line.
        file_name, file_extension = os.path.splitext(file.filename)
        norm_name = utils.normname(user_id=user_id, filename=file_name)
        tmp_file_fd, tmp_path = utils.tmpfile()
        file.save(tmp_path)

        # Normalize the temp copy before parsing.
        data_utils.convert_file_to_utf8(tmp_path)
        data_utils.fix_file(tmp_path)

        if file_extension == ".tmx":
            with open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'w') as src_file, \
            open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'w') as trg_file, \
            open(tmp_path, 'rb') as tmx_file:
                # Stream-parse the TMX with expat: accumulate character
                # data inside each <seg>; every pair of completed segments
                # is written as one (source, target) line pair.
                inside_tuv = False
                seg_text = []
                tu = []

                def se(name, _):
                    # Start-element handler: entering a <seg>.
                    nonlocal inside_tuv
                    if name == "seg":
                        inside_tuv = True

                def lp(line):
                    # Strip, then collapse CR/LF/tab/FF/VT to spaces.
                    return re.sub(r'[\r\n\t\f\v]', " ", line.strip())

                def ee(name):
                    # End-element handler: </seg> closes one segment; the
                    # second segment of a unit flushes both lines.
                    nonlocal inside_tuv, seg_text, tu, src_file
                    if name == "seg":
                        inside_tuv = False
                        tu.append("".join(seg_text))
                        seg_text = []

                        if len(tu) == 2:
                            print(lp(tu[0]), file=src_file)
                            print(lp(tu[1]), file=trg_file)
                            tu = []

                def cd(data):
                    # Character-data handler: collect text inside a <seg>.
                    nonlocal inside_tuv, seg_text
                    if inside_tuv:
                        seg_text.append(data)

                parser = xml.parsers.expat.ParserCreate()
                parser.StartElementHandler = se
                parser.EndElementHandler = ee
                parser.CharacterDataHandler = cd
                parser.ParseFile(tmx_file)

        else:
            # We assume it is a TSV: column 0 is source, column 1 target.
            with open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'wb') as src_file, \
            open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'wb') as trg_file, \
            open(tmp_path, 'r') as tmp_file:
                for line in tmp_file:
                    cols = line.strip().split('\t')
                    src_file.write((cols[0] + '\n').encode('utf-8'))
                    trg_file.write((cols[1] + '\n').encode('utf-8'))

        # Re-open the split files for reading; the FileStorage wrappers
        # below take over these handles.
        # NOTE(review): nothing here closes these handles explicitly —
        # presumably data_utils.upload_file consumes them; verify.
        src_file = open(utils.filepath('FILES_FOLDER', norm_name + "-src"),
                        'rb')
        trg_file = open(utils.filepath('FILES_FOLDER', norm_name + "-trg"),
                        'rb')

        return FileStorage(src_file, filename=file.filename + "-src"), \
                FileStorage(trg_file, filename=file.filename + "-trg")

    # We create the corpus, retrieve the files and attach them to that corpus
    target_db_file = None
    try:
        corpus = Corpus(name=corpus_name,
                        type="bilingual" if type == "bitext" else type,
                        owner_id=user_id,
                        description=corpus_desc,
                        topic_id=corpus_topic)

        if type == "bitext":
            # A single bitext file is split into src/trg halves first.
            with open(bitext_path, 'rb') as fbitext:
                bitext_file = FileStorage(fbitext,
                                          filename=os.path.basename(
                                              fbitext.name))
                src_file, trg_file = process_bitext(bitext_file)

                source_db_file = process_file(src_file, src_lang, corpus,
                                              'source')
                target_db_file = process_file(trg_file, trg_lang, corpus,
                                              'target')
        else:
            with open(src_path, 'rb') as fsrctext:
                src_file = FileStorage(fsrctext,
                                       filename=os.path.basename(
                                           fsrctext.name))
                source_db_file = process_file(src_file, src_lang, corpus,
                                              'source')

            if type == "bilingual":
                with open(trg_path, 'rb') as ftrgtext:
                    trg_file = FileStorage(ftrgtext,
                                           filename=os.path.basename(
                                               ftrgtext.name))
                    target_db_file = process_file(trg_file, trg_lang, corpus,
                                                  'target')

        db.session.add(corpus)

        # Make the new corpus appear in the owner's library.
        user = User.query.filter_by(id=user_id).first()
        user.user_corpora.append(LibraryCorpora(corpus=corpus, user=user))
    except Exception as e:
        # Roll everything back; the user-facing message hides internals.
        db.session.rollback()
        raise Exception(
            "Something went wrong on our end... Please, try again later")

    if target_db_file:
        # Bilingual corpora must be aligned line-by-line.
        source_lines = utils.file_length(source_db_file.path)
        target_lines = utils.file_length(target_db_file.path)

        if source_lines != target_lines:
            db.session.rollback()
            raise Exception(
                "Source and target file should have the same length")

    db.session.commit()

    return True
Example #8
0
from app.models import ChangeRecord, WordToken, Corpus, ControlLists
from .base import TestModels
import copy

SimilarityFixtures = [
    ControlLists(id=1, name="CL Fixture"),
    Corpus(id=1, name="Fixtures !", control_lists_id=1),
    WordToken(corpus=1,
              form="Cil",
              lemma="celui",
              left_context="_",
              right_context="_",
              label_uniform="celui",
              morph="smn",
              POS="p"),  # 1
    WordToken(corpus=1,
              form="Cil",
              lemma="celle",
              left_context="_",
              right_context="_",
              label_uniform="celle",
              morph="smn",
              POS="n"),  # 2
    WordToken(corpus=1,
              form="Cil",
              lemma="cil",
              left_context="_",
              right_context="_",
              label_uniform="cil",
              morph="smn",
              POS="p"),  # 3
Example #9
0
from app.models import Corpus, WordToken, AllowedLemma, AllowedPOS, AllowedMorph, Column
from app.models import ControlLists, ControlListsUser


# Fixture corpus "Floovant" (id=2), tied to control list id=2 below.
Floovant = Corpus(
    name="Floovant",
    id=2,
    control_lists_id=2
)
# Standard four-column layout for the Floovant corpus.
FloovantColumns = [
    Column(heading="Lemma", corpus_id=2),
    Column(heading="POS", corpus_id=2),
    Column(heading="Morph", corpus_id=2),
    Column(heading="Similar", corpus_id=2),
]
# Control list referenced by Floovant.control_lists_id.
FCL = ControlLists(id=2, name="Floovant")
FloovantTokens = [
    WordToken(corpus=Floovant.id, form="SOIGNORS", lemma="seignor", left_context="", right_context="or escoutez que",
              label_uniform="seignor", morph="NOMB.=p|GENRE=m|CAS=n"),
    WordToken(corpus=Floovant.id, form="or", lemma="or4", left_context="SOIGNORS", right_context="escoutez que Dés",
              label_uniform="or4", morph="DEGRE=-"),
    WordToken(corpus=Floovant.id, form="escoutez", lemma="escouter", left_context="SOIGNORS or",
              right_context="que Dés vos", label_uniform="escouter", morph="MODE=imp|PERS.=2|NOMB.=p"),
    WordToken(corpus=Floovant.id, form="que", lemma="que4", left_context="SOIGNORS or escoutez",
              right_context="Dés vos soit", label_uniform="que4", morph="_"),
    WordToken(corpus=Floovant.id, form="Dés", lemma="dieu", left_context="or escoutez que",
              right_context="vos soit amis", label_uniform="dieu", morph="NOMB.=s|GENRE=m|CAS=n"),
    WordToken(corpus=Floovant.id, form="vos", lemma="vos1", left_context="escoutez que Dés",
              right_context="soit amis III", label_uniform="vos1", morph="PERS.=2|NOMB.=p|GENRE=m|CAS=r"),
    WordToken(corpus=Floovant.id, form="soit", lemma="estre1", left_context="que Dés vos",
              right_context="amis III vers", label_uniform="estre1", morph="MODE=sub|TEMPS=pst|PERS.=3|NOMB.=s"),
Example #10
0
from app.models import Corpus, WordToken, Column
from app.models import ControlLists

# Latin fixture: control list id=3 and the "Priapees" corpus (id=3)
# that references it.
control_list = ControlLists(id=3, name="Latin")
corpus = Corpus(
    name="Priapees",
    id=3,
    control_lists_id=control_list.id,
)
# Standard four-column layout for the Priapees corpus.
PriapeeColumns = [
    Column(heading="Lemma", corpus_id=3),
    Column(heading="POS", corpus_id=3),
    Column(heading="Morph", corpus_id=3),
    Column(heading="Similar", corpus_id=3),
]
tokens = [
    WordToken(corpus=corpus.id, form="Carminis", lemma="carmen1", POS="NOMcom", left_context="Carminis incompti lusus lecture", right_context="procaces ,", label_uniform="carmen1", morph="Case=Gen|Numb=Sing"),
    WordToken(corpus=corpus.id, form="incompti", lemma="incomptus", POS="ADJqua", left_context="Carminis incompti lusus lecture", right_context="procaces , conueniens", label_uniform="incomptus", morph="Case=Gen|Numb=Sing|Deg=Pos"),
    WordToken(corpus=corpus.id, form="lusus", lemma="lusus", POS="NOMcom", left_context="Carminis incompti lusus lecture", right_context="procaces , conueniens Latio", label_uniform="lusus", morph="Case=Gen|Numb=Sing"),
    WordToken(corpus=corpus.id, form="lecture", lemma="lego?", POS="VER", left_context="Carminis incompti lusus lecture", right_context="procaces , conueniens Latio pone", label_uniform="lego?", morph="Case=Voc|Numb=Sing|Mood=Par|Voice=Act"),
    WordToken(corpus=corpus.id, form="procaces", lemma="procax", POS="ADJqua", left_context="Carminis incompti lusus lecture", right_context="procaces , conueniens Latio pone supercilium", label_uniform="procax", morph="Case=Acc|Numb=Plur|Deg=Pos"),
    WordToken(corpus=corpus.id, form=",", lemma=",", POS="PUNC", left_context="Carminis incompti lusus lecture", right_context="procaces , conueniens Latio pone supercilium .", label_uniform=",", morph="MORPH=empty"),
    WordToken(corpus=corpus.id, form="conueniens", lemma="conueniens", POS="ADJqua", left_context="incompti lusus lecture procaces", right_context=", conueniens Latio pone supercilium . non", label_uniform="conueniens", morph="Case=Nom|Numb=Sing|Deg=Pos"),
    WordToken(corpus=corpus.id, form="Latio", lemma="latio", POS="NOMcom", left_context="lusus lecture procaces ,", right_context="conueniens Latio pone supercilium . non soror", label_uniform="latio", morph="Case=Nom|Numb=Sing"),
    WordToken(corpus=corpus.id, form="pone", lemma="pono", POS="VER", left_context="lecture procaces , conueniens", right_context="Latio pone supercilium . non soror hoc", label_uniform="pono", morph="Numb=Sing|Mood=Imp|Tense=Pres|Voice=Act|Person=2"),
    WordToken(corpus=corpus.id, form="supercilium", lemma="supercilium", POS="NOMcom", left_context="procaces , conueniens Latio", right_context="pone supercilium . non soror hoc habitat", label_uniform="supercilium", morph="Case=Acc|Numb=Sing"),
    WordToken(corpus=corpus.id, form=".", lemma=".", POS="PUNC", left_context=", conueniens Latio pone", right_context="supercilium . non soror hoc habitat Phoebi", label_uniform=".", morph="MORPH=empty"),
    WordToken(corpus=corpus.id, form="non", lemma="non", POS="ADVneg", left_context="conueniens Latio pone supercilium", right_context=". non soror hoc habitat Phoebi ,", label_uniform="non", morph="MORPH=empty"),
    WordToken(corpus=corpus.id, form="soror", lemma="soror", POS="NOMcom", left_context="Latio pone supercilium .", right_context="non soror hoc habitat Phoebi , non", label_uniform="soror", morph="Case=Nom|Numb=Sing"),
    WordToken(corpus=corpus.id, form="hoc", lemma="hic1", POS="PROdem", left_context="pone supercilium . non", right_context="soror hoc habitat Phoebi , non uesta", label_uniform="hic1", morph="Case=Nom|Numb=Sing"),
    WordToken(corpus=corpus.id, form="habitat", lemma="habito", POS="VER", left_context="supercilium . non soror", right_context="hoc habitat Phoebi , non uesta sacello", label_uniform="habito", morph="Numb=Sing|Mood=Ind|Tense=Pres|Voice=Act|Person=3"),
Example #11
0
def process_upload_request(self,
                           user_id,
                           bitext_path,
                           src_path,
                           trg_path,
                           src_lang,
                           trg_lang,
                           corpus_name,
                           corpus_desc="",
                           corpus_topic=None):
    """Create and persist a corpus for *user_id* from uploaded files.

    Exactly one upload shape is expected: a single bitext file
    (``bitext_path``, TMX or assumed-TSV), a source/target pair
    (``src_path`` + ``trg_path``), or a monolingual source (``src_path``
    only). Returns True on success; raises Exception (after rolling the
    DB session back) on failure or when source/target line counts differ.
    """
    # Infer the corpus kind from which paths were supplied.
    type = "bitext" if bitext_path else "bilingual" if trg_path else "monolingual"

    def process_file(file, language, corpus, role):
        # Store the uploaded file and attach it to the corpus under the
        # given role ("source" or "target"), recording the language id.
        db_file = data_utils.upload_file(file, language, user_id=user_id)

        if role == "source":
            corpus.source_id = language
        else:
            corpus.target_id = language

        db.session.add(db_file)
        corpus.corpus_files.append(Corpus_File(db_file, role=role))

        return db_file

    def process_bitext(file):
        # Split a bitext upload into two plain-text files,
        # "<norm_name>-src" and "<norm_name>-trg", one segment per line.
        file_name, file_extension = os.path.splitext(file.filename)
        norm_name = utils.normname(user_id=user_id, filename=file_name)
        tmp_file_fd, tmp_path = utils.tmpfile()
        file.save(tmp_path)

        if file_extension == ".tmx":
            with open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'wb') as src_file, \
            open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'wb') as trg_file, \
            open(tmp_path, 'r') as tmx_file:
                # Parse the whole TMX tree, then walk the translation
                # units: tuv 0 is source, tuv 1 is target; extras ignored.
                tmx = etree.parse(tmx_file, etree.XMLParser())
                body = tmx.getroot().find("body")

                for tu in body.findall('.//tu'):
                    for i, tuv in enumerate(tu.findall('.//tuv')):
                        if i > 1: break
                        # Collapse CR/LF/tab/FF/VT inside a segment.
                        line = tuv.find("seg").text.strip()
                        line = re.sub(r'[\r\n\t\f\v]', " ", line)
                        dest_file = src_file if i == 0 else trg_file

                        dest_file.write(line.encode('utf-8'))
                        dest_file.write(os.linesep.encode('utf-8'))
        else:
            # We assume it is a TSV: column 0 is source, column 1 target.
            with open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'wb') as src_file, \
            open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'wb') as trg_file, \
            open(tmp_path, 'r') as tmp_file:
                for line in tmp_file:
                    cols = line.strip().split('\t')
                    src_file.write((cols[0] + '\n').encode('utf-8'))
                    trg_file.write((cols[1] + '\n').encode('utf-8'))

        # Re-open the split files for reading; the FileStorage wrappers
        # below take over these handles.
        # NOTE(review): nothing here closes these handles explicitly —
        # presumably data_utils.upload_file consumes them; verify.
        src_file = open(utils.filepath('FILES_FOLDER', norm_name + "-src"),
                        'rb')
        trg_file = open(utils.filepath('FILES_FOLDER', norm_name + "-trg"),
                        'rb')

        return FileStorage(src_file, filename=file.filename + "-src"), \
                FileStorage(trg_file, filename=file.filename + "-trg")

    # We create the corpus, retrieve the files and attach them to that corpus
    target_db_file = None
    try:
        corpus = Corpus(name=corpus_name,
                        type="bilingual" if type == "bitext" else type,
                        owner_id=user_id,
                        description=corpus_desc,
                        topic_id=corpus_topic)

        if type == "bitext":
            # A single bitext file is split into src/trg halves first.
            with open(bitext_path, 'rb') as fbitext:
                bitext_file = FileStorage(fbitext,
                                          filename=os.path.basename(
                                              fbitext.name))
                src_file, trg_file = process_bitext(bitext_file)

                source_db_file = process_file(src_file, src_lang, corpus,
                                              'source')
                target_db_file = process_file(trg_file, trg_lang, corpus,
                                              'target')
        else:
            with open(src_path, 'rb') as fsrctext:
                src_file = FileStorage(fsrctext,
                                       filename=os.path.basename(
                                           fsrctext.name))
                source_db_file = process_file(src_file, src_lang, corpus,
                                              'source')

            if type == "bilingual":
                with open(trg_path, 'rb') as ftrgtext:
                    trg_file = FileStorage(ftrgtext,
                                           filename=os.path.basename(
                                               ftrgtext.name))
                    target_db_file = process_file(trg_file, trg_lang, corpus,
                                                  'target')

        db.session.add(corpus)

        # Make the new corpus appear in the owner's library.
        user = User.query.filter_by(id=user_id).first()
        user.user_corpora.append(LibraryCorpora(corpus=corpus, user=user))
    except Exception as e:
        # Roll everything back; the user-facing message hides internals.
        db.session.rollback()
        raise Exception(
            "Something went wrong on our end... Please, try again later")

    if target_db_file:
        # Bilingual corpora must be aligned line-by-line.
        source_lines = utils.file_length(source_db_file.path)
        target_lines = utils.file_length(target_db_file.path)

        if source_lines != target_lines:
            db.session.rollback()
            raise Exception(
                "Source and target file should have the same length")

    db.session.commit()

    return True