def join_corpora(list_name, phase, source_lang, target_lang):
    # user_id, params, used_corpora and engine come from the enclosing scope.
    corpus = Corpus(owner_id=user_id, visible=False)
    source_lang_id = UserLanguage.query.filter_by(
        user_id=user_id, code=source_lang).one().id
    for train_corpus in params[list_name]:
        corpus_data = json.loads(train_corpus)
        corpus_id = corpus_data['id']
        corpus_size = corpus_data['size']
        if corpus_id not in used_corpora:
            used_corpora[corpus_id] = 0
        og_corpus = Corpus.query.filter_by(id=corpus_id).first()
        # We relate the original corpus to this engine in the database, for
        # informational purposes. This way the user will be able to know
        # which corpora were used to train the engine.
        engine.engine_corpora.append(
            Corpus_Engine(corpus=og_corpus, engine=engine, phase=phase,
                          is_info=True, selected_size=corpus_size))
        corpus.user_source_id = og_corpus.user_source_id
        corpus.user_target_id = og_corpus.user_target_id
        for file_entry in og_corpus.corpus_files:
            with open(file_entry.file.path, 'rb') as file_d:
                db_file = data_utils.upload_file(
                    FileStorage(stream=file_d, filename=file_entry.file.name),
                    file_entry.file.user_language_id,
                    selected_size=corpus_size,
                    offset=used_corpora[corpus_id],
                    user_id=user_id)
                corpus.corpus_files.append(
                    Corpus_File(
                        db_file,
                        role="source" if file_entry.file.user_language_id == source_lang_id
                        else "target"))
        used_corpora[corpus_id] += corpus_size

    try:
        db.session.add(corpus)
        db.session.commit()
    except Exception:
        db.session.rollback()
        raise

    # We put the contents of the several files into a single new one,
    # and we shuffle the sentences.
    try:
        data_utils.join_corpus_files(corpus, shuffle=True, user_id=user_id)
    except Exception:
        db.session.delete(corpus)
        db.session.commit()
        raise

    return corpus
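# Illustrative only (not from the original source): join_corpora above expects
# each entry of params[list_name] to be a JSON string carrying the corpus id
# and the number of sentences to draw from it. A hypothetical training list:
#
#   params["training_corpora"] = [
#       '{"id": 7, "size": 100000}',
#       '{"id": 12, "size": 50000}',
#   ]
#
# Only the "id" and "size" keys are implied by the json.loads() calls in the
# function; the list name and values here are made up.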
def test_find(tmpdir):
    corpus = Corpus(str(tmpdir))
    p = tmpdir.join("Andy_Warhol.rst")
    content = ("***********\n"
               "Andy Warhol\n"
               "***********\n"
               "**Andy Warhol** was an artist.")
    p.write(content)
    doc = corpus.find("Andy Warhol")
    assert doc.filename == "Andy_Warhol"
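# A minimal sketch of a find() that would satisfy the test above (hypothetical;
# the real Corpus class may match on document titles rather than filenames).
# It assumes documents live as <title with spaces replaced by underscores>.rst
# directly under the corpus directory.
import os


class Document:
    def __init__(self, filename):
        self.filename = filename


class Corpus:
    def __init__(self, path):
        self.path = path

    def find(self, title):
        # Map "Andy Warhol" -> "Andy_Warhol" and look for the .rst file.
        stem = title.replace(" ", "_")
        if os.path.exists(os.path.join(self.path, stem + ".rst")):
            return Document(filename=stem)
        return None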
def dashboard():
    """Admin dashboard page."""
    if current_user.is_admin():
        corpora = db.session.query(Corpus).all()
        control_lists = db.session.query(ControlLists).all()
    else:
        corpora = Corpus.for_user(current_user)
        control_lists = ControlLists.for_user(current_user)
    return render_template_with_nav_info(
        'main/dashboard.html',
        current_user=current_user,
        dashboard_corpora=corpora,
        dashboard_control_lists=control_lists)
def test_db_create(self):
    """ Test that db is created """
    result = self.invoke(["db-create"])
    self.assertIn("Created the database", result.output)
    with self.app.app_context():
        cl = ControlLists(name="Corpus1")
        db.session.add(cl)
        db.session.flush()
        db.session.add(Corpus(name="Corpus1", control_lists_id=cl.id))
        db.session.commit()
        self.assertEqual(len(Corpus.query.all()), 1,
                         "There should have been an insert")
def add_n_corpora(self, n_corpus: int, **kwargs):
    if not self.AUTO_LOG_IN:
        raise Exception("This function only works with autologin")
    user = User.query.filter(
        User.email == self.app.config['ADMIN_EMAIL']).first()
    for n in range(n_corpus):
        corpus = Corpus(
            name="a" * n,
            control_lists_id=1,
            columns=[
                Column(heading="Lemma"),
                Column(heading="POS"),
                Column(heading="Morph"),
                Column(heading="Similar"),
            ])
        new_cu = CorpusUser(corpus=corpus, user=user, is_owner=True)
        db.session.add(corpus)
        db.session.add(new_cu)
        db.session.flush()
    db.session.commit()
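# Hypothetical usage sketch (the class and test names are made up; it assumes
# a test base class providing AUTO_LOG_IN, self.app and the admin fixture that
# the helper above requires):
#
#   class TestDashboard(TestBase):
#       def test_lists_all_corpora(self):
#           self.add_n_corpora(5)  # seeds 5 corpora owned by the admin user
#           ...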
def process_upload_request(self, user_id, bitext_path, src_path, trg_path,
                           src_lang, trg_lang, corpus_name, corpus_desc="",
                           corpus_topic=None):
    type = "bitext" if bitext_path else "bilingual" if trg_path else "monolingual"

    def process_file(file, language, corpus, role):
        db_file = data_utils.upload_file(file, language, user_id=user_id)
        if role == "source":
            corpus.user_source_id = language
        else:
            corpus.user_target_id = language
        db.session.add(db_file)
        corpus.corpus_files.append(Corpus_File(db_file, role=role))
        return db_file

    def process_bitext(file):
        file_name, file_extension = os.path.splitext(file.filename)
        norm_name = utils.normname(user_id=user_id, filename=file_name)
        tmp_file_fd, tmp_path = utils.tmpfile()
        file.save(tmp_path)
        data_utils.convert_file_to_utf8(tmp_path)
        data_utils.fix_file(tmp_path)
        if file_extension == ".tmx":
            with open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'w') as src_file, \
                    open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'w') as trg_file, \
                    open(tmp_path, 'rb') as tmx_file:
                inside_tuv = False
                seg_text = []
                tu = []

                def se(name, _):
                    nonlocal inside_tuv
                    if name == "seg":
                        inside_tuv = True

                def lp(line):
                    return re.sub(r'[\r\n\t\f\v]', " ", line.strip())

                def ee(name):
                    nonlocal inside_tuv, seg_text, tu, src_file
                    if name == "seg":
                        inside_tuv = False
                        tu.append("".join(seg_text))
                        seg_text = []
                        if len(tu) == 2:
                            print(lp(tu[0]), file=src_file)
                            print(lp(tu[1]), file=trg_file)
                            tu = []

                def cd(data):
                    nonlocal inside_tuv, seg_text
                    if inside_tuv:
                        seg_text.append(data)

                parser = xml.parsers.expat.ParserCreate()
                parser.StartElementHandler = se
                parser.EndElementHandler = ee
                parser.CharacterDataHandler = cd
                parser.ParseFile(tmx_file)
        else:  # We assume it is a TSV
            with open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'wb') as src_file, \
                    open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'wb') as trg_file, \
                    open(tmp_path, 'r') as tmp_file:
                for line in tmp_file:
                    cols = line.strip().split('\t')
                    src_file.write((cols[0] + '\n').encode('utf-8'))
                    trg_file.write((cols[1] + '\n').encode('utf-8'))
        src_file = open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'rb')
        trg_file = open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'rb')
        return FileStorage(src_file, filename=file.filename + "-src"), \
            FileStorage(trg_file, filename=file.filename + "-trg")

    # We create the corpus, retrieve the files and attach them to that corpus.
    target_db_file = None
    try:
        corpus = Corpus(name=corpus_name,
                        type="bilingual" if type == "bitext" else type,
                        owner_id=user_id, description=corpus_desc,
                        topic_id=corpus_topic)
        if type == "bitext":
            with open(bitext_path, 'rb') as fbitext:
                bitext_file = FileStorage(
                    fbitext, filename=os.path.basename(fbitext.name))
                src_file, trg_file = process_bitext(bitext_file)
                source_db_file = process_file(src_file, src_lang, corpus, 'source')
                target_db_file = process_file(trg_file, trg_lang, corpus, 'target')
        else:
            with open(src_path, 'rb') as fsrctext:
                src_file = FileStorage(
                    fsrctext, filename=os.path.basename(fsrctext.name))
                source_db_file = process_file(src_file, src_lang, corpus, 'source')
            if type == "bilingual":
                with open(trg_path, 'rb') as ftrgtext:
                    trg_file = FileStorage(
                        ftrgtext, filename=os.path.basename(ftrgtext.name))
                    target_db_file = process_file(trg_file, trg_lang, corpus, 'target')
        db.session.add(corpus)
        user = User.query.filter_by(id=user_id).first()
        user.user_corpora.append(LibraryCorpora(corpus=corpus, user=user))
    except Exception as e:
        db.session.rollback()
        raise Exception(
            "Something went wrong on our end... Please, try again later") from e

    if target_db_file:
        source_lines = utils.file_length(source_db_file.path)
        target_lines = utils.file_length(target_db_file.path)
        if source_lines != target_lines:
            db.session.rollback()
            raise Exception(
                "Source and target file should have the same length")
    db.session.commit()
    return True
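# A minimal TMX file of the shape the expat handlers above walk (illustrative;
# real TMX files usually carry more header metadata): each <tu> holds one
# <tuv> per language, and the <seg> texts of a completed pair become one line
# in the -src file and one line in the -trg file.
#
#   <?xml version="1.0" encoding="UTF-8"?>
#   <tmx version="1.4">
#     <header creationtool="example" segtype="sentence" adminlang="en"
#             srclang="en" datatype="plaintext"/>
#     <body>
#       <tu>
#         <tuv xml:lang="en"><seg>Hello world.</seg></tuv>
#         <tuv xml:lang="es"><seg>Hola mundo.</seg></tuv>
#       </tu>
#     </body>
#   </tmx>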
from app.models import ChangeRecord, WordToken, Corpus, ControlLists
from .base import TestModels
import copy

SimilarityFixtures = [
    ControlLists(id=1, name="CL Fixture"),
    Corpus(id=1, name="Fixtures !", control_lists_id=1),
    WordToken(corpus=1, form="Cil", lemma="celui", left_context="_",
              right_context="_", label_uniform="celui", morph="smn",
              POS="p"),  # 1
    WordToken(corpus=1, form="Cil", lemma="celle", left_context="_",
              right_context="_", label_uniform="celle", morph="smn",
              POS="n"),  # 2
    WordToken(corpus=1, form="Cil", lemma="cil", left_context="_",
              right_context="_", label_uniform="cil", morph="smn",
              POS="p"),  # 3
]
from app.models import Corpus, WordToken, AllowedLemma, AllowedPOS, AllowedMorph, Column
from app.models import ControlLists

Floovant = Corpus(
    name="Floovant",
    id=2,
    control_lists_id=2
)

FloovantColumns = [
    Column(heading="Lemma", corpus_id=2),
    Column(heading="POS", corpus_id=2),
    Column(heading="Morph", corpus_id=2),
    Column(heading="Similar", corpus_id=2),
]

FCL = ControlLists(id=2, name="Floovant")

FloovantTokens = [
    WordToken(corpus=Floovant.id, form="SOIGNORS", lemma="seignor",
              left_context="", right_context="or escoutez que",
              label_uniform="seignor", morph="NOMB.=p|GENRE=m|CAS=n"),
    WordToken(corpus=Floovant.id, form="or", lemma="or4",
              left_context="SOIGNORS", right_context="escoutez que Dés",
              label_uniform="or4", morph="DEGRE=-"),
    WordToken(corpus=Floovant.id, form="escoutez", lemma="escouter",
              left_context="SOIGNORS or", right_context="que Dés vos",
              label_uniform="escouter", morph="MODE=imp|PERS.=2|NOMB.=p"),
    WordToken(corpus=Floovant.id, form="que", lemma="que4",
              left_context="SOIGNORS or escoutez", right_context="Dés vos soit",
              label_uniform="que4", morph="_"),
    WordToken(corpus=Floovant.id, form="Dés", lemma="dieu",
              left_context="or escoutez que", right_context="vos soit amis",
              label_uniform="dieu", morph="NOMB.=s|GENRE=m|CAS=n"),
    WordToken(corpus=Floovant.id, form="vos", lemma="vos1",
              left_context="escoutez que Dés", right_context="soit amis III",
              label_uniform="vos1", morph="PERS.=2|NOMB.=p|GENRE=m|CAS=r"),
    WordToken(corpus=Floovant.id, form="soit", lemma="estre1",
              left_context="que Dés vos", right_context="amis III vers",
              label_uniform="estre1", morph="MODE=sub|TEMPS=pst|PERS.=3|NOMB.=s"),
]
from app.models import Corpus, WordToken, Column
from app.models import ControlLists

control_list = ControlLists(id=3, name="Latin")

corpus = Corpus(
    name="Priapees",
    id=3,
    control_lists_id=control_list.id,
)

PriapeeColumns = [
    Column(heading="Lemma", corpus_id=3),
    Column(heading="POS", corpus_id=3),
    Column(heading="Morph", corpus_id=3),
    Column(heading="Similar", corpus_id=3),
]

tokens = [
    WordToken(corpus=corpus.id, form="Carminis", lemma="carmen1", POS="NOMcom",
              left_context="Carminis incompti lusus lecture",
              right_context="procaces ,",
              label_uniform="carmen1", morph="Case=Gen|Numb=Sing"),
    WordToken(corpus=corpus.id, form="incompti", lemma="incomptus", POS="ADJqua",
              left_context="Carminis incompti lusus lecture",
              right_context="procaces , conueniens",
              label_uniform="incomptus", morph="Case=Gen|Numb=Sing|Deg=Pos"),
    WordToken(corpus=corpus.id, form="lusus", lemma="lusus", POS="NOMcom",
              left_context="Carminis incompti lusus lecture",
              right_context="procaces , conueniens Latio",
              label_uniform="lusus", morph="Case=Gen|Numb=Sing"),
    WordToken(corpus=corpus.id, form="lecture", lemma="lego?", POS="VER",
              left_context="Carminis incompti lusus lecture",
              right_context="procaces , conueniens Latio pone",
              label_uniform="lego?", morph="Case=Voc|Numb=Sing|Mood=Par|Voice=Act"),
    WordToken(corpus=corpus.id, form="procaces", lemma="procax", POS="ADJqua",
              left_context="Carminis incompti lusus lecture",
              right_context="procaces , conueniens Latio pone supercilium",
              label_uniform="procax", morph="Case=Acc|Numb=Plur|Deg=Pos"),
    WordToken(corpus=corpus.id, form=",", lemma=",", POS="PUNC",
              left_context="Carminis incompti lusus lecture",
              right_context="procaces , conueniens Latio pone supercilium .",
              label_uniform=",", morph="MORPH=empty"),
    WordToken(corpus=corpus.id, form="conueniens", lemma="conueniens", POS="ADJqua",
              left_context="incompti lusus lecture procaces",
              right_context=", conueniens Latio pone supercilium . non",
              label_uniform="conueniens", morph="Case=Nom|Numb=Sing|Deg=Pos"),
    WordToken(corpus=corpus.id, form="Latio", lemma="latio", POS="NOMcom",
              left_context="lusus lecture procaces ,",
              right_context="conueniens Latio pone supercilium . non soror",
              label_uniform="latio", morph="Case=Nom|Numb=Sing"),
    WordToken(corpus=corpus.id, form="pone", lemma="pono", POS="VER",
              left_context="lecture procaces , conueniens",
              right_context="Latio pone supercilium . non soror hoc",
              label_uniform="pono", morph="Numb=Sing|Mood=Imp|Tense=Pres|Voice=Act|Person=2"),
    WordToken(corpus=corpus.id, form="supercilium", lemma="supercilium", POS="NOMcom",
              left_context="procaces , conueniens Latio",
              right_context="pone supercilium . non soror hoc habitat",
              label_uniform="supercilium", morph="Case=Acc|Numb=Sing"),
    WordToken(corpus=corpus.id, form=".", lemma=".", POS="PUNC",
              left_context=", conueniens Latio pone",
              right_context="supercilium . non soror hoc habitat Phoebi",
              label_uniform=".", morph="MORPH=empty"),
    WordToken(corpus=corpus.id, form="non", lemma="non", POS="ADVneg",
              left_context="conueniens Latio pone supercilium",
              right_context=". non soror hoc habitat Phoebi ,",
              label_uniform="non", morph="MORPH=empty"),
    WordToken(corpus=corpus.id, form="soror", lemma="soror", POS="NOMcom",
              left_context="Latio pone supercilium .",
              right_context="non soror hoc habitat Phoebi , non",
              label_uniform="soror", morph="Case=Nom|Numb=Sing"),
    WordToken(corpus=corpus.id, form="hoc", lemma="hic1", POS="PROdem",
              left_context="pone supercilium . non",
              right_context="soror hoc habitat Phoebi , non uesta",
              label_uniform="hic1", morph="Case=Nom|Numb=Sing"),
    WordToken(corpus=corpus.id, form="habitat", lemma="habito", POS="VER",
              left_context="supercilium . non soror",
              right_context="hoc habitat Phoebi , non uesta sacello",
              label_uniform="habito", morph="Numb=Sing|Mood=Ind|Tense=Pres|Voice=Act|Person=3"),
]
def process_upload_request(self, user_id, bitext_path, src_path, trg_path,
                           src_lang, trg_lang, corpus_name, corpus_desc="",
                           corpus_topic=None):
    type = "bitext" if bitext_path else "bilingual" if trg_path else "monolingual"

    def process_file(file, language, corpus, role):
        db_file = data_utils.upload_file(file, language, user_id=user_id)
        if role == "source":
            corpus.source_id = language
        else:
            corpus.target_id = language
        db.session.add(db_file)
        corpus.corpus_files.append(Corpus_File(db_file, role=role))
        return db_file

    def process_bitext(file):
        file_name, file_extension = os.path.splitext(file.filename)
        norm_name = utils.normname(user_id=user_id, filename=file_name)
        tmp_file_fd, tmp_path = utils.tmpfile()
        file.save(tmp_path)
        if file_extension == ".tmx":
            with open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'wb') as src_file, \
                    open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'wb') as trg_file, \
                    open(tmp_path, 'r') as tmx_file:
                tmx = etree.parse(tmx_file, etree.XMLParser())
                body = tmx.getroot().find("body")
                for tu in body.findall('.//tu'):
                    for i, tuv in enumerate(tu.findall('.//tuv')):
                        if i > 1:
                            break
                        line = tuv.find("seg").text.strip()
                        line = re.sub(r'[\r\n\t\f\v]', " ", line)
                        dest_file = src_file if i == 0 else trg_file
                        dest_file.write(line.encode('utf-8'))
                        dest_file.write(os.linesep.encode('utf-8'))
        else:  # We assume it is a TSV
            with open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'wb') as src_file, \
                    open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'wb') as trg_file, \
                    open(tmp_path, 'r') as tmp_file:
                for line in tmp_file:
                    cols = line.strip().split('\t')
                    src_file.write((cols[0] + '\n').encode('utf-8'))
                    trg_file.write((cols[1] + '\n').encode('utf-8'))
        src_file = open(utils.filepath('FILES_FOLDER', norm_name + "-src"), 'rb')
        trg_file = open(utils.filepath('FILES_FOLDER', norm_name + "-trg"), 'rb')
        return FileStorage(src_file, filename=file.filename + "-src"), \
            FileStorage(trg_file, filename=file.filename + "-trg")

    # We create the corpus, retrieve the files and attach them to that corpus.
    target_db_file = None
    try:
        corpus = Corpus(name=corpus_name,
                        type="bilingual" if type == "bitext" else type,
                        owner_id=user_id, description=corpus_desc,
                        topic_id=corpus_topic)
        if type == "bitext":
            with open(bitext_path, 'rb') as fbitext:
                bitext_file = FileStorage(
                    fbitext, filename=os.path.basename(fbitext.name))
                src_file, trg_file = process_bitext(bitext_file)
                source_db_file = process_file(src_file, src_lang, corpus, 'source')
                target_db_file = process_file(trg_file, trg_lang, corpus, 'target')
        else:
            with open(src_path, 'rb') as fsrctext:
                src_file = FileStorage(
                    fsrctext, filename=os.path.basename(fsrctext.name))
                source_db_file = process_file(src_file, src_lang, corpus, 'source')
            if type == "bilingual":
                with open(trg_path, 'rb') as ftrgtext:
                    trg_file = FileStorage(
                        ftrgtext, filename=os.path.basename(ftrgtext.name))
                    target_db_file = process_file(trg_file, trg_lang, corpus, 'target')
        db.session.add(corpus)
        user = User.query.filter_by(id=user_id).first()
        user.user_corpora.append(LibraryCorpora(corpus=corpus, user=user))
    except Exception as e:
        db.session.rollback()
        raise Exception(
            "Something went wrong on our end... Please, try again later") from e

    if target_db_file:
        source_lines = utils.file_length(source_db_file.path)
        target_lines = utils.file_length(target_db_file.path)
        if source_lines != target_lines:
            db.session.rollback()
            raise Exception(
                "Source and target file should have the same length")
    db.session.commit()
    return True
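# Hypothetical call sketch (the receiver name and argument values are made up;
# src_lang/trg_lang are the language ids that process_file() stores on the
# corpus record):
#
#   uploader.process_upload_request(
#       user_id=42,
#       bitext_path="/tmp/memories.tmx",  # a non-empty path selects the "bitext" branch
#       src_path=None,
#       trg_path=None,
#       src_lang=1,
#       trg_lang=2,
#       corpus_name="News v1",
#       corpus_desc="Crawled news, aligned and deduplicated",
#   )
#
# Passing src_path and trg_path instead selects the "bilingual" branch, and a
# lone src_path selects "monolingual".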