def add_morpheme_paradigm(self, script: Script, translations, comments):
    """Declare ``script`` as a non-root paradigm and store its descriptors.

    The script must contain more than one singular sequence, must not
    already be in the dictionary, and all of its singular sequences must
    belong to one and the same root paradigm.

    :param script: the paradigm to add (validated by ``_check_script``).
    :param translations: mapping language -> iterable of translations.
    :param comments: mapping language -> iterable of comments.
    :raises ValueError: when the script is not a paradigm, is already
        defined, or does not map to exactly one root paradigm.
    """
    db = IEMLDatabase(folder=self.gitdb.folder,
                      use_cache=self.use_cache,
                      cache_folder=self.cache_folder)
    dictionary = db.get_dictionary()

    script = _check_script(script)
    if len(script) == 1:
        raise ValueError(
            "The script is not a paradigm {}, can't use it to define a paradigm."
            .format(str(script)))

    if script in dictionary.scripts:
        raise ValueError(
            "Script {} already defined in the dictionary".format(str(script)))

    # Collect the root paradigm of every singular sequence of the script.
    root_candidates = set()
    for sequence in script.singular_sequences:
        try:
            root_candidates.add(dictionary.tables.root(sequence))
        except KeyError:
            raise ValueError(
                "No root paradigms contains this script {}".format(str(script)))

    if len(root_candidates) != 1:
        raise ValueError(
            "No root paradigms or too many for script {}".format(str(script)))

    (root,) = root_candidates
    descriptors = db.get_descriptors()

    def translations_summary(target):
        # "<lang>:<t1>, <t2> / <lang>:..." for the translations currently stored.
        return " / ".join(
            "{}:{}".format(
                lang,
                ', '.join(descriptors.get_values(target, lang, 'translations')))
            for lang in LANGUAGES)

    message = "[dictionary] Create paradigm {} ({}) for root paradigm {} ({})".format(
        str(script), translations_summary(script),
        str(root), translations_summary(root))

    with self.gitdb.commit(self.signature, message):
        # Start from a clean slate before writing the new definition.
        db.remove_descriptor(script)
        db.remove_structure(script)

        db.add_structure(script, 'is_root', False)

        for lang in LANGUAGES:
            for text in translations[lang]:
                db.add_descriptor(script,
                                  language=lang,
                                  descriptor='translations',
                                  value=text)
            for text in comments[lang]:
                db.add_descriptor(script,
                                  language=lang,
                                  descriptor='comments',
                                  value=text)
def delete_morpheme_root_paradigm(self, script: Script, empty_descriptors=True):
    """Remove the structure of a root paradigm.

    :param script: the root paradigm to delete (validated by
        ``_check_script``).
    :param empty_descriptors: when true, also remove the descriptors of
        every script returned by ``relations.object(script, 'contains')``.
    :raises ValueError: when ``script`` is not one of the dictionary roots.
    """
    db = IEMLDatabase(folder=self.gitdb.folder,
                      use_cache=self.use_cache,
                      cache_folder=self.cache_folder)
    dictionary = db.get_dictionary()
    descriptors = db.get_descriptors()

    script = _check_script(script)
    if script not in dictionary.tables.roots:
        raise ValueError(
            "Script {} is not a root paradigm".format(str(script)))

    per_language = [
        "{}:{}".format(
            lang,
            ', '.join(descriptors.get_values(script, lang, 'translations')))
        for lang in LANGUAGES
    ]
    message = "[dictionary] Remove root paradigm {} ({})".format(
        str(script), " / ".join(per_language))

    with self.gitdb.commit(self.signature, message):
        db.remove_structure(script)

        if not empty_descriptors:
            return

        # Materialise the relation first, then drop each contained descriptor.
        contained_scripts = list(dictionary.relations.object(script, 'contains'))
        for contained in contained_scripts:
            db.remove_descriptor(contained)
def test_merge_conflict(self):
    """Pull B into A after both committed different values for one ieml.

    Checks that after the pull A holds B's value for the contested ieml
    and that A's own value is reported in the returned conflicts.
    """

    def descriptor_values(suffix):
        # Same single value 'test<suffix>' for every descriptor and language.
        return {
            kind: {lang: ['test' + suffix] for lang in ('fr', 'en')}
            for kind in ('translations', 'comments', 'tags')
        }

    # clone two repositories
    gitA, gitB = init_repo(['/tmp/iemldb_test/A', '/tmp/iemldb_test/B'])

    print("Building DB...")
    dbA = IEMLDatabase(folder=gitA.folder)
    dbB = IEMLDatabase(folder=gitB.folder)

    # commit two different values for the same ieml
    ieml = "(a.)"
    valueA = descriptor_values('A')
    commit(gitA, dbA, ieml, 'A', valueA)

    valueB = descriptor_values('B')
    commit(gitB, dbB, ieml, 'B', valueB)

    # then commit the same value on both sides
    ieml2 = '(b.)'
    commit(gitA, dbA, ieml2, 'A', valueB)
    commit(gitB, dbB, ieml2, 'B', valueB)

    # then a commit that exists only on A
    ieml3 = '(s.)'
    commit(gitA, dbA, ieml3, 'A', valueA)

    # register B as a remote of A and pull from it
    remote_path = os.path.join(gitB.folder, '.git')
    gitA.add_remote('B', remote_path)

    print("Pulling A from B")
    conflicts = gitA.pull('B')
    print(conflicts)

    # reload A from disk to observe the merged state
    dbA = IEMLDatabase(folder=gitA.folder)
    descA = dbA.get_descriptors()

    self.assertDictEqual(descA.get_descriptor(ieml), valueB)
    self.assertDictEqual(conflicts[ieml], valueA)
def migrate(function, _s_old, _s_new):
    """Rename every dictionary script that ``function`` maps to a new script.

    A database is set up in ``/tmp/migrate_script_iemldb`` (the folder is
    wiped first if it exists). Each script ``s`` with ``function(s) != s``
    has its structure and descriptors removed and re-added under
    ``function(s)``, all inside a single ``gitdb.commit`` context.

    :param function: callable mapping a script to its migrated form.
    :param _s_old: sample source script, used for the sanity check and the
        commit message.
    :param _s_new: expected result of ``function(_s_old)``.
    :raises AssertionError: when ``function(_s_old)`` is not ``_s_new``.
    """
    # Explicit raise instead of ``assert`` so the check survives ``python -O``;
    # the exception type is unchanged.
    if not function(_s_old) == _s_new:
        raise AssertionError(
            "function({!r}) does not produce {!r}".format(_s_old, _s_new))

    folder = '/tmp/migrate_script_iemldb'
    if os.path.isdir(folder):
        shutil.rmtree(folder)
    # os.mkdir(folder)

    git_address = "https://github.com/IEMLdev/ieml-language.git"

    # NOTE(review): user name, key paths and signature are hard-coded.
    credentials = pygit2.Keypair('ogrergo', '~/.ssh/id_rsa.pub',
                                 '~/.ssh/id_rsa', None)
    gitdb = GitInterface(origin=git_address,
                         credentials=credentials,
                         folder=folder)
    signature = pygit2.Signature("Louis van Beurden", "*****@*****.**")

    db = IEMLDatabase(folder=folder, use_cache=False)

    # Snapshot descriptors and structure before anything is removed.
    desc = db.get_descriptors()
    struct = db.get_structure()

    to_migrate = {}
    for s in db.get_dictionary().scripts:
        s2 = function(s)
        if s2 != s:
            to_migrate[s] = s2

    print(to_migrate)

    # The closing quote after the second placeholder was missing originally.
    message = "[Translate script] Translate paradigm from '{}' to '{}'".format(
        str(_s_old), str(_s_new))

    with gitdb.commit(signature, message):
        for s_old, s_new in to_migrate.items():
            db.remove_structure(s_old)
            for (_, key), values in struct.get_values_partial(s_old).items():
                for v in values:
                    db.add_structure(s_new, key, v)

            db.remove_descriptor(s_old)
            for (_, lang, d), values in desc.get_values_partial(s_old).items():
                for v in values:
                    db.add_descriptor(s_new, lang, d, v)
def set_descriptors(self, ieml, descriptor, value):
    """Replace the values of one descriptor kind for ``ieml``.

    :param ieml: the ieml to update (validated by ``_check_ieml``).
    :param descriptor: the descriptor kind being replaced.
    :param value: mapping language -> values (validated by
        ``_check_descriptors``).
    :return: ``False`` when the stored values already match ``value`` for
        every language, ``True`` after an update or a removal.
    """
    db = IEMLDatabase(folder=self.gitdb.folder,
                      use_cache=self.use_cache,
                      cache_folder=self.cache_folder)

    ieml = _check_ieml(ieml)
    value = _check_descriptors(value)

    desc = db.get_descriptors()

    current = {}
    for lang in LANGUAGES:
        current[lang] = desc.get_values(ieml=ieml,
                                        language=lang,
                                        descriptor=descriptor)

    unchanged = all(
        sorted(value[lang]) == sorted(current[lang]) for lang in LANGUAGES)
    if unchanged:
        error("No update needed, db already contains {}:{} for {}".format(
            descriptor, json.dumps(value), str(ieml)))
        return False

    def values_after_update(lang, kind):
        # The descriptor being edited takes its new values; others keep theirs.
        if kind != descriptor:
            return desc.get_values(ieml=ieml, language=lang, descriptor=kind)
        return value[lang]

    # test if after modification there is still at least a descriptor
    something_left = any(
        values_after_update(lang, kind)
        for lang in LANGUAGES
        for kind in DESCRIPTORS_CLASS)

    if not something_left:
        removal_message = '[descriptors] Remove {}'.format(str(ieml))
        error(removal_message)
        with self.gitdb.commit(self.signature,
                               '[descriptors] Remove {}'.format(str(ieml))):
            db.remove_descriptor(ieml)
            return True

    update_message = '[descriptors] Update {} for {} to {}'.format(
        descriptor, str(ieml), json.dumps(value))
    with self.gitdb.commit(self.signature, update_message):
        db.remove_descriptor(ieml, None, descriptor)

        for lang in LANGUAGES:
            for entry in value[lang]:
                db.add_descriptor(ieml, lang, descriptor, entry)

        return True
def update_all_ieml(self, f, message: str):
    """Re-key every descriptor in the database through ``f``.

    For each ieml returned by ``db.list(parse=True)``, its descriptors are
    removed and re-added under ``f(ieml)``, all inside one
    ``self.gitdb.commit`` context.

    :param f: callable mapping an old ieml to its new form.
    :param message: free text appended to the commit message.
    """
    db = IEMLDatabase(folder=self.gitdb.folder,
                      use_cache=self.use_cache,
                      cache_folder=self.cache_folder)

    # Snapshot taken before any removal, so values can be read back below.
    desc = db.get_descriptors()

    with self.gitdb.commit(
            self.signature,
            '[IEML migration] Update all ieml in db: {}'.format(message)):

        for old_ieml in tqdm.tqdm(db.list(parse=True), "Migrate all usls"):
            new_ieml = f(old_ieml)

            value = desc.get_values_partial(old_ieml)

            db.remove_descriptor(old_ieml, None, None)

            # get_values_partial is keyed by (ieml, language, descriptor)
            # tuples. The previous nested loops indexed it by language and
            # read the loop variable `e` before it was bound.
            for (_, language, descriptor), entries in value.items():
                for entry in entries:
                    db.add_descriptor(new_ieml, language, descriptor, entry)
def delete_morpheme_paradigm(self, script: Script):
    """Remove the structure and descriptors of a paradigm.

    :param script: the paradigm to delete (validated by ``_check_script``).
    :raises ValueError: when ``script`` is in the dictionary but has a
        length of 1, i.e. is not a paradigm.
    """
    db = IEMLDatabase(folder=self.gitdb.folder,
                      use_cache=self.use_cache,
                      cache_folder=self.cache_folder)
    d = db.get_dictionary()
    descriptors = db.get_descriptors()

    script = _check_script(script)
    if script in d.scripts and len(script) == 1:
        raise ValueError("Script {} is not a paradigm".format(str(script)))

    root = d.tables.root(script)

    def translations_summary(target):
        # "<lang>:<t1>, <t2> / <lang>:..." for the translations currently stored.
        return " / ".join(
            "{}:{}".format(
                l, ', '.join(descriptors.get_values(target, l, 'translations')))
            for l in LANGUAGES)

    # The template used to have two placeholders for four arguments, so the
    # root paradigm information was computed and then silently discarded.
    message = "[dictionary] Remove paradigm {} ({}) for root paradigm {} ({})".format(
        str(script), translations_summary(script),
        str(root), translations_summary(root))

    with self.gitdb.commit(self.signature, message):
        db.remove_structure(script)
        db.remove_descriptor(script)
# Collect, for every USL in the database, its translations per language.
gitdb = GitInterface(origin="https://github.com/plevyieml/ieml-language")
gitdb.pull()  # download database in ~/.cache/ieml/ folder

# instantiate a ieml.ieml_database.IEMLDatabase from the downloaded git repository
db = IEMLDatabase(folder=gitdb.folder)

# usls = db.list(parse=True, type='word')
#
# parsed_usls = list()
# for e in tqdm(usls):
#     parsed_usls.append(get_word_structure(e))
#
# with bz2.open(WORDS_FILENAME + ".bz2", "wt") as fout:
#     json.dump(parsed_usls, fout, indent=2)

descriptors = db.get_descriptors()

usls = db.list()
translations = list()

# The previous check was `e not in translations`, which compared a USL with
# the dicts appended below: it could never fail and cost O(n) per iteration.
# Tracking the USLs already seen makes the duplicate check effective.
seen_usls = set()

for e in tqdm(usls):
    assert e not in seen_usls
    seen_usls.add(e)

    tr_dict = dict()
    values = descriptors.get_values_partial(e)
    for (usl, lang, label), tr_list in values.items():
        # every returned key must belong to the USL that was queried
        assert usl == e
        if label == "translations":
            # at most one translations entry per language
            assert lang not in tr_dict
            tr_dict[lang] = tr_list
    translations.append({"usl": e, "translations": tr_dict})
def create_root_paradigm(self, root, inhibitions, translations, comments):
    """Create a new root paradigm with its descriptors and table headers.

    :param root: the script to declare as root (validated by
        ``_check_script``); it must contain more than one singular sequence.
    :param inhibitions: inhibitions to store (validated by
        ``_check_inhibitions``).
    :param translations: mapping language -> translations (validated by
        ``_check_descriptors``).
    :param comments: mapping language -> comments (validated by
        ``_check_descriptors``).
    :raises ValueError: when ``root`` is not a paradigm, already has
        descriptors, or shares a singular sequence with an existing root.
    """
    db = IEMLDatabase(folder=self.gitdb.folder,
                      use_cache=self.use_cache,
                      cache_folder=self.cache_folder)

    root = _check_script(root)
    if len(root) == 1:
        raise ValueError(
            "The script is not a paradigm {}, can't use it to define a root paradigm."
            .format(str(root)))

    translations = _check_descriptors(translations)
    comments = _check_descriptors(comments)

    # if not already exists (no descriptor no structures)
    if db.get_descriptors().get_values_partial(root):
        raise ValueError(
            "Script {} already exists in dictionary".format(root))

    dictionary = db.get_dictionary()
    for ss in root.singular_sequences:
        try:
            r = dictionary.tables.root(ss)
        except KeyError:
            # Not covered by any existing root paradigm: this is expected.
            continue
        raise ValueError(
            "Root paradigms {} intersection with script {} ".format(
                str(r), str(root)))

    # Summarise the translations being added. The message used to read them
    # back from the database, which was just checked to hold no descriptor
    # for this root, so that part of the message was always empty.
    message = "[dictionary] Create root paradigm {} ({}), create {} singular sequences".format(
        str(root),
        " / ".join(
            "{}:{}".format(l, ', '.join(map(str, translations[l])))
            for l in LANGUAGES),
        len(root.singular_sequences))

    with self.gitdb.commit(self.signature, message):
        db.remove_structure(root, 'is_root')
        db.add_structure(root, 'is_root', True)

        for i in _check_inhibitions(inhibitions):
            db.add_structure(root, 'inhibition', i)

        for l in LANGUAGES:
            for v in translations[l]:
                db.add_descriptor(root,
                                  language=l,
                                  descriptor='translations',
                                  value=v)

            for v in comments[l]:
                db.add_descriptor(root,
                                  language=l,
                                  descriptor='comments',
                                  value=v)

        # add main tables header
        for i, t in enumerate([tt for tt in root.tables_script if tt != root]):
            self.add_morpheme_paradigm(
                t,
                translations=append_idx_to_dict(translations, i),
                comments=append_idx_to_dict(comments, i))
def update_morpheme_paradigm(
        self,
        script_old: Script,
        script_new: Script,
):
    """Move a paradigm definition from ``script_old`` to ``script_new``.

    Structure and descriptors stored for ``script_old`` are re-added under
    ``script_new`` inside one ``self.gitdb.commit`` context. Nothing is done
    when both scripts are equal.

    :param script_old: the paradigm currently defined in the dictionary.
    :param script_new: the paradigm that replaces it; it must not be
        defined yet.
    :raises AssertionError: when a precondition below does not hold.
    :raises ValueError: when a non-root paradigm covers a singular sequence
        that belongs to no root paradigm.
    :raises KeyError: presumably when ``script_old`` has no root paradigm
        (``d.tables.root`` is called on it unguarded) -- confirm.
    """
    script_old = _check_script(script_old)
    script_new = _check_script(script_new)

    if script_old == script_new:
        return

    # NOTE(review): with `or`, this only fails when BOTH scripts have
    # length 1; confirm whether `and` was intended.
    assert len(script_old) != 1 or len(
        script_new) != 1, "Can't update singular sequences, only paradigms"

    db = IEMLDatabase(folder=self.gitdb.folder,
                      use_cache=self.use_cache,
                      cache_folder=self.cache_folder)

    d = db.get_dictionary()
    desc = db.get_descriptors()
    ds = db.get_structure()

    assert script_old in d.scripts, "Source script not defined in dictionary"
    assert script_new not in d.scripts, "Target script already defined in dictionary"

    # root_old is not used afterwards in this function.
    root_old = d.tables.root(script_old)
    is_root = ds.get_values(script_old, 'is_root')
    # Truthy only when a value is stored and its first character is 't'/'T'
    # (presumably the string form of True -- confirm against the structure
    # storage format).
    is_root = is_root and is_root[0][0].lower() == 't'

    # Root paradigms covering the singular sequences of the new script.
    root_new_cand = set()
    for ss in script_new.singular_sequences:
        try:
            root_new_cand.add(d.tables.root(ss))
        except KeyError:
            # Sequences without a root are tolerated only for a root paradigm.
            if not is_root:
                raise ValueError(
                    "A non root paradigm is defined over singular sequences that are in no paradigms"
                )

    assert len(
        root_new_cand
    ) == 1, "No root paradigms or too many for script {}".format(
        str(script_new))
    # root_new is not used afterwards in this function.
    root_new = next(iter(root_new_cand))

    # The template has two placeholders; str.format ignores the third
    # argument (the translations summary), so it is not in the message.
    message = "[dictionary] Update paradigm IEML from {} to {}"\
        .format(str(script_old),
                str(script_new),
                " / ".join(
                    "{}:{}".format(l, desc.get_values(script_new, l, 'translations'))
                    for l in LANGUAGES))

    if is_root:
        # 1st case: root paradigm
        assert script_old in script_new, "Can only update a root paradigm to a bigger version of it"

        # then we can update it to a bigger version of it
        old_structure = ds.get_values_partial(script_old)

    # transfers translations and structure
    with self.gitdb.commit(self.signature, message):
        if is_root:
            # The old script stays defined, marked as non-root; its previous
            # structure entries are copied onto the new script.
            db.remove_structure(script_old)
            db.add_structure(script_old, 'is_root', 'False')

            for (_, key), values in old_structure.items():
                for v in values:
                    db.add_structure(script_new, key, v)
        else:
            db.remove_structure(script_old)
            db.add_structure(script_new, 'is_root', 'False')

        db.remove_descriptor(script_old)

        # `desc` was loaded before the removal above and is read here to
        # copy every descriptor value onto the new script.
        for (_, l, k), values in desc.get_values_partial(script_old).items():
            for v in values:
                db.add_descriptor(script_new, l, k, v)
                if is_root:
                    # Leave a marker on the old root pointing at the new one.
                    db.add_descriptor(
                        script_old, l, k,
                        '(translation migrated to {}) '.format(
                            str(script_new)) + v)
def test_add_remove(self):
    """Pull between two repositories when one edits and the other empties.

    First both sides commit the same value and a pull must report no
    conflict. Then A modifies the ieml while B commits empty descriptors:
    pulling B into A must leave B's (empty) value in place and report A's
    value as a conflict; pulling A into B afterwards must report nothing.
    """

    def descriptor_values(fr_values, en_values):
        # Same per-language values for every descriptor kind, fresh lists each.
        return {
            kind: {'fr': list(fr_values), 'en': list(en_values)}
            for kind in ('translations', 'comments', 'tags')
        }

    # clone two repositories
    gitA, gitB = init_repo(['/tmp/iemldb_test/A', '/tmp/iemldb_test/B'])

    print("Building DB...")
    dbA = IEMLDatabase(folder=gitA.folder)
    dbB = IEMLDatabase(folder=gitB.folder)

    # commit the same value on both sides
    ieml = "(a.)"
    tag = 'A'
    valueA = descriptor_values(['test' + tag], ['test' + tag])
    commit(gitA, dbA, ieml, 'A', valueA)
    commit(gitB, dbB, ieml, 'B', valueA)

    valueB = descriptor_values([], [])

    # make each repository a remote of the other
    gitA.add_remote('B', os.path.join(gitB.folder, '.git'))
    gitB.add_remote('A', os.path.join(gitA.folder, '.git'))

    print("Pulling A from B")
    conflicts = gitA.pull('B')
    print(conflicts)

    dbA = IEMLDatabase(folder=gitA.folder)
    descA = dbA.get_descriptors()
    self.assertDictEqual(descA.get_descriptor(ieml), valueA)
    self.assertDictEqual(conflicts, {})

    # modify ieml in A, and remove it in B
    valueA = descriptor_values(['test2' + tag], ['test' + tag])
    commit(gitA, dbA, ieml, 'A', valueA)
    commit(gitB, dbB, ieml, 'B', valueB)

    print("Pulling A from B")
    conflicts = gitA.pull('B')
    print(conflicts)

    dbA = IEMLDatabase(folder=gitA.folder)
    descA = dbA.get_descriptors()
    self.assertDictEqual(descA.get_descriptor(ieml), valueB)
    self.assertDictEqual(conflicts[ieml], valueA)

    print("Pulling B from A")
    conflicts = gitB.pull('A')
    print(conflicts)

    dbB = IEMLDatabase(folder=gitB.folder)
    descB = dbB.get_descriptors()
    self.assertDictEqual(descB.get_descriptor(ieml), valueB)
    self.assertDictEqual(conflicts, {})
doc.packages.append(Package('xcolor', ['dvipsnames', 'table'])) try: doc.generate_pdf(clean_tex=False, silent=False) doc.generate_tex() except subprocess.CalledProcessError as e: os.chdir( old_cwd) # because pylatex change it but doesnt restore it raise e with open(path + '.pdf', 'rb') as fp: return fp.read() def rendex_latex_word(w: Word, descriptors: Descriptors, language: LANGUAGES): return compile_latex(word_to_latex(w, descriptors, language)) if __name__ == "__main__": gitdb = GitInterface() db = IEMLDatabase(gitdb.folder) ieml = "[E:T:. (E:.b.wa.- E:.-wa.-t.o.-' E:.-'we.-S:.-'t.o.-',)(e.) > E:.n.- (E:.wo.- E:S:.-d.u.-') > E:.d.- (E:.wo.- E:S:.-d.u.-')(m.-S:.U:.-') > ! E:.n.- E:U:. ()]" w = usl(ieml) res = rendex_latex_word(w, db.get_descriptors(), 'en') with open("output.pdf", 'wb') as fp: fp.write(res)
class IemlData:
    """
    Modify the ieml data to output a vector representation
    """

    def __init__(self, input_database_folder_path, out_file_path=None):
        """Open the database and, when requested, the two output files.

        :param input_database_folder_path: folder passed to ``IEMLDatabase``.
        :param out_file_path: optional path of the embeddings file. When
            given, that file is emptied and reopened in append mode, and a
            sibling ``<path without .tsv/.csv>.vocab`` file is opened in
            append mode too.
        """
        from ieml.ieml_database import IEMLDatabase
        # input file
        self.database = IEMLDatabase(folder=input_database_folder_path)

        # Always define the output attributes so later code can test them
        # instead of relying on AttributeError.
        self.out_file_path = out_file_path
        self.vocab_file_path = None
        self.vocab_file = None
        self.out_file = None

        # output file
        if out_file_path is not None:
            # truncate any previous content
            with open(out_file_path, "w") as output_file:
                output_file.write("")
            self.vocab_file_path = "{0}.vocab".format(
                out_file_path.replace(".tsv", "").replace(".csv", ""))
            self.vocab_file = open(self.vocab_file_path, "a")
            self.out_file = open(out_file_path, "a")

    def close_all(self):
        """Close the vocabulary and output files if they are open.

        Safe to call several times, and when no output file was configured.
        """
        for attribute in ("vocab_file", "out_file"):
            # getattr keeps this usable on instances built without __init__.
            handle = getattr(self, attribute, None)
            if handle is not None:
                handle.close()

    def get_word_objects(self):
        """Return the unparsed entries of type ``'word'`` from the database."""
        return self.database.list(parse=False, type='word')

    def list_polymorpheme_of_word(self, w):
        """Return ``(pm_content, pm_flexion)`` for each actor of word ``w``.

        :param w: string form of a word; it is parsed with ``usl``.
        :return: list of tuples, one per syntagmatic function whose
            ``actor`` is not ``None``; empty for the one hard-coded entry
            skipped below.
        """
        ########WORkAROUND############TO BE SOLVED THEN REMOVE########################################3
        if w == "[! E:B:. ()(k.a.-k.a.-' l.o.-k.o.-') > E:.f.- ()(p.E:A:T:.-)] [>role>E:B:.>content>constant>k.a.-k.a.-'":
            return []
        ##################################################################
        w = usl(w)
        assert isinstance(w, Word)
        # sfun.actor can be None, in which case there is no pm_content to
        # read, hence the explicit filter.
        return [(sfun.actor.pm_content, sfun.actor.pm_flexion)
                for sfun in w.syntagmatic_fun.actors.values()
                if sfun.actor is not None]

    def get_natural_lang_meanings(self, lang="en"):
        """Collect natural-language translations for every word.

        :param lang: language whose translations are kept.
        :return: one list per word, made of: one single-item list per
            matching word translation entry (values joined with ``" , "``),
            the separator ``[" : "]``, then the raw translation lists of
            each polymorpheme of the word.
        """
        nl_meanings = []
        descriptors = self.database.get_descriptors()
        for word in self.get_word_objects():
            word_nl_meanings = []
            # get meaning of word
            desc_w_vals = descriptors.get_values_partial(word)
            for (usl_w, language_w,
                 label_w), tr_w_list in desc_w_vals.items():
                if language_w == lang and label_w == "translations":
                    word_nl_meanings.append([" , ".join(tr_w_list)])
            # divide the words from the polymorphemes
            word_nl_meanings.append([" : "])

            # get meaning of polymorpheme
            polymorphemes = self.list_polymorpheme_of_word(word)
            for polymorph in polymorphemes:
                for poly in polymorph:
                    desc_p_vals = descriptors.get_values_partial(poly)
                    for (usl_p, language_p,
                         label_p), tr_p_list in desc_p_vals.items():
                        if language_p == lang and label_p == "translations":
                            word_nl_meanings.append(tr_p_list)
            nl_meanings.append(word_nl_meanings)
        return nl_meanings

    def get_bert_emb(self, string, bert_class):
        """Return ``bert_class.bert([string])``.

        A new ``BertEmbedd`` is created when ``bert_class`` is ``None``.
        """
        bert_class = bert_class if bert_class is not None else BertEmbedd()
        return bert_class.bert([string])

    def make_bert_emb_list(self, lang="en", bert_class=None, dump=False):
        """Embed the natural-language meaning of every word.

        :param lang: language forwarded to ``get_natural_lang_meanings``.
        :param bert_class: embedder exposing ``bert``; a ``BertEmbedd`` is
            created when ``None``.
        :param dump: any value other than ``False`` writes the vocabulary
            and vectors to the files opened in ``__init__``.
        :return: list with the result of ``get_bert_emb`` for each word.
        """
        bert_class = bert_class if bert_class is not None else BertEmbedd()
        bert_embeddings = []
        for ieml_pm_in_nl in self.get_natural_lang_meanings(lang):
            ieml_w_pm_sent = " ".join(
                [" ".join(pm) for pm in ieml_pm_in_nl if len(pm) != 0])
            # yield self.get_bert_emb(ieml_pm_in_nl, bert_class)
            bert_embeddings.append(
                self.get_bert_emb(ieml_w_pm_sent, bert_class))

        # dump the sentence embeddings
        if dump is not False:
            for bert_emb in bert_embeddings:
                for (bert_vocab, bert_vect) in bert_emb:
                    bert_vect = np.array(bert_vect)
                    self.vocab_file.write("{0}\n".format(
                        json.dumps(bert_vocab)))
                    # NOTE(review): json.dumps raises TypeError for a numpy
                    # array, so the fallback below is what normally runs and
                    # writes the repr of the pickled bytes -- confirm this is
                    # the intended file format.
                    try:
                        self.out_file.write("{0}\n".format(
                            json.dumps(bert_vect)))
                    except TypeError:
                        self.out_file.write("{0}\n".format(bert_vect.dumps()))
                    # numpy.save(self.out_file, sent_emb)
        self.close_all()
        return bert_embeddings