def add_row_(_s):
    """M:M:.-O:M:.+M:O:.-E:.-+s.y.-' S:E:.+M:M:.-E:.+O:M:.+M:O:.-E:.-+s.y.-'

    Build migration rows (pairs of [old, new] multiplications) for the
    paradigm *_s*, enumerating its singular sequences along the mode,
    substance and attribute axes.

    NOTE(review): indentation was reconstructed; the final paradigm-level
    row is emitted once, outside the mode loop — confirm against callers.
    """
    s, a, _m = _s.children
    # return m(m(script("S:E:.") + s.children[0]), m(script("E:.") + a.children[0]), _m)
    res = []
    for ss_m in _m.singular_sequences:
        # rows varying the substance's first child, attribute pinned to E:.
        for ss_s in s.children[0].singular_sequences:
            res.append([
                m(m(ss_s), m(script("E:.")), ss_m),
                m(m(ss_s), m(a.children[0]), ss_m)
            ])
        # rows varying the attribute's first child, substance pinned to S:E:.
        for ss_a in a.children[0].singular_sequences:
            res.append([
                m(m(script("S:E:.")), m(ss_a), ss_m),
                m(m(s.children[0]), m(ss_a), ss_m)
            ])
    # single paradigm-level row with the full (non-singular) mode
    res.append([
        m(m(script("S:E:.")), m(script("E:.")), _m),
        m(m(s.children[0]), m(a.children[0]), _m)
    ])
    return res
def add_row_evolution_culturel():
    """
    M:O:.-'F:.-'k.o.-t.o.-', => M:O:.-+S:.-'F:.-'k.o.-t.o.-',

    :return: (to_update, to_remove) — mapping of old script -> new script,
        and a (here always empty) list of scripts to delete.
    """
    src = script("M:O:.-'F:.-'k.o.-t.o.-',")
    s, a, _m = src
    # target root: same attribute/mode, substance extended with S:.-'
    tgt = m(s + script("S:.-'"), a, _m)
    to_update = {}
    to_remove = []
    to_update[src] = tgt
    # for ss_s in s.singular_sequences:
    #     to_update[m(ss_s, a, _m)] = m(ss_s, a, _m)
    for ss_a in a.singular_sequences:
        # NOTE(review): maps the sub-paradigm onto substance S:.-' only,
        # NOT onto s + S:.-' (see the commented alternative below) —
        # confirm this asymmetry is intended.
        to_update[m(s, ss_a, _m)] = m(script("S:.-'"), ss_a, _m)
        # to_update[m(s, ss_a, _m)] = m(s + script("S:.-'"), ss_a, _m)
    # to_add.append()
    # # to_remove.append()
    # # res.append(tgt)
    # for r in tgt.singular_sequences:
    return to_update, to_remove
def translate_ocean(s):
    """Translate scripts of the 'ocean' paradigms.

    Scripts of the first root paradigm are rotated; scripts of the second
    get their attribute and mode swapped; additive scripts are translated
    child by child. Anything else yields None (implicitly), as before.
    """
    if s in script("M:.-'M:.-'n.-T:.A:.-',"):
        return _rotate_sc(s)
    if s in script("s.-S:.U:.-'l.-S:.O:.-'n.-T:.A:.-',"):
        substance, attribute, mode = s
        # swap attribute and mode
        return m(substance, mode, attribute)
    if isinstance(s, AdditiveScript):
        translated = [translate_ocean(child) for child in s]
        return AdditiveScript(children=translated)
def migrate_EOETI(scri):
    """E:.O:.E:T:.+I:.- => S:.O:.E:T:.+I:.- — replace the E:. substance by S:.

    NOTE(review): the original doc read "=> O:.E:T:.+I:.-", but the code
    builds m(script('S:.'), a, _m); the commented assert elsewhere in this
    file also expects S:.O:.E:T:.+I:.-, so the old doc looks stale — confirm.

    Scripts not fully contained in the source root paradigm are returned
    unchanged.
    """
    if not set(scri.singular_sequences).issubset(
            script("E:.O:.E:T:.+I:.-").singular_sequences_set):
        return scri
    s, a, _m = scri.children
    # keep attribute and mode, swap the substance for S:.
    return m(script('S:.'), a, _m)
def normalize_dictionary_file(file, expand_root=False):
    """Load a dictionary YAML file, normalize its sections and re-serialize it.

    :param file: path to the YAML dictionary file.
    :param expand_root: when True, drop semes not in the root paradigm,
        add missing semes, and add missing table-set paradigms.
    :return: the serialized root paradigm document.
    """
    with open(file) as fp:
        # NOTE(review): yaml.load without an explicit Loader is unsafe on
        # untrusted input and deprecated — prefer yaml.safe_load.
        d = yaml.load(fp)
    # default missing/empty sections to empty lists
    d['Semes'] = d['Semes'] if 'Semes' in d and d['Semes'] else []
    d['Paradigms'] = d[
        'Paradigms'] if 'Paradigms' in d and d['Paradigms'] else []
    d['RootParadigm']['inhibitions'] = d['RootParadigm']['inhibitions'] \
        if 'inhibitions' in d['RootParadigm'] and d['RootParadigm']['inhibitions'] else []
    script_root = script(d['RootParadigm']['ieml'])
    # index the root's singular sequences and the file's entries by ieml string
    semes_root = {str(ss): ss for ss in script_root.singular_sequences}
    semes_file = {ss['ieml']: ss for ss in d['Semes']}
    paradigms_file = {p['ieml']: p for p in d['Paradigms']}
    if expand_root:
        # remove extra semes not belonging to the root paradigm
        to_remove = set()
        for ss in d['Semes']:
            if ss['ieml'] not in semes_root:
                to_remove.add(ss['ieml'])
        d['Semes'] = [ss for ss in d['Semes'] if ss['ieml'] not in to_remove]
        # add missing semes with empty translations
        for ss in set(semes_root) - set(semes_file):
            d['Semes'].append({
                'ieml': ss,
                'translations': {
                    'fr': "",
                    'en': ""
                }
            })
        # add table set paradigms
        table_root = table_class(script_root)(script_root, None)
        if isinstance(table_root, TableSet):
            paradigms = {str(t): t for t in table_root.tables}
            # add missing tables
            for ss in set(paradigms) - set(paradigms_file):
                d['Paradigms'].append({
                    'ieml': ss,
                    'translations': {
                        'fr': "",
                        'en': ""
                    }
                })
    # NOTE(review): without expand_root, a seme absent from semes_root
    # would raise KeyError here — confirm inputs are always pre-cleaned.
    d['Semes'] = sorted(d['Semes'], key=lambda ss: semes_root[ss['ieml']])
    d['Paradigms'] = sorted(d['Paradigms'], key=lambda ss: script(ss['ieml']))
    r = _serialize_root_paradigm(d['RootParadigm'],
                                 d['RootParadigm']['inhibitions'],
                                 d['Semes'], d['Paradigms'])
    return r
def _serialize_character_serie(char_serie):
    """Serialize a character serie into the payload expected by _serialize_word.

    :param char_serie: dict with 'groups', 'constants' (may be falsy) and 'name'.
    :return: the serialized word document.
    """
    group_scripts = [script(grp['words'], promote=True)
                     for grp in char_serie['groups']]
    # BUG FIX: the original expression `groups + constants if constants else []`
    # parses as `(groups + constants) if constants else []`, which silently
    # dropped ALL group scripts whenever 'constants' was falsy. Only the
    # constants part is meant to be conditional.
    constant_scripts = ([script(w) for w in char_serie['constants']['words']]
                        if char_serie['constants'] else [])
    mrph = {
        'id': True,
        'ieml': str(word(group_scripts + constant_scripts)),
        'descriptors': {'fr': [char_serie['name']], 'en': []}
    }
    return _serialize_word(mrph)
def migrate_sOOF(scri):
    """s.O:O:.-F:.-' => s.o.-O:O:.-'F:.-',

    Scripts outside the source root paradigm are returned untouched.
    """
    source_root = script("s.O:O:.-F:.-'")
    if not set(scri.singular_sequences).issubset(
            source_root.singular_sequences_set):
        return scri
    substance, attribute, _ = scri.children
    sub_substance, sub_attribute, _ = substance.children
    # deepen the substance by one layer, inserting o. next to its substance
    new_substance = m(m(sub_substance, script('o.')), m(sub_attribute))
    return m(new_substance, m(attribute))
def translate_script(to_translate):
    """
    translate the root paradigms in key in argument, with the function in value

    :param to_translate: mapping of root paradigm (script or str) -> translation
        function applied to the root and every term it contains.
    :return: the new dictionary version created from the latest one.
    """
    version = DictionaryVersion(latest_dictionary_version())
    version.load()
    to_remove = []
    to_add = {
        'terms': [],
        'roots': [],
        'inhibitions': {},
        'translations': {l: {} for l in LANGUAGES}
    }
    for root, func in to_translate.items():
        root = script(root)
        # all terms of the current version contained in this root paradigm
        terms = list(filter(lambda s: s in root, map(script, version.terms)))
        new_root = func(root)
        new_terms = [func(s) for s in terms]
        to_add['terms'].extend(map(str, new_terms))
        to_add['roots'].append(str(new_root))
        # carry the old root's inhibitions over to the translated root
        to_add['inhibitions'].update({str(new_root): version.inhibitions[root]})
        for l in LANGUAGES:
            # carry the natural-language translations over to the new terms
            to_add['translations'][l].update(
                {str(func(s)): version.translations[l][s] for s in terms})
        to_remove.extend(map(str, terms))
    return create_dictionary_version(version, add=to_add, remove=to_remove)
def _add_mode_t(s):
    """O:O:.O:O:.- => O:O:.O:O:.t.-

    Attach the mode ``t.`` to a sequence whose mode slot is empty.
    """
    substance, attribute, mode = s
    # only defined for mode-less scripts
    assert isinstance(mode, NullScript)
    return m(substance, attribute, script('t.'))
def __init__(self, paradigms, structure):
    """Build the dictionary state from a list of paradigms and a structure table.

    :param paradigms: iterable of paradigm strings to parse into scripts.
    :param structure: object exposing a pandas-like ``df`` whose rows are
        ((paradigm, key), (value,)) pairs — presumably the structure DB;
        confirm against caller.
    """
    scripts = {
        s: script(s, factorize=False)
        for s in tqdm(paradigms, "Loading dictionary")
    }
    root_paradigms = []
    inhibitions = defaultdict(list)
    ignored = []
    # read structure flags: inhibitions, root-ness, ignored-ness
    for (p, key), (value, ) in structure.df.iterrows():
        if p not in scripts:
            # print(p)
            continue
        p = scripts[p]
        # register every singular sequence of the paradigm
        for ss in p.singular_sequences:
            scripts[str(ss)] = ss
        if key == 'inhibition':
            inhibitions[p].append(value)
        elif key == 'is_root' and value[0].lower() == 't':
            root_paradigms.append(p)
        elif key == 'is_ignored' and value[0].lower() == 't':
            ignored.append(p)
    # ignore all scripts that are not in a root paradigm
    singular_sequences = set()
    for r in root_paradigms:
        # root paradigms must partition the singular-sequence space
        if any(ss in singular_sequences for ss in r.singular_sequences):
            raise ValueError("Root paradigms overlap with {}".format(
                str(r)))
        singular_sequences |= r.singular_sequences_set
    for s in scripts.values():
        if not s.singular_sequences_set.issubset(singular_sequences):
            ignored.append(s)
    for s in ignored:
        del scripts[str(s)]
        if s in root_paradigms:
            root_paradigms.remove(s)
        if s in inhibitions:
            del inhibitions[s]
    # map of root paradigm script -> inhibitions list values
    self._inhibitions = inhibitions
    self.scripts = np.array(sorted(scripts.values()))
    self.tables = TableStructure(self.scripts, root_paradigms)
    # keep only singular sequences and scripts that made it into a table
    self.scripts = np.array([
        s for s in self.scripts if len(s) == 1 or s in self.tables.tables
    ])
    self.index = {e: i for i, e in enumerate(self.scripts)}
    self.roots_idx = np.zeros((len(self.scripts), ), dtype=int)
    # NOTE(review): if a root paradigm was filtered out of self.scripts
    # just above, this lookup raises KeyError — confirm roots always
    # survive the table filter.
    self.roots_idx[[self.index[r] for r in root_paradigms]] = 1
    self.relations = RelationsGraph(dictionary=self)
def index_of(self, item):
    """Return the ndindex of *item* within the cells grid.

    The script -> index mapping is built lazily on first use and cached
    on the instance.
    """
    if self._index is None:
        mapping = {}
        for idx, cell in np.ndenumerate(self.cells):
            mapping[cell.script] = idx
        self._index = mapping
    return self._index[script(item)]
def __init__(self, scripts: List[str], root_paradigms: List[str],
             translations: Dict[str, Dict[str, str]],
             inhibitions: Dict[str, List[str]],
             comments: Dict[str, Dict[str, str]]):
    """Build the dictionary state from pre-computed string collections.

    :param scripts: all scripts (as strings) of the dictionary.
    :param root_paradigms: scripts of the root paradigms.
    :param translations: language -> {script -> translation}.
    :param inhibitions: root paradigm script -> inhibition names.
    :param comments: language -> {script -> comment}, possibly sparse.
    """
    self.scripts = np.array(sorted(script(s) for s in scripts))
    self.index = {e: i for i, e in enumerate(self.scripts)}

    # list of root paradigms
    self.roots_idx = np.zeros((len(self.scripts),), dtype=int)
    self.roots_idx[[self.index[r] for r in root_paradigms]] = 1

    # scripts to translations
    self.translations = {s: Translations(fr=translations['fr'][s],
                                         en=translations['en'][s])
                         for s in self.scripts}

    # scripts to comments (missing entries default to empty strings)
    self.comments = {s: Comments(fr=comments['fr'][s] if s in comments['fr'] else '',
                                 en=comments['en'][s] if s in comments['en'] else '')
                     for s in self.scripts}

    # map of root paradigm script -> inhibitions list values
    self._inhibitions = inhibitions

    # self.tables = TableStructure
    self.tables = TableStructure(self.scripts, self.roots_idx)
    self.relations = RelationsGraph(dictionary=self)
def add_empty_science_humaine():
    """Extend the 'science humaine' root with two extra paradigms.

    M:M:.-O:M:.+M:O:.-E:.-+s.y.-' is remapped onto
    M:M:.-O:M:.+M:O:.-E:.-+s.y.-' + M:O:.-we.-s.y.-' + M:M:.-we.-s.y.-'

    :return: (to_update, to_add, to_remove)
    """
    src = script("M:M:.-O:M:.+M:O:.-E:.-+s.y.-'")
    tgt = script(
        "M:M:.-O:M:.+M:O:.-E:.-+s.y.-'+M:O:.-we.-s.y.-'+M:M:.-we.-s.y.-'")
    # the two new paradigms introduced by the extended root
    to_add = [script("M:O:.-we.-s.y.-'"), script("M:M:.-we.-s.y.-'")]
    return {src: tgt}, to_add, []
def p_morpheme(self, p):
    """morpheme : MORPHEME"""
    # Parse the token into a Script, honouring the parser's factorize flag.
    parsed = script(p[1], factorize=self.factorize_script)
    # Reject morphemes unknown to the dictionary, when one is configured.
    if self.dictionary is not None:
        if parsed not in self.dictionary:
            raise ValueError(
                "Morpheme {} not defined in dictionary".format(parsed))
    p[0] = parsed
def migrate_EMOto(scri):
    """E:M:O:.t.o.- => E:M:.-O:.-t.o.-'

    Scripts outside the source root paradigm are returned untouched.
    """
    source_root = script("E:M:O:.t.o.-")
    if not set(scri.singular_sequences).issubset(
            source_root.singular_sequences_set):
        return scri
    substance, attribute, mode = scri.children
    sub_s, sub_a, sub_m = substance.children
    # redistribute the substance's children across three new layers
    return m(m(m(sub_s, sub_a)), m(m(sub_m)), m(attribute, mode))
def translate_temps(s):
    """
    t.o. - n.o. - 'M:O:.-',

    Regroup substance and attribute under one multiplication, the mode
    under another. Scripts outside the root are returned unchanged.
    """
    # NOTE(review): the root literal contains a space ("M: O:.-'") — confirm
    # the parser tolerates it / that it is not a typo for "M:O:.-'".
    root = script("t.o.-n.o.-M: O:.-'")
    # NOTE(review): issubset is fed the singular_sequences list here rather
    # than singular_sequences_set as elsewhere — works, but inconsistent.
    if not s.singular_sequences_set.issubset(root.singular_sequences):
        return s
    _s, _a, _m = s
    return m(m(_s, _a), m(_m))
def test_symmetry(self):
    """Every relation must be mirrored by its inverse in the relations db."""
    t = script('wa.')
    relations = self.dictionary.relations.relation_object(t)
    for reltype in RELATIONS:
        inverse = INVERSE_RELATIONS[reltype]
        for tt in relations[reltype]:
            if t in self.dictionary.relations.object(tt, inverse):
                continue
            self.fail(
                'Missing link "%s" --> "%s" (%s) in relations db.' %
                (str(tt), str(t), reltype))
def migrate_EOOMt0(scri):
    """E:.-'O:O:.-M:.t.o.-', => E:.-'O:O:.-M:.-'t.o.-',

    Scripts outside the source root paradigm are returned untouched.
    """
    source_root = script("E:.-'O:O:.-M:.t.o.-',")
    if not set(scri.singular_sequences).issubset(
            source_root.singular_sequences_set):
        return scri
    substance, attribute, _ = scri.children
    attr_s, attr_a, _ = attribute.children
    inner_s, inner_a, inner_m = attr_a.children
    # pull the innermost substance up one layer, push the rest into the mode
    return m(substance, m(attr_s, m(inner_s)), m(m(inner_a, inner_m)))
def p_polymorpheme_path(self, p):
    """polymorpheme_path : POLYMORPHEME_POSITION
                | POLYMORPHEME_POSITION MULTIPLICITY
                | POLYMORPHEME_POSITION SEPARATOR MORPHEME
                | POLYMORPHEME_POSITION MULTIPLICITY SEPARATOR MORPHEME"""
    # The group index comes from the first token in every alternative.
    group_idx = GroupIndex[p[1].upper()]
    if len(p) == 2:
        # position only
        p[0] = PolymorphemePath(group_idx, multiplicity=1)
    elif len(p) == 3:
        # position + multiplicity
        p[0] = PolymorphemePath(group_idx, multiplicity=int(p[2]))
    elif len(p) == 4:
        # position + morpheme
        p[0] = PolymorphemePath(group_idx, script(p[3]), multiplicity=1)
    else:
        # position + multiplicity + morpheme
        p[0] = PolymorphemePath(group_idx, script(p[4]), multiplicity=int(p[2]))
def p_role_path_list(self, p):
    """role_path_list : role_path_list MORPHEME
                | MORPHEME
                | role_path_list ROLE_NAME
                | ROLE_NAME"""
    # Both alternatives need the same token -> Script conversion; the
    # original duplicated the lookup logic in each branch.
    def _to_script(token):
        # named roles resolve through the lookup table; anything else parses
        if token in ROLE_NAMES_TO_SCRIPT:
            return ROLE_NAMES_TO_SCRIPT[token]
        return script(token)

    if len(p) == 2:
        # single element: start a new list
        p[0] = [_to_script(p[1])]
    else:
        # recursive case: extend the accumulated list
        p[0] = p[1] + [_to_script(p[2])]
def _character(semes: List[Union[Script, str]]):
    """Normalize and validate a list of semes for a Character.

    :param semes: iterable of Script instances or parseable script strings.
    :return: the parsed list of scripts; [script('E:')] for an empty input.
    :raises InvalidIEMLObjectArgument: non-iterable input, too many semes,
        non-Script children, or intersecting singular sequences.
    """
    if not semes:
        return [script('E:')]
    try:
        _semes = [script(e) for e in semes]
    except TypeError:
        raise InvalidIEMLObjectArgument(Character, "The root argument %s is not an iterable" % str(semes))

    if len(_semes) > CHARACTER_SIZE_LIMIT:
        raise InvalidIEMLObjectArgument(Character, "Invalid semes count %d, must be lower or equal than %d."
                                        % (len(_semes), CHARACTER_SIZE_LIMIT))

    if any(not isinstance(c, Script) for c in _semes):
        raise InvalidIEMLObjectArgument(Character, "The children of a Topic must be a Word instance.")

    # non-empty singular sequences across all semes must be pairwise disjoint
    singular_sequences = [s for t in _semes for s in t.singular_sequences if not s.empty]
    # NOTE(review): the inner `if not s.empty` filter is redundant — the
    # list above already excludes empty sequences.
    if len(singular_sequences) != len(set(s for s in singular_sequences if not s.empty)):
        raise InvalidIEMLObjectArgument(Character, "Singular sequences intersection in %s." %
                                        str([str(t) for t in _semes]))

    return _semes
def p_word(self, p):
    """word : TERM
            | LBRACKET TERM RBRACKET
            | LBRACKET TERM RBRACKET literal_list"""
    # The term token sits at index 1 (bare TERM) or 2 (bracketed forms).
    try:
        term = script(p[1 if len(p) == 2 else 2])
    except TermNotFoundInDictionary as e:
        raise CannotParse(self._ieml, str(e))

    if len(p) == 5:
        # literal_list present: not supported, warn and drop it
        logging.error(
            "Literals not supported on script for the moments, and are ignored."
        )
    # FIX: both branches of the original if/else assigned p[0] = term;
    # merged into a single assignment.
    p[0] = term
def migrate_tidi(s):
    """t.i.-d.i.-t.+M:O:.-' => t.i.-d.i.-'t.+M:O:.-',

    Regroup substance and attribute under one multiplication, the mode
    under another. Scripts outside the root are returned unchanged.
    """
    root = script("t.i.-d.i.-t.+M:O:.-'")
    if not s.singular_sequences_set.issubset(root.singular_sequences_set):
        return s
    substance, attribute, mode = s
    return m(m(substance, attribute), m(mode))
def ignore_body_parts(gitdb, db):
    """Mark the body-part root paradigm and its contents as ignored, in one commit.

    :param gitdb: git interface exposing a ``commit`` context manager.
    :param db: database exposing ``list`` and ``add_structure``.
    """
    root = script(
        "f.o.-f.o.-',n.i.-f.i.-',M:O:.-O:.-',_+f.o.-f.o.-'E:.-U:.S:+B:T:.-l.-',E:.-U:.M:T:.-l.-'E:.-A:.M:T:.-l.-',_"
    )
    to_ignore = []
    # singular morphemes contained in the root
    for ss in db.list('morpheme', paradigm=False, parse=True):
        if ss in root:
            to_ignore.append(ss)
    # paradigms fully covered by the root's singular sequences
    for p in db.list('morpheme', paradigm=True, parse=True):
        if set(root.singular_sequences).issuperset(p.singular_sequences):
            to_ignore.append(p)
    print(len(to_ignore))
    # flag everything in a single git commit
    with gitdb.commit(
            pygit2.Signature("Louis van Beurden", '*****@*****.**'),
            "[Ignore] ignore body part root paradigm"):
        for s in to_ignore:
            db.add_structure(s, 'is_ignored', True)
def p_flexion_path(self, p):
    """flexion_path : MORPHEME """
    # Wrap the parsed morpheme script in a FlexionPath node.
    morpheme = script(p[1])
    p[0] = FlexionPath(morpheme)
def __getitem__(self, item):
    """Look up *item* (script instance or string) and return the stored script."""
    key = script(item)
    return self.scripts[self.index[key]]
def set_bSU_subst(s):
    """Replace the substance with b.-S:.U:.-', keeping attribute and mode."""
    _, attribute, mode = s
    return m(script("b.-S:.U:.-'"), attribute, mode)
# _up, _rem = get_competence_en_curr_data_diff()
# update = {'terms': {
#     **{str(s.script): str(translate_mouvements_et_milieux(s.script)) for s in term("i.f.B:.-+u.f.M:.-O:.-'").relations.contains},
#     **_up
# },
# 'remove': {
#     *_rem
# }
# }
# print('\n'.join("{} => {}".format(a, b) for a, b in _up.items()))
# version = create_dictionary_version(latest_dictionary_version(), update=update, remove=_rem)
# upload_to_s3(version)
# print(version)

root = "O:.M:.-M:.-'"
# FIX: the original line read
#   translator = translate_ecosystem_intl_col_tern "dictionary_2018-06-08_17:07:06"
# which is a SyntaxError (stray string literal after the name, presumably a
# paste accident — the version string is already used below).
translator = translate_ecosystem_intl_col_tern
print(str(translator(script(root))))
d = Dictionary("dictionary_2018-06-08_17:07:06")
# translate_update("s.u.-'O:M:.-'O:.-',+s.u.-'M:O:.-O:.-'M:.-',", translate_formes_visuelles)
diff = {
    **{str(s.script): str(translate_competence_en_curr_data(s.script))
       for s in term("M:.-O:.-'M:.-wa.e.-'t.-x.-s.y.-',", d).relations.contains},
    # **{str(s.script): str(translate_ecosystem_intl_col_tern(s.script)) for s in term("O:.M:.-M:.-'", d).relations.contains},
}
import json
# NOTE: indent=True is equivalent to indent=1 (bool is an int)
print(json.dumps(diff, indent=True))
version = create_dictionary_version(None, diff=diff)
# upload_to_s3(version)
print(version)
def translate_tisse_intl_col(s):
    """O:M:.-O:M:.-we.h.-' => O:M:.-'O:M:.-'s.o.-k.o.-',

    NOTE(review): the mode literal in the code is "s.o.-k.o.-'" while the
    doc above ends in ',' — confirm which is intended.
    """
    substance, attribute, _ = s
    # wrap substance and attribute each in their own layer, replace the mode
    return m(m(substance), m(attribute), script("s.o.-k.o.-'"))
def get_script(wid):
    """Resolve a usl document id to its migrated Script."""
    document = usls_c.find_one({'_id': wid})
    assert document, "Invalid id " + str(wid)
    raw = document['script']
    return script(migrate_morpheme(raw))
def translate_noetic(s):
    """M:.O:.-O:.O:.-B:.T:.n.-' => s.M:O:.O:O:.-"""
    substance, attribute, _ = s
    # pair the first grandchildren of substance and attribute respectively
    new_attribute = m(substance.children[0].children[0],
                      substance.children[1].children[0])
    new_mode = m(attribute.children[0].children[0],
                 attribute.children[1].children[0])
    return m(script('s.'), new_attribute, new_mode)
def import_old_series(lex, desc, dictionary):
    """Import legacy morpheme series from the mongo 'intlekt' database.

    Paradigms are added to *lex*; French/English descriptors to *desc*.
    Series or sub-series referencing scripts unknown to *dictionary* are
    reported on stderr and skipped.
    """
    from pymongo import MongoClient
    db_mongo = MongoClient()['intlekt']
    ms_c = db_mongo['morphemes_series']
    usls_c = db_mongo['usl']

    def get_script(wid):
        # fetch a usl document by id and parse its migrated script
        res = usls_c.find_one({'_id': wid})
        assert res, "Invalid id " + str(wid)
        return script(migrate_morpheme(res['script']))

    def get_word(wid):
        # fetch a usl document by id (may be None)
        res = usls_c.find_one({'_id': wid})
        return res

    for ms in ms_c.find():
        print(ms['name'], ms['_id'])
        # constant part of the polymorpheme (optional in old documents)
        if 'constants' in ms:
            constant = list(map(get_script, ms['constants']['words']))
        else:
            constant = []
        # variable groups, each with a multiplicity (None -> 1)
        groups = []
        for g in ms['groups']:
            mult = g['multiplicity']
            if mult is None:
                mult = 1
            groups.append((tuple(map(get_script, g['words'])), mult))
        ms_n = PolyMorpheme(constant=constant, groups=groups)
        # skip series referencing scripts not in the dictionary
        _break = False
        for morph in chain.from_iterable([constant, *(g[0] for g in groups)]):
            if morph not in dictionary.scripts:
                print("Error in {}, script not defined {}".format(
                    str(ms_n), str(morph)),
                      file=sys.stderr)
                _break = True
        if _break:
            continue
        # the series name encodes "domain: label"
        domain = DOMAINS[ms['name'].split(':', 1)[0]]
        lex.add_paradigm(ms_n, domain=domain)
        desc.set_value(ms_n, 'fr', 'translations',
                       [ms['name'].split(':', 1)[1].strip()])
        # import the cached singular sub-series of this paradigm
        for ms_ss in ms['morphemes_cache']['_value']['morphemes']:
            constant = [script(migrate_morpheme(w)) for w in ms_ss['words']]
            if len(constant) == 1:
                continue
            ms_ss_n = PolyMorpheme(constant=constant)
            _break = False
            for morph in chain.from_iterable(
                    [ms_ss_n.constant, *(g[0] for g in ms_ss_n.groups)]):
                if morph not in dictionary.scripts:
                    print("Error in {}, script not defined {}".format(
                        str(ms_ss_n), str(morph)),
                          file=sys.stderr)
                    _break = True
            if _break:
                # NOTE(review): this `break` abandons ALL remaining
                # sub-series of the paradigm, whereas the outer loop uses
                # `continue` — confirm this asymmetry is intended.
                break
            # print(ms_ss)
            # print(str(ms_ss_n))
            if 'id' in ms_ss:
                w = get_word(ms_ss['id'])
                if not w:
                    print("Error in {}, word not defined {}".format(
                        str(ms_ss_n), str(ms_ss['id'])),
                          file=sys.stderr)
                    continue
                if 'descriptors' in w:
                    # comma-separated descriptor strings are split into lists
                    if 'fr' in w['descriptors']:
                        desc.set_value(
                            ms_ss_n, 'fr', 'translations',
                            list(
                                chain.from_iterable(
                                    (dd.split(',')
                                     for dd in w['descriptors']['fr']))))
                    if 'en' in w['descriptors']:
                        desc.set_value(
                            ms_ss_n, 'en', 'translations',
                            list(
                                chain.from_iterable(
                                    (dd.split(',')
                                     for dd in w['descriptors']['en']))))
def _fix_typo(s):
    """M:.-O:.-'M:.-wa.e.-'t.x.-s.y.-', => M:.-O:.-'M:.-wa.e.-'t.-x.-s.y.-',

    Only the mode changes (t.x.- becomes t.-x.-).
    """
    substance, attribute, _ = s
    return m(substance, attribute, script("t.-x.-s.y.-'"))
def _insert_attr_f(s):
    """O:O:.F:.- => O:O:.f.F:.-

    Insert ``f.`` as the attribute and shift the old attribute into the
    mode slot; only defined for mode-less scripts.
    """
    substance, old_attribute, mode = s
    assert isinstance(mode, NullScript)
    return m(substance, script('f.'), old_attribute)
def translate_ecosystem_intl_col(s):
    """O:.M:.- => s.o.-k.o.-'M:O:.-',"""
    substance, attribute, _ = s
    # swap the first children of attribute and substance, nested two layers deep
    swapped = m(attribute.children[0], substance.children[0])
    return m(script("s.o.-k.o.-'"), m(m(swapped)))
from collections import defaultdict
import re
from itertools import chain, count
from typing import List, Set

from ieml.constants import AUXILIARY_CLASS
from ieml.dictionary.script import script, Script

# Type scripts identifying the three syntagmatic-function kinds.
SYNTAGMATIC_FUNCTION_PROCESS_TYPE_SCRIPT = script('E:.b.E:S:.-')
SYNTAGMATIC_FUNCTION_ACTANT_TYPE_SCRIPT = script('E:.b.E:B:.-')
SYNTAGMATIC_FUNCTION_QUALITY_TYPE_SCRIPT = script('E:.b.E:T:.-')

# Process : grammatical role (valence)
ONE_ACTANT_PROCESS = script('E:S:.')
TWO_ACTANTS_PROCESS = script('E:T:.')
THREE_ACTANTS_PROCESS = script('E:B:.')
ADDRESS_PROCESS_VALENCE_SCRIPTS = [
    ONE_ACTANT_PROCESS, TWO_ACTANTS_PROCESS, THREE_ACTANTS_PROCESS
]  # process

# Process : mandatory address
# Voices allowed per valence: one-actant processes only take active/passive;
# two- and three-actant processes take the whole O:O: voice paradigm.
ADDRESS_PROCESS_VOICES_SCRIPTS = {
    ONE_ACTANT_PROCESS: {
        script("E:.-wa.-t.o.-'"),  # Actif
        script("E:.-wo.-t.o.-'"),
    },  # Passif
    TWO_ACTANTS_PROCESS: script("E:.-O:O:.-t.o.-'").singular_sequences_set,
    THREE_ACTANTS_PROCESS: script("E:.-O:O:.-t.o.-'").singular_sequences_set
}
# Sanity check: one-actant voices must be a subset of the full voice paradigm.
# NOTE(review): module-level assert is stripped under `python -O`.
assert all(e in ADDRESS_PROCESS_VOICES_SCRIPTS[THREE_ACTANTS_PROCESS]
           for e in ADDRESS_PROCESS_VOICES_SCRIPTS[ONE_ACTANT_PROCESS])
""" root = script("t.i.-d.i.-t.+M:O:.-'") if not s.singular_sequences_set.issubset(root.singular_sequences_set): return s _s, a, _m = s return m(m(_s, a), m(_m)) # TODO : # M:M:.-O:M:.+M:O:.-E:.-+s.y.-‘ => if __name__ == '__main__': migrate(migrate_tidi, script("t.i.-d.i.-t.+M:O:.-'"), script("t.i.-d.i.-'t.+M:O:.-',")) # assert migrate_EOETI(script("E:.O:.E:T:.+I:.-")) == script("S:.O:.E:T:.+I:.-") # # folder = '/tmp/migrate_script_iemldb' # if os.path.isdir(folder): # shutil.rmtree(folder) # # os.mkdir(folder) # git_address = "https://github.com/IEMLdev/ieml-language.git" # # credentials = pygit2.Keypair('ogrergo', '~/.ssh/id_rsa.pub', '~/.ssh/id_rsa', None) # gitdb = GitInterface(origin=git_address, # credentials=credentials, # folder=folder)