def get_list(self):
    """Summarise every parsable IEML found in the descriptors table.

    Each row of ``self.get_descriptors().df`` is unpacked as
    ``(ieml, lang, desc), (v,)``. The first time an ``ieml`` string is met it
    is parsed and a record describing it is created; IEML strings that raise
    ``CannotParse`` are skipped. Every row then appends its value to
    ``record[desc][lang]``.

    :return: the list of records, sorted by their ``'index'`` entry.
    """
    dictionary = self.get_dictionary()
    parser = IEMLParser(dictionary=dictionary)
    records = {}

    rows = tqdm(self.get_descriptors().df.iterrows(),
                "List all descriptors at {}".format(self.folder))
    for (ieml, lang, desc), (value, ) in rows:
        if ieml not in records:
            try:
                parsed = parser.parse(ieml)
            except CannotParse:
                # Unparsable entries never get a record.
                continue

            # The stored string is expected to already be in normal form.
            assert str(parsed) == ieml

            type_index, rank = get_index(parsed, dictionary)
            if type_index == 0 and isinstance(parsed, PolyMorpheme):
                # Describe the first constant rather than the polymorpheme.
                parsed = parsed.constant[0]

            record = {
                'ieml': str(parsed),
                'type': TYPES[type_index],
                'paradigm': len(parsed) != 1,
                'class': GRAMMATICAL_CLASS_NAMES[
                    parsed.grammatical_class].lower().capitalize(),
                'index': rank,
            }

            if parsed.cardinal == 1:
                cardinality = 'singular_sequence'
            elif isinstance(parsed, Script) and \
                    parsed in dictionary.tables.roots:
                cardinality = 'root_paradigm'
            else:
                cardinality = 'paradigm'
            record['cardinality'] = cardinality
            record['domains'] = []

            records[ieml] = record

        entry = records[ieml]
        if desc not in entry:
            entry[desc] = {l: [] for l in LANGUAGES}
        entry[desc][lang].append(value)

    ordered = [record for record in records.values()]
    ordered.sort(key=lambda e: e['index'])
    return ordered
def test_parse_flexion(self):
    """A polymorpheme followed by one empty-valued decoration on a flexion path.

    Checks that the parser yields an ``InstancedUSL`` wrapping a
    ``PolyMorpheme``, carrying exactly one decoration whose path is a
    ``FlexionPath`` and whose value is the empty string.
    """
    p = "E:S:.-U:.-t.o.-' [>E:S:.-U:.-t.o.-' \"\"]"
    parser = IEMLParser()
    res = parser.parse(p)

    self.assertIsInstance(res, InstancedUSL)
    self.assertIsInstance(res.usl, PolyMorpheme)

    # assertEqual, not assertTrue: assertTrue(x, 1) treats 1 as the failure
    # message and passes for any non-empty list of decorations.
    self.assertEqual(len(res.decorations), 1)

    self.assertIsInstance(res.decorations[0].path, FlexionPath)
    self.assertEqual(res.decorations[0].value, '')
def test_parse_empty_path(self):
    """A polymorpheme decorated on a ``>group_1>...`` path with an empty value.

    Checks that the parser yields an ``InstancedUSL`` wrapping a
    ``PolyMorpheme`` with exactly one decoration, that the decoration value
    is the empty string, and that its path equals the ``PolymorphemePath``
    produced by ``PathParser`` for the same path string.
    """
    p = "T: l.-T:.U:.-',n.-T:.A:.-',t.o.-f.o.-',_ m1(S:.E:A:T:.- T:.E:A:S:.-) m1(p.E:S:B:.- s.-S:.U:.-') [>group_1>s.-S:.U:.-' \"\"]"
    parser = IEMLParser()
    res = parser.parse(p)

    self.assertIsInstance(res, InstancedUSL)
    self.assertIsInstance(res.usl, PolyMorpheme)

    # assertEqual, not assertTrue: assertTrue(x, 1) treats 1 as the failure
    # message and passes for any non-empty list of decorations.
    self.assertEqual(len(res.decorations), 1)
    self.assertEqual(res.decorations[0].value, '')

    path = PathParser().parse(">group_1>s.-S:.U:.-'")
    self.assertIsInstance(path, PolymorphemePath)
    self.assertEqual(res.decorations[0].path, path)
def _process_line(self, lines_iter, parse=False):
    """Return the first non-blank line of ``lines_iter``, decoded as UTF-8.

    :param lines_iter: iterable of byte strings.
    :param parse: when true, the decoded line is parsed with an
        ``IEMLParser`` built on ``self.get_dictionary()`` and the parse
        result is returned instead of the text.
    :return: the decoded (or parsed) first non-blank line, or ``None`` when
        every line is blank or the iterable is empty.
    """
    parser = IEMLParser(dictionary=self.get_dictionary()) if parse else None

    for raw in lines_iter:
        stripped = raw.strip()
        if not stripped:
            continue

        text = stripped.decode('utf8')
        # Only the first non-blank line is ever consumed.
        return parser.parse(text) if parse else text
def path_of(self, _ieml, descriptor=True, mkdir=False, normalize=True):
    """Compute the file path under ``self.folder`` that stores ``_ieml``.

    The path is
    ``<folder>/<class folder>/<singular|paradigm>/<filename prefix>/<filename><ext>``
    where the class folder and prefix length come from
    ``self.CLASS_TO_FOLDER`` (keyed by the class of the USL, or of the
    wrapped ``usl`` for an ``InstancedUSL``).

    :param _ieml: an IEML string (parsed with ``IEMLParser``) or an already
        parsed object.
    :param descriptor: choose the ``.desc`` extension when true, ``.ieml``
        otherwise.
    :param mkdir: create the containing directory if needed.
    :param normalize: derive the file name from the parsed object when true,
        from the raw ``_ieml`` argument otherwise.
    :return: the full path of the file.
    """
    parsed = IEMLParser().parse(_ieml) if isinstance(_ieml, str) else _ieml
    extension = '.desc' if descriptor else '.ieml'

    # Decorated USLs are filed under the class of the USL they wrap.
    keyed_object = parsed.usl if isinstance(parsed, InstancedUSL) else parsed
    class_folder, prefix_size = self.CLASS_TO_FOLDER[keyed_object.__class__]

    filename = self.filename_of(parsed if normalize else _ieml)

    directory = os.path.join(
        self.folder,
        class_folder,
        'singular' if len(parsed) == 1 else 'paradigm',
        filename[:prefix_size],
    )
    if mkdir:
        os.makedirs(directory, exist_ok=True)

    return os.path.join(directory, filename + extension)
def list(
        self,
        type=None,
        paradigm=None,
        parse=False,
):
    """List the IEML strings recorded in the ``.desc`` files of the database.

    Runs the external pipeline ``find | xargs cat | cut | uniq`` from the
    selected directory: every ``*.desc`` file is concatenated, the second
    ``"``-delimited field of each line is kept, and adjacent duplicate lines
    are collapsed (``uniq`` only merges consecutive duplicates).

    :param type: optional sub-folder to restrict the search to; either a
        string or a class, in which case its lower-cased ``__name__`` is used.
    :param paradigm: when ``type`` is given and this is not ``None``, further
        restrict to the ``'paradigm'`` (true) or ``'singular'`` (false)
        sub-folder.
    :param parse: when true, parse each string with an ``IEMLParser`` built
        on ``self.get_dictionary()``; strings raising ``CannotParse`` are
        reported through ``error`` and left out.
    :return: a list of strings, or of parsed objects when ``parse`` is true.
    """
    root = self.folder
    if type:
        if not isinstance(type, str):
            type = type.__name__.lower()
        root = os.path.join(root, type)

        if paradigm is not None:
            root = os.path.join(root, 'paradigm' if paradigm else 'singular')

    pipeline = (
        ["find", "-path", "*.desc", "-print0"],
        ["xargs", "-0", "cat"],
        ["cut", "-f2", '-d', '"'],
        ["uniq"],
    )

    processes = []
    upstream = None
    for command in pipeline:
        process = subprocess.Popen(command,
                                   stdin=upstream,
                                   stdout=subprocess.PIPE,
                                   cwd=root)
        if upstream is not None:
            # The child owns its end of the pipe now; closing ours lets the
            # upstream process get SIGPIPE if the downstream one exits early
            # and avoids leaking the descriptor.
            upstream.close()
        processes.append(process)
        upstream = process.stdout

    # communicate() drains and closes the last pipe and reaps the last
    # process; the earlier stages are reaped explicitly so none is left as a
    # zombie.
    output, _ = processes[-1].communicate()
    for process in processes[:-1]:
        process.wait()

    res = [
        s.strip().decode('utf8') for s in output.split(b'\n') if s.strip()
    ]

    if parse:
        parser = IEMLParser(dictionary=self.get_dictionary())
        _res = []
        for s in res:
            try:
                _res.append(parser.parse(s))
            except CannotParse as e:
                error("Cannot parse {} : {}".format(s, repr(e)))
        return _res

    return res
def usl_from_path_values(paths_values):
    """Build a USL from an iterable of ``(path, value)`` pairs.

    Every path is run through ``PathParser.parse`` and every value through
    ``IEMLParser.parse``; values sharing the same parsed path are grouped.
    The paths are then arranged into a tree, one level per path segment
    (``no_child_clone()`` of the path, then its ``child``), and the tree is
    folded bottom-up: at each level the class of the paths found there
    builds the node with ``build_usl_from_path_to_node``.

    :param paths_values: iterable of ``(path, value)`` pairs. It is consumed
        in a single pass, so one-shot iterables are accepted.
    :return: the node built for the root of the tree.
    :raises ValueError: if two paths disagree on the path class used at the
        same level of the tree.
    """
    from ieml.usl.decoration.parser.parser import PathParser
    from ieml.usl.parser import IEMLParser

    path_parser = PathParser()
    usl_parser = IEMLParser()

    # Single pass: each path is parsed once and generators are supported.
    path_to_value = defaultdict(set)
    for raw_path, raw_value in paths_values:
        path_to_value[path_parser.parse(raw_path)].add(
            usl_parser.parse(raw_value))

    def make_tree():
        # Auto-vivifying nested mapping: missing keys create a sub-tree.
        return defaultdict(make_tree)

    bins = make_tree()

    def recursive_group_by(level, path, values):
        """Insert ``values`` at the position described by ``path``."""
        p_cloned = path.no_child_clone()

        if 'type' in level:
            if not isinstance(path, level['type']):
                raise ValueError("Inconsistent path system")
        else:
            level['type'] = path.__class__

        if path.child is None:
            level[p_cloned]["node"] = values
        else:
            recursive_group_by(level[p_cloned], path.child, values)

    def build_nodes(level):
        """Return the node of ``level``, building it from its children."""
        if 'node' not in level:
            path_to_node = {}
            for p, child_level in level.items():
                # Skip the bookkeeping 'type' entry; only path keys are
                # children.
                if isinstance(p, UslPath):
                    path_to_node[p] = build_nodes(child_level)

            assert 'type' in level
            level['node'] = level['type'].build_usl_from_path_to_node(
                path_to_node)

        return level['node']

    for p, values in path_to_value.items():
        recursive_group_by(bins, p, [v for v in values])

    return build_nodes(bins)
def test_invalid_cannot_parse_polymorpheme(self):
    """Each listed source must make ``IEMLParser.parse`` raise ``CannotParse``."""
    invalid_sources = [
        "U: wa. m1()",
        # Disabled cases:
        # "m1(wo. wa.)m1(U:)",
        # "m4(U: S: E: T:)",
    ]

    for source in invalid_sources:
        with self.assertRaises(CannotParse):
            IEMLParser().parse(source)
def migrate(database, out_folder):
    """Rewrite ``database`` into an ``IEMLDatabase`` rooted at ``out_folder``.

    Any existing ``descriptors`` and ``structure`` sub-folders of
    ``out_folder`` are removed first. Then every row of the dictionary
    structure is re-added through ``add_structure`` (as a root, plus one
    entry per inhibition), and the descriptors are written, one file per
    IEML at ``db2.path_of(...)``, one line per value formatted as
    ``"<ieml>" <lang> <desc> "<escaped value>"``.

    :param database: source object exposing ``descriptors()`` and
        ``dictionary_structure()``.
    :param out_folder: destination folder; created if it does not exist.
    """
    descriptors = database.descriptors()
    dictionary = database.dictionary_structure()
    # 'root', 'paradigms', 'inhibitions'

    # Only remove what is there: an unconditional rmtree raises on a fresh
    # output folder, which made the "create if missing" step below
    # unreachable.
    for sub_folder in ('/descriptors', '/structure'):
        stale = out_folder + sub_folder
        if os.path.isdir(stale):
            shutil.rmtree(stale)

    db2 = IEMLDatabase(out_folder)

    if not os.path.isdir(out_folder):
        os.mkdir(out_folder)

    # One parser is enough for both passes.
    parser = IEMLParser()

    for ieml, (paradigms, inhibitions) in tqdm.tqdm(
            dictionary.structure.iterrows(), 'migrating structure'):
        l = parser.parse(ieml, factorize_script=True)

        db2.add_structure(str(l), 'is_root', True)
        for i in inhibitions:
            db2.add_structure(str(l), 'inhibition', i)

    # ieml -> (lang, desc) -> values
    all_db = defaultdict(lambda: defaultdict(dict))
    for (ieml, lang, desc), (v) in descriptors:
        all_db[ieml][(lang, desc)] = v.values[0]

    for ieml, dd in tqdm.tqdm(all_db.items(), 'migrating descriptors'):
        l = parser.parse(ieml, factorize_script=True)

        path = db2.path_of(l)
        os.makedirs('/'.join(path.split('/')[:-1]), exist_ok=True)

        # Written as UTF-8 explicitly instead of relying on the platform's
        # default encoding.
        with open(path, 'w', encoding='utf8') as fp:
            for (lang, desc), v in dd.items():
                for vv in v:
                    fp.write('"{}" {} {} "{}"\n'.format(
                        str(l), lang, desc, db2.escape_value(vv)))
def normalize_key(ieml,
                  key,
                  value,
                  parse_ieml=False,
                  partial=False,
                  structure=False):
    """Validate and stringify an ``(ieml, key, value)`` triple.

    :param ieml: IEML object or string; converted with ``str`` when truthy.
    :param key: a structure key when ``structure`` is true (must be in
        ``STRUCTURE_KEYS``), otherwise a language (must be in ``LANGUAGES``).
    :param value: for a structure, an inhibition (must be in
        ``INHIBITABLE_RELATIONS``) or a JSON boolean for ``is_root`` /
        ``is_ignored``; otherwise a descriptor (must be in
        ``DESCRIPTORS_CLASS``).
    :param parse_ieml: re-serialise ``ieml`` through ``IEMLParser``.
    :param partial: allow missing components; only the truthy ones are
        checked.
    :param structure: validate as a structure entry instead of a descriptor.
    :return: the normalised ``(ieml, key, value)`` tuple.
    :raises ValueError: on a missing component (when not ``partial``) or an
        unsupported key/value.
    """
    if not partial and not (ieml is not None and key and
                            (structure or value)):
        raise ValueError("IEML and Key can't be null")

    if ieml:
        ieml = str(ieml)
        if parse_ieml:
            ieml = str(IEMLParser().parse(str(ieml)))

    if not structure:
        # Descriptor entry: key is a language, value a descriptor class.
        if key:
            key = str(key)
            if key not in LANGUAGES:
                raise ValueError("Unsupported language: '{}'".format(str(key)))

        if value:
            value = str(value)
            if value not in DESCRIPTORS_CLASS:
                raise ValueError("Unsupported descriptor: '{}'".format(
                    str(value)))

        return ieml, key, value

    # Structure entry.
    if key:
        key = str(key)
        if key not in STRUCTURE_KEYS:
            raise ValueError("Unsupported structure key: '{}'".format(
                str(key)))

    if value:
        if key and key == 'inhibition':
            value = str(value)
            if value not in INHIBITABLE_RELATIONS:
                raise ValueError("Unsupported inhibition: {}".format(value))

        if key and key in ['is_root', 'is_ignored']:
            # Accepts True/False as well as 'true'/'false' in any case.
            value = json.loads(str(value).lower())
            if not isinstance(value, bool):
                raise ValueError(
                    "is_root or is_ignored field only accept boolean, not {}"
                    .format(value))
            value = str(value)

    return ieml, key, value
def test_invalid_cannot_check_polymorpheme(self):
    """Sources that parse to a ``PolyMorpheme`` but fail ``check_polymorpheme``."""
    rejected_sources = [
        "U: wa. m0(U:)",
        # Disabled case:
        # "m1(wo. wa.) m1(U:)",
        "m4(U: S: E: T:)",
    ]

    for source in rejected_sources:
        parsed = IEMLParser().parse(source)
        assert isinstance(parsed, PolyMorpheme)

        with self.assertRaises(ValueError):
            check_polymorpheme(parsed)
def test_usl_from_path_pm(self):
    """Rebuild a polymorpheme from ``>constant`` paths and check its string form."""
    raw_pairs = [
        (">constant>b.-S:.A:.-'S:.-'S:.-',", "b.-S:.A:.-'S:.-'S:.-',"),
        (">constant>k.a.-k.a.-'", "k.a.-k.a.-'"),
        (">constant", "U:"),
        (">constant", "E:"),
    ]

    parse_usl = IEMLParser().parse
    parse_path = PathParser().parse

    parsed_pairs = []
    for raw_path, raw_usl in raw_pairs:
        parsed_pairs.append((parse_path(raw_path), parse_usl(raw_usl)))

    rebuilt = usl_from_path_values(parsed_pairs)

    self.assertEqual(str(rebuilt), "U: k.a.-k.a.-' b.-S:.A:.-'S:.-'S:.-',")
def test_usl_from_path_flexion_paradigm(self):
    """Two values on ``>flexion`` plus one constant, checked by string form."""
    raw_pairs = [
        (">flexion", "E:.wo.U:.-t.o.-'"),
        (">flexion", "E:.wo.A:.-t.o.-'"),
        (">content>constant", "U:"),
    ]

    parse_usl = IEMLParser().parse
    parse_path = PathParser().parse

    parsed_pairs = []
    for raw_path, raw_usl in raw_pairs:
        parsed_pairs.append((parse_path(raw_path), parse_usl(raw_usl)))

    rebuilt = usl_from_path_values(parsed_pairs)

    self.assertEqual(str(rebuilt),
                     "(m1(E:.wo.U:.-t.o.-' E:.wo.A:.-t.o.-'))(U:)")
def test_polymorpheme(self):
    """Valid polymorphemes round-trip through the parser.

    For each source: the parsed object prints back to the same string, is a
    ``PolyMorpheme``, and its singular sequences all have cardinal 1, pass
    ``check()`` and are pairwise distinct.
    """
    sources = [
        "U: wo. wa.",
        "U: m2(wo. wa.)",
        "m1(U:) m1(S:)",
        "o. m1(U: S:) m2(t. m.)",
        "o. m2(A: S: B: T:) m2(y. t.)",
    ]

    for source in sources:
        parsed = IEMLParser().parse(source)

        assert str(parsed) == str(source), "{} != {}".format(
            str(parsed), str(source))
        assert isinstance(parsed, PolyMorpheme)

        seen = set()
        for sequence in parsed.singular_sequences:
            assert sequence.cardinal == 1
            sequence.check()

            assert sequence not in seen
            seen.add(sequence)
def test_word(self):
    """Word sources parse to ``Word`` objects with well-formed singular sequences.

    Every singular sequence must have cardinal 1, pass ``check()``, be
    distinct from the previous ones and itself be a ``Word``.
    """
    word_sources = [
        "[! E:A:. ()(b.-S:.A:.-'S:.-'S:.-', m1(S: B: T:)) > E:A:. E:A:. ()(k.a.-k.a.-')]",
        "[! E:S:. (m1(E:.-',b.-S:.U:.-'y.-'U:.-',_ E:.-',b.-S:.U:.-'y.-'A:.-',_))(wa.) > E:.n.- ()(n.i.-s.i.-') > E:.f.- (E:U:S:.)]",
    ]

    for source in word_sources:
        word = IEMLParser().parse(source)
        assert isinstance(word, Word)

        seen = set()
        for sequence in word.singular_sequences:
            assert sequence.cardinal == 1
            sequence.check()

            assert sequence not in seen
            seen.add(sequence)

            assert isinstance(sequence, Word)
def ieml(arg, dictionary):
    """Cast ``arg`` to an IEML object.

    - a ``Usl`` is returned unchanged;
    - a string is parsed with ``IEMLParser(dictionary)``;
    - a ``Script`` is wrapped in a ``Word``.

    :param arg: the value to convert.
    :param dictionary: dictionary handed to the parser for string arguments.
    :return: the resulting IEML object.
    :raises InvalidIEMLObjectArgument: if a string argument cannot be parsed.
    :raises NotImplementedError: if ``arg`` is of an unsupported type.
    """
    if isinstance(arg, Usl):
        return arg

    if isinstance(arg, str):
        try:
            return IEMLParser(dictionary).parse(arg)
        except CannotParse as e:
            # Keep the parser error attached as the cause.
            raise InvalidIEMLObjectArgument(Usl, str(e)) from e

    if isinstance(arg, Script):
        arg = Word(arg)
        # NOTE(review): `dictionary_version` is not defined in this function;
        # it must come from the enclosing module, otherwise this branch
        # raises NameError - confirm.
        if arg.dictionary_version != dictionary_version:
            arg.set_dictionary_version(dictionary_version)
        return arg

    # NotImplemented is a comparison sentinel, not an exception: raising it
    # produces a TypeError. NotImplementedError is the exception to use.
    raise NotImplementedError(
        "Cannot cast an argument of type {} to an IEML object".format(
            arg.__class__.__name__))
def test_usl_from_path(self):
    """Rebuild a two-role word from its path/value mapping.

    The rebuilt object is compared for equality with the USL obtained by
    casting the expected word string with ``usl``.
    """
    path_to_usl = {
        ">role>! E:A:.>flexion>E:": "E:",
        ">role>! E:A:.>content>constant>b.-S:.A:.-'S:.-'S:.-',": "b.-S:.A:.-'S:.-'S:.-',",
        ">role>E:A:. E:A:.>flexion>E:": "E:",
        ">role>E:A:. E:A:.>flexion>E:U:T:.": "E:U:T:.",
        ">role>E:A:. E:A:.>flexion>E:A:T:.": "E:A:T:.",
        ">role>E:A:. E:A:.>flexion>E:S:T:.": "E:S:T:.",
        ">role>E:A:. E:A:.>flexion>E:B:T:.": "E:B:T:.",
        ">role>E:A:. E:A:.>flexion>E:T:T:.": "E:T:T:.",
        ">role>E:A:. E:A:.>content>constant>k.a.-k.a.-'": "k.a.-k.a.-'",
    }

    parse_usl = IEMLParser().parse
    parse_path = PathParser().parse

    parsed_pairs = []
    for raw_path, raw_usl in path_to_usl.items():
        parsed_pairs.append((parse_path(raw_path), parse_usl(raw_usl)))

    rebuilt = usl_from_path_values(parsed_pairs)

    expected = usl(
        "[! E:A:. ()(b.-S:.A:.-'S:.-'S:.-',) > E:A:. E:A:. (m1(E:U:T:. E:A:T:. E:S:T:. E:B:T:. E:T:T:.))(k.a.-k.a.-')]"
    )
    self.assertEqual(rebuilt, expected)
folder=folder) # gitdb.pull() signature = pygit2.Signature("Louis van Beurden", "*****@*****.**") db = IEMLDatabase(folder=folder, use_cache=False) desc = db.get_descriptors() struct = db.get_structure() to_migrate = {} to_remove = [] parser = IEMLParser(dictionary=db.get_dictionary()) all_db = db.list() # assert "[E:.b.E:B:.- E:S:. ()(a.T:.-) > ! E:.l.- ()(d.i.-l.i.-')]" in all_db for s in TO_REMOVE: to_pass = True try: _s = parser.parse(s) except CannotParse as e: print(str(e)) print("\t", str(s)) to_pass = False else: if s not in all_db: repr("{} not in database".format(s))
def usl(
    arg: Union[str, Script, USL, Iterable[Tuple['UslPath', Union[USL,
                                                                 Script]]]]
) -> USL:
    """
    Cast the argument to a USL, dispatching on its type.

      - a string is parsed with ieml.usl.parser.IEMLParser.parse
      - an ieml.dictionary.Script becomes an ieml.usl.polymorpheme.PolyMorpheme
        holding the script as its only constant
      - an ieml.usl.usl.USL is returned unchanged
      - an iterable of (ieml.usl.decoration.path.UslPath, USL or Script) pairs
        is assembled with usl_from_path_values; an empty iterable gives a
        PolyMorpheme with no constant

    :param arg: the value to cast
    :type arg: Union[str, Script, USL, Iterable[Tuple['UslPath', Union[USL, Script]]]]
    :return: an ieml.usl.usl.USL
    :raises ValueError: if an iterable argument holds anything other than
        (UslPath, USL or Script) pairs
    :raises NotImplementedError: if the argument is of none of the above kinds
    """
    # Imports are kept local to each branch, as in the rest of this function.
    if isinstance(arg, str):
        from ieml.usl.parser import IEMLParser
        return IEMLParser().parse(arg)

    if isinstance(arg, Script):
        from ieml.usl import PolyMorpheme
        return PolyMorpheme(constant=[arg])

    if isinstance(arg, USL):
        return arg

    # Anything else must be an iterable of (path, usl) pairs.
    try:
        pairs = list(arg)
    except TypeError:
        pairs = None

    if pairs is None:
        raise NotImplementedError()

    if len(pairs) == 0:
        from ieml.usl import PolyMorpheme
        return PolyMorpheme(constant=[])

    from ieml.usl.decoration.path import UslPath, usl_from_path_values

    for path, value in pairs:
        if not (isinstance(value, (USL, Script))
                and isinstance(path, UslPath)):
            raise ValueError(
                "Invalid iterable of (UslPath, USL) to create an USL from.")

    return usl_from_path_values(pairs)