Esempio n. 1
0
    def add_morpheme_paradigm(self, script: Script, translations, comments):
        """Define a new non-root paradigm and commit it to the git database.

        The script is normalised with ``_check_script``. It is accepted only
        when ``len(script) != 1``, when it is not already among the
        dictionary scripts, and when all of its singular sequences resolve
        to one single root paradigm.

        :param script: the paradigm script to define.
        :param translations: indexable by language; each entry is an iterable
            of translation values.
        :param comments: indexable by language; each entry is an iterable of
            comment values.
        :raises ValueError: if the script is not a paradigm, is already
            defined, or does not map to exactly one root paradigm.
        """
        database = IEMLDatabase(folder=self.gitdb.folder,
                                use_cache=self.use_cache,
                                cache_folder=self.cache_folder)
        dictionary = database.get_dictionary()

        script = _check_script(script)
        if len(script) == 1:
            raise ValueError(
                "The script is not a paradigm {}, can't use it to define a paradigm."
                .format(str(script)))

        if script in dictionary.scripts:
            raise ValueError(
                "Script {} already defined in the dictionary".format(
                    str(script)))

        # Collect the root paradigm of every singular sequence; a sequence
        # without a root makes the lookup raise KeyError.
        root_candidates = set()
        for sequence in script.singular_sequences:
            try:
                root_candidates.add(dictionary.tables.root(sequence))
            except KeyError:
                raise ValueError(
                    "No root paradigms contains this script {}".format(
                        str(script)))

        if len(root_candidates) != 1:
            raise ValueError(
                "No root paradigms or too many for script {}".format(
                    str(script)))

        root = next(iter(root_candidates))
        descriptors = database.get_descriptors()

        def translation_summary(target):
            # "<lang>:<t1>, <t2>" for every language, separated by " / ".
            return " / ".join(
                "{}:{}".format(
                    lang,
                    ', '.join(
                        descriptors.get_values(target, lang, 'translations')))
                for lang in LANGUAGES)

        message = "[dictionary] Create paradigm {} ({}) for root paradigm {} ({})"\
            .format(str(script),
                    translation_summary(script),
                    str(root),
                    translation_summary(root))

        with self.gitdb.commit(self.signature, message):
            # Start from a clean slate for this script before writing.
            database.remove_descriptor(script)
            database.remove_structure(script)

            database.add_structure(script, 'is_root', False)

            for lang in LANGUAGES:
                for kind, per_language in (('translations', translations),
                                           ('comments', comments)):
                    for entry in per_language[lang]:
                        database.add_descriptor(script,
                                                language=lang,
                                                descriptor=kind,
                                                value=entry)
Esempio n. 2
0
    def delete_morpheme_root_paradigm(self,
                                      script: Script,
                                      empty_descriptors=True):
        """Remove a root paradigm from the dictionary in a single commit.

        :param script: the root paradigm to delete (normalised with
            ``_check_script``).
        :param empty_descriptors: when truthy, also remove the descriptors of
            every script returned by the dictionary's ``contains`` relation
            for this root.
        :raises ValueError: if the script is not one of the dictionary roots.
        """
        database = IEMLDatabase(folder=self.gitdb.folder,
                                use_cache=self.use_cache,
                                cache_folder=self.cache_folder)
        dictionary = database.get_dictionary()
        descriptors = database.get_descriptors()

        script = _check_script(script)
        if script not in dictionary.tables.roots:
            raise ValueError("Script {} is not a root paradigm".format(
                str(script)))

        per_language = [
            "{}:{}".format(
                lang,
                ', '.join(descriptors.get_values(script, lang,
                                                 'translations')))
            for lang in LANGUAGES
        ]
        message = "[dictionary] Remove root paradigm {} ({})".format(
            str(script), " / ".join(per_language))

        with self.gitdb.commit(self.signature, message):
            database.remove_structure(script)

            if empty_descriptors:
                # Materialise the relation first so the removals below cannot
                # interfere with the iteration.
                contained = list(
                    dictionary.relations.object(script, 'contains'))
                for member in contained:
                    database.remove_descriptor(member)
Esempio n. 3
0
    def test_merge_conflict(self):
        """Pull B into A after both committed different values for one ieml.

        After the pull, A's descriptors for the contested ieml must equal B's
        value, and A's own value must be reported in the returned conflicts.
        """
        def tagged_value(tag):
            # Same one-element payload for every descriptor kind and language.
            return {
                kind: {'fr': ['test' + tag], 'en': ['test' + tag]}
                for kind in ('translations', 'comments', 'tags')
            }

        # clone the same repository twice
        repo_a, repo_b = init_repo(['/tmp/iemldb_test/A',
                                    '/tmp/iemldb_test/B'])

        print("Building DB...")
        db_a = IEMLDatabase(folder=repo_a.folder)
        db_b = IEMLDatabase(folder=repo_b.folder)

        value_a = tagged_value('A')
        value_b = tagged_value('B')

        # two different values committed for the same ieml
        contested = "(a.)"
        commit(repo_a, db_a, contested, 'A', value_a)
        commit(repo_b, db_b, contested, 'B', value_b)

        # the same value committed on both sides
        shared = '(b.)'
        commit(repo_a, db_a, shared, 'A', value_b)
        commit(repo_b, db_b, shared, 'B', value_b)

        # a commit that exists only on A
        only_in_a = '(s.)'
        commit(repo_a, db_a, only_in_a, 'A', value_a)

        # register B as a remote of A, then pull it
        repo_a.add_remote('B', os.path.join(repo_b.folder, '.git'))

        print("Pulling A from B")
        conflicts = repo_a.pull('B')

        print(conflicts)
        merged = IEMLDatabase(folder=repo_a.folder).get_descriptors()

        self.assertDictEqual(merged.get_descriptor(contested), value_b)
        self.assertDictEqual(conflicts[contested], value_a)
Esempio n. 4
0
def migrate(function, _s_old, _s_new):
    """Rewrite dictionary scripts with ``function`` and commit the result.

    A fresh working copy of the ieml-language repository is set up in
    ``/tmp/migrate_script_iemldb`` (any existing folder there is deleted).
    Every script ``s`` of the dictionary for which ``function(s) != s`` has
    its structure and descriptor entries removed and re-added under
    ``function(s)``, all inside one commit.

    :param function: callable mapping a script to its migrated script.
    :param _s_old: sample script used to sanity-check ``function`` and to
        label the commit.
    :param _s_new: expected value of ``function(_s_old)``.
    :raises AssertionError: if ``function(_s_old) != _s_new``.
    """
    # Guard against running the migration with the wrong mapping.
    assert function(_s_old) == _s_new

    folder = '/tmp/migrate_script_iemldb'
    if os.path.isdir(folder):
        shutil.rmtree(folder)
    git_address = "https://github.com/IEMLdev/ieml-language.git"

    # Expand "~" explicitly so the key paths are absolute.
    # NOTE(review): presumably the git layer does not expand "~" itself --
    # confirm against the pygit2 documentation.
    credentials = pygit2.Keypair('ogrergo',
                                 os.path.expanduser('~/.ssh/id_rsa.pub'),
                                 os.path.expanduser('~/.ssh/id_rsa'), None)
    gitdb = GitInterface(origin=git_address,
                         credentials=credentials,
                         folder=folder)

    signature = pygit2.Signature("Louis van Beurden",
                                 "*****@*****.**")

    db = IEMLDatabase(folder=folder, use_cache=False)

    # Read the descriptors and structure before anything is removed.
    desc = db.get_descriptors()
    struct = db.get_structure()

    to_migrate = {}
    for s in db.get_dictionary().scripts:
        s2 = function(s)
        if s2 != s:
            to_migrate[s] = s2

    print(to_migrate)

    # The closing quote after the second placeholder was missing.
    message = "[Translate script] Translate paradigm from '{}' to '{}'".format(
        str(_s_old), str(_s_new))

    with gitdb.commit(signature, message):
        for s_old, s_new in to_migrate.items():
            db.remove_structure(s_old)
            for (_, key), values in struct.get_values_partial(s_old).items():
                for v in values:
                    db.add_structure(s_new, key, v)

            db.remove_descriptor(s_old)
            for (_, lang, d), values in desc.get_values_partial(s_old).items():
                for v in values:
                    db.add_descriptor(s_new, lang, d, v)
Esempio n. 5
0
    def set_descriptors(self, ieml, descriptor, value):
        """Replace the values of one descriptor kind for ``ieml``.

        :param ieml: the ieml object to update (normalised by ``_check_ieml``).
        :param descriptor: the descriptor kind being replaced.
        :param value: new values, indexable by language (normalised by
            ``_check_descriptors``).
        :return: ``False`` when the stored values already match ``value``
            (order-insensitive, per language); ``True`` after a commit that
            either updates this descriptor or, when nothing would be left for
            any descriptor kind in any language, removes all descriptors of
            ``ieml``.
        """
        database = IEMLDatabase(folder=self.gitdb.folder,
                                use_cache=self.use_cache,
                                cache_folder=self.cache_folder)

        ieml = _check_ieml(ieml)
        value = _check_descriptors(value)

        stored = database.get_descriptors()
        current = {}
        for lang in LANGUAGES:
            current[lang] = stored.get_values(ieml=ieml,
                                              language=lang,
                                              descriptor=descriptor)

        unchanged = all(
            sorted(value[lang]) == sorted(current[lang])
            for lang in LANGUAGES)
        if unchanged:
            error("No update needed, db already contains {}:{} for {}".format(
                descriptor, json.dumps(value), str(ieml)))
            return False

        def values_after_update(lang, kind):
            # What would be stored for (lang, kind) once the update is applied.
            if kind != descriptor:
                return stored.get_values(ieml=ieml,
                                         language=lang,
                                         descriptor=kind)
            return value[lang]

        # If no descriptor at all would remain, drop the whole entry instead.
        if not any(
                values_after_update(lang, kind)
                for lang in LANGUAGES for kind in DESCRIPTORS_CLASS):
            removal_message = '[descriptors] Remove {}'.format(str(ieml))
            error(removal_message)
            with self.gitdb.commit(self.signature, removal_message):
                database.remove_descriptor(ieml)
            return True

        update_message = '[descriptors] Update {} for {} to {}'.format(
            descriptor, str(ieml), json.dumps(value))
        with self.gitdb.commit(self.signature, update_message):
            database.remove_descriptor(ieml, None, descriptor)

            for lang in LANGUAGES:
                for entry in value[lang]:
                    database.add_descriptor(ieml, lang, descriptor, entry)

            return True
Esempio n. 6
0
    def update_all_ieml(self, f, message: str):
        """Apply ``f`` to every ieml of the database and move its descriptors.

        For each parsed ieml listed by the database, the descriptors stored
        under the old ieml are removed and re-added, value by value, under
        ``f(old_ieml)``. Everything happens inside one commit.

        :param f: callable mapping an old ieml to its new ieml.
        :param message: free text appended to the commit message.
        """
        db = IEMLDatabase(folder=self.gitdb.folder,
                          use_cache=self.use_cache,
                          cache_folder=self.cache_folder)
        # Read the descriptors before any removal below.
        desc = db.get_descriptors()

        with self.gitdb.commit(
                self.signature,
                '[IEML migration] Update all ieml in db: {}'.format(message)):

            for old_ieml in tqdm.tqdm(db.list(parse=True), "Migrate all usls"):
                new_ieml = f(old_ieml)

                values = desc.get_values_partial(old_ieml)

                db.remove_descriptor(old_ieml, None, None)

                # The partial lookup is keyed by (ieml, language, descriptor)
                # tuples. The previous nested loop indexed it by language and
                # read the loop variable `e` before it was bound, so it
                # failed on the first ieml that had a descriptor value.
                for (_, language, descriptor), entries in values.items():
                    for entry in entries:
                        db.add_descriptor(new_ieml, language, descriptor,
                                          entry)
Esempio n. 7
0
    def delete_morpheme_paradigm(self, script: Script):
        """Remove a paradigm's structure and descriptors in one commit.

        :param script: the paradigm to delete (normalised with
            ``_check_script``).
        :raises ValueError: if the script is defined in the dictionary and
            ``len(script) == 1`` (i.e. it is not a paradigm).
        """
        db = IEMLDatabase(folder=self.gitdb.folder,
                          use_cache=self.use_cache,
                          cache_folder=self.cache_folder)
        d = db.get_dictionary()
        descriptors = db.get_descriptors()

        script = _check_script(script)
        # NOTE(review): a script that is not in the dictionary passes this
        # guard and is only rejected, if at all, by the root lookup below.
        if script in d.scripts and len(script) == 1:
            raise ValueError("Script {} is not a paradigm".format(str(script)))

        root = d.tables.root(script)

        def translation_summary(target):
            # "<lang>:<t1>, <t2>" for every language, separated by " / ".
            return " / ".join(
                "{}:{}".format(
                    lang,
                    ', '.join(
                        descriptors.get_values(target, lang, 'translations')))
                for lang in LANGUAGES)

        # The template used to have two placeholders for four arguments, so
        # the root paradigm and its translations were silently dropped from
        # the commit message.
        message = "[dictionary] Remove paradigm {} ({}) for root paradigm {} ({})"\
            .format(str(script),
                    translation_summary(script),
                    str(root),
                    translation_summary(root))

        with self.gitdb.commit(self.signature, message):
            db.remove_structure(script)
            db.remove_descriptor(script)
Esempio n. 8
0
# Download the IEML language database and collect, for every USL it lists,
# the translations available per language.
gitdb = GitInterface(origin="https://github.com/plevyieml/ieml-language")
gitdb.pull()  # download database in ~/.cache/ieml/ folder

# instantiate a ieml.ieml_database.IEMLDatabase from the downloaded git repository
db = IEMLDatabase(folder=gitdb.folder)

# usls = db.list(parse=True, type='word')
#
# parsed_usls = list()
# for e in tqdm(usls):
#     parsed_usls.append(get_word_structure(e))
#
# with bz2.open(WORDS_FILENAME + ".bz2", "wt") as fout:
#     json.dump(parsed_usls, fout, indent=2)

descriptors = db.get_descriptors()

usls = db.list()

translations = list()
# USLs already exported. The previous check tested `e not in translations`,
# i.e. compared a USL with a list of dicts, so it could never fail and cost
# a linear scan on every iteration.
seen_usls = set()
for e in tqdm(usls):
    assert e not in seen_usls
    seen_usls.add(e)
    tr_dict = dict()
    values = descriptors.get_values_partial(e)
    # `key_usl` rather than `usl`: the loop variable must not rebind a
    # module-level name that other code calls as a function.
    for (key_usl, lang, label), tr_list in values.items():
        assert (key_usl == e)
        if label == "translations":
            # each language may appear only once per USL
            assert (lang not in tr_dict)
            tr_dict[lang] = tr_list
    translations.append({"usl": e, "translations": tr_dict})
Esempio n. 9
0
    def create_root_paradigm(self, root, inhibitions, translations, comments):
        """Create a root paradigm, then a paradigm for each of its tables.

        The root is committed with its ``is_root`` flag, its inhibitions and
        its descriptors. Afterwards ``add_morpheme_paradigm`` is called for
        every entry of ``root.tables_script`` that differs from the root,
        with index-suffixed copies of the descriptors.

        :param root: script of the root paradigm (normalised with
            ``_check_script``).
        :param inhibitions: inhibitions, validated by ``_check_inhibitions``.
        :param translations: translations, normalised by
            ``_check_descriptors`` and indexed by language.
        :param comments: comments, normalised by ``_check_descriptors`` and
            indexed by language.
        :raises ValueError: if the script is not a paradigm, already has
            descriptors, or shares a singular sequence with an existing root
            paradigm.
        """
        database = IEMLDatabase(folder=self.gitdb.folder,
                                use_cache=self.use_cache,
                                cache_folder=self.cache_folder)

        root = _check_script(root)
        if len(root) == 1:
            raise ValueError(
                "The script is not a paradigm {}, can't use it to define a root paradigm."
                .format(str(root)))

        translations = _check_descriptors(translations)
        comments = _check_descriptors(comments)

        # The script must not exist yet: any stored descriptor is a conflict.
        if database.get_descriptors().get_values_partial(root):
            raise ValueError(
                "Script {} already exists in dictionary".format(root))

        dictionary = database.get_dictionary()
        for sequence in root.singular_sequences:
            try:
                overlapping = dictionary.tables.root(sequence)
            except KeyError:
                # Expected case: this sequence belongs to no root paradigm.
                continue
            raise ValueError(
                "Root paradigms {} intersection with script {} ".format(
                    str(overlapping), str(root)))

        translation_summary = " / ".join(
            "{}:{}".format(
                lang,
                ', '.join(database.get_descriptors().get_values(
                    str(root), lang, 'translations')))
            for lang in LANGUAGES)
        message = (
            "[dictionary] Create root paradigm {} ({}), create {} singular sequences"
            .format(str(root), translation_summary,
                    len(root.singular_sequences)))

        with self.gitdb.commit(self.signature, message):
            database.remove_structure(root, 'is_root')
            database.add_structure(root, 'is_root', True)
            for inhibition in _check_inhibitions(inhibitions):
                database.add_structure(root, 'inhibition', inhibition)

            for lang in LANGUAGES:
                for kind, per_language in (('translations', translations),
                                           ('comments', comments)):
                    for entry in per_language[lang]:
                        database.add_descriptor(root,
                                                language=lang,
                                                descriptor=kind,
                                                value=entry)

        # add main tables header
        sub_tables = [table for table in root.tables_script if table != root]
        for index, table in enumerate(sub_tables):
            self.add_morpheme_paradigm(
                table,
                translations=append_idx_to_dict(translations, index),
                comments=append_idx_to_dict(comments, index))
Esempio n. 10
0
    def update_morpheme_paradigm(
        self,
        script_old: Script,
        script_new: Script,
    ):
        """Move a paradigm from ``script_old`` to ``script_new`` in one commit.

        Both scripts are normalised with ``_check_script``; nothing happens
        when they are equal. Otherwise the descriptors of ``script_old`` are
        re-added under ``script_new``.

        * Root paradigm (the stored ``is_root`` value starts with "t"):
          ``script_old`` must be contained in ``script_new``. The structure
          entries of ``script_old`` are copied to ``script_new``;
          ``script_old`` keeps only ``is_root = 'False'`` and receives copies
          of its descriptors prefixed with a "translation migrated to" note.
        * Any other paradigm: the structure of ``script_old`` is removed and
          ``script_new`` gets ``is_root = 'False'``.

        :param script_old: the currently defined paradigm.
        :param script_new: the script that replaces it; must not be defined.
        :raises AssertionError: if both scripts have length 1, if
            ``script_old`` is not defined, if ``script_new`` is already
            defined, if the singular sequences of ``script_new`` do not
            resolve to exactly one root paradigm, or if a root paradigm is
            not contained in its replacement.
        :raises ValueError: if ``script_old`` is not a root paradigm and a
            singular sequence of ``script_new`` belongs to no root paradigm.
        """
        script_old = _check_script(script_old)
        script_new = _check_script(script_new)

        if script_old == script_new:
            return

        # Only rejected when BOTH scripts have length 1.
        assert len(script_old) != 1 or len(
            script_new) != 1, "Can't update singular sequences, only paradigms"

        db = IEMLDatabase(folder=self.gitdb.folder,
                          use_cache=self.use_cache,
                          cache_folder=self.cache_folder)
        d = db.get_dictionary()
        desc = db.get_descriptors()
        ds = db.get_structure()

        assert script_old in d.scripts, "Source script not defined in dictionary"
        assert script_new not in d.scripts, "Target script already defined in dictionary"
        # NOTE(review): root_old is not used afterwards in this method.
        root_old = d.tables.root(script_old)
        is_root = ds.get_values(script_old, 'is_root')
        # Truthy only when a value is stored and its first character is
        # "t"/"T".
        is_root = is_root and is_root[0][0].lower() == 't'

        # Root paradigms of the singular sequences of the new script.
        root_new_cand = set()
        for ss in script_new.singular_sequences:
            try:
                root_new_cand.add(d.tables.root(ss))
            except KeyError:
                # A root paradigm may cover sequences that are in no root
                # yet; any other paradigm may not.
                if not is_root:
                    raise ValueError(
                        "A non root paradigm is defined over singular sequences that are in no paradigms"
                    )

        assert len(
            root_new_cand
        ) == 1, "No root paradigms or too many for script {}".format(
            str(script_new))
        # NOTE(review): root_new is not used afterwards in this method.
        root_new = next(iter(root_new_cand))

        # NOTE(review): the template has two placeholders, so the third
        # argument (the translations summary) is ignored by str.format.
        message = "[dictionary] Update paradigm IEML from {} to {}"\
                          .format(str(script_old),
                                  str(script_new),
                                  " / ".join(
                                      "{}:{}".format(l, desc.get_values(script_new, l, 'translations')) for l in LANGUAGES))

        if is_root:
            # 1st case: root paradigm

            assert script_old in script_new, "Can only update a root paradigm to a bigger version of it"

            # then we can update it to a bigger version of it
            # old_structure is only bound in the root case; it is only read
            # in the matching `if is_root` branch below.
            old_structure = ds.get_values_partial(script_old)

        # transfers translations and structure
        with self.gitdb.commit(self.signature, message):

            if is_root:
                # The old script stays defined, but no longer as a root.
                db.remove_structure(script_old)
                db.add_structure(script_old, 'is_root', 'False')

                for (_, key), values in old_structure.items():
                    for v in values:
                        db.add_structure(script_new, key, v)
            else:
                db.remove_structure(script_old)
                db.add_structure(script_new, 'is_root', 'False')

            db.remove_descriptor(script_old)

            # `desc` was read before the removal above; its entries for the
            # old script are used as the source for the copy.
            for (_, l,
                 k), values in desc.get_values_partial(script_old).items():
                for v in values:
                    db.add_descriptor(script_new, l, k, v)
                    if is_root:
                        # Leave a marker on the old root pointing to the new
                        # script.
                        db.add_descriptor(
                            script_old, l, k,
                            '(translation migrated to {}) '.format(
                                str(script_new)) + v)
Esempio n. 11
0
    def test_add_remove(self):
        """Pull between two clones when one edits and the other empties an ieml.

        Phase 1: both clones commit the same value, and the pull reports no
        conflict. Phase 2: A modifies the value while B empties it. Pulling
        B into A leaves B's empty value and reports A's value as a conflict;
        pulling A back into B leaves the empty value with no conflict.
        """
        def descriptor_value(fr, en):
            # Same per-language payload for every descriptor kind.
            return {
                kind: {'fr': list(fr), 'en': list(en)}
                for kind in ('translations', 'comments', 'tags')
            }

        # clone the same repository twice
        repo_a, repo_b = init_repo(['/tmp/iemldb_test/A',
                                    '/tmp/iemldb_test/B'])

        print("Building DB...")
        db_a = IEMLDatabase(folder=repo_a.folder)
        db_b = IEMLDatabase(folder=repo_b.folder)

        ieml = "(a.)"
        tag = 'A'
        initial = descriptor_value(['test' + tag], ['test' + tag])
        emptied = descriptor_value([], [])

        # identical value committed on both sides
        commit(repo_a, db_a, ieml, 'A', initial)
        commit(repo_b, db_b, ieml, 'B', initial)

        # each clone becomes a remote of the other
        repo_a.add_remote('B', os.path.join(repo_b.folder, '.git'))
        repo_b.add_remote('A', os.path.join(repo_a.folder, '.git'))

        print("Pulling A from B")
        conflicts = repo_a.pull('B')

        print(conflicts)
        db_a = IEMLDatabase(folder=repo_a.folder)

        self.assertDictEqual(db_a.get_descriptors().get_descriptor(ieml),
                             initial)
        self.assertDictEqual(conflicts, {})

        # modify ieml in A, and remove it in B
        modified = descriptor_value(['test2' + tag], ['test' + tag])
        commit(repo_a, db_a, ieml, 'A', modified)
        commit(repo_b, db_b, ieml, 'B', emptied)

        print("Pulling A from B")
        conflicts = repo_a.pull('B')

        print(conflicts)
        db_a = IEMLDatabase(folder=repo_a.folder)

        self.assertDictEqual(db_a.get_descriptors().get_descriptor(ieml),
                             emptied)
        self.assertDictEqual(conflicts[ieml], modified)

        print("Pulling B from A")
        conflicts = repo_b.pull('A')

        print(conflicts)
        db_b = IEMLDatabase(folder=repo_b.folder)

        self.assertDictEqual(db_b.get_descriptors().get_descriptor(ieml),
                             emptied)
        self.assertDictEqual(conflicts, {})
Esempio n. 12
0
        doc.packages.append(Package('xcolor', ['dvipsnames', 'table']))
        try:
            doc.generate_pdf(clean_tex=False, silent=False)
            doc.generate_tex()
        except subprocess.CalledProcessError as e:
            os.chdir(
                old_cwd)  # because pylatex change it but doesnt restore it
            raise e

        with open(path + '.pdf', 'rb') as fp:
            return fp.read()


def rendex_latex_word(w: Word, descriptors: Descriptors, language: LANGUAGES):
    """Convert a word to LaTeX and compile it.

    :param w: the word to render.
    :param descriptors: descriptors handed to ``word_to_latex``.
    :param language: language handed to ``word_to_latex``.
    :return: whatever ``compile_latex`` returns for the generated source.
    """
    latex_source = word_to_latex(w, descriptors, language)
    return compile_latex(latex_source)


if __name__ == "__main__":
    # Demo: render one hard-coded word with its English descriptors and
    # write the result to output.pdf in the current directory.
    ieml = "[E:T:. (E:.b.wa.- E:.-wa.-t.o.-' E:.-'we.-S:.-'t.o.-',)(e.) > E:.n.- (E:.wo.- E:S:.-d.u.-') > E:.d.- (E:.wo.- E:S:.-d.u.-')(m.-S:.U:.-') > ! E:.n.- E:U:. ()]"

    gitdb = GitInterface()
    db = IEMLDatabase(gitdb.folder)

    res = rendex_latex_word(usl(ieml), db.get_descriptors(), 'en')

    with open("output.pdf", 'wb') as fp:
        fp.write(res)
class IemlData:
    """
    Read an IEML database and turn its words into vector representations.
    """
    def __init__(self, input_database_folder_path, out_file_path=None):
        """Open the database and, optionally, the two output files.

        :param input_database_folder_path: folder of the IEML database.
        :param out_file_path: when given, this file is truncated and then
            reopened in append mode, together with a sibling ``.vocab`` file
            (named after the path with ".tsv"/".csv" removed).
        """
        from ieml.ieml_database import IEMLDatabase
        # input
        self.database = IEMLDatabase(folder=input_database_folder_path)
        # output (optional)
        if out_file_path is None:
            return
        # Empty the output file before it is reopened for appending.
        with open(out_file_path, "w") as truncated:
            truncated.write("")
        stem = out_file_path.replace(".tsv", "").replace(".csv", "")
        self.vocab_file_path = "{0}.vocab".format(stem)
        self.vocab_file = open(self.vocab_file_path, "a")
        self.out_file_path = out_file_path
        self.out_file = open(out_file_path, "a")

    def close_all(self):
        """Close the vocab and output files if they were opened."""
        for attribute in ("vocab_file", "out_file"):
            try:
                getattr(self, attribute).close()
            except AttributeError:
                # No output path was given, so this file was never opened.
                pass

    def get_word_objects(self):
        """Return the database listing for type 'word', unparsed."""
        words = self.database.list(parse=False, type='word')
        return words

    def list_polymorpheme_of_word(self, w):
        """Return ``(pm_content, pm_flexion)`` for each actor of a word.

        Syntagmatic functions whose ``actor`` is ``None`` are skipped.
        """
        # WORKAROUND (to be solved, then removed): this entry is skipped.
        skipped_entry = "[! E:B:. ()(k.a.-k.a.-' l.o.-k.o.-') > E:.f.- ()(p.E:A:T:.-)] [>role>E:B:.>content>constant>k.a.-k.a.-'"
        if w == skipped_entry:
            return []
        w = usl(w)
        assert isinstance(w, Word)
        return [(sfun.actor.pm_content, sfun.actor.pm_flexion)
                for sfun in w.syntagmatic_fun.actors.values()
                if sfun.actor is not None]

    def get_natural_lang_meanings(self, lang="en"):
        """Collect, per word, its translations and those of its polymorphemes.

        Each entry is a list of string lists: the word's own translations
        (joined with " , "), the separator ``[" : "]``, then one translation
        list per polymorpheme that has translations in ``lang``.
        """
        descriptors = self.database.get_descriptors()
        nl_meanings = []
        for word in self.get_word_objects():
            # meaning of the word itself
            entry = [
                [" , ".join(values)]
                for (_, language, label), values in
                descriptors.get_values_partial(word).items()
                if language == lang and label == "translations"
            ]
            # separator between the word and its polymorphemes
            entry.append([" : "])
            # meaning of every polymorpheme
            for pm_pair in self.list_polymorpheme_of_word(word):
                for pm in pm_pair:
                    entry.extend(
                        values for (_, language, label), values in
                        descriptors.get_values_partial(pm).items()
                        if language == lang and label == "translations")
            nl_meanings.append(entry)
        return nl_meanings

    def get_bert_emb(self, string, bert_class):
        """Embed one string with ``bert_class`` (a new BertEmbedd if None)."""
        if bert_class is None:
            bert_class = BertEmbedd()
        return bert_class.bert([string])

    def make_bert_emb_list(self, lang="en", bert_class=None, dump=False):
        """Embed the natural-language meaning of every word.

        :param lang: language of the translations to embed.
        :param bert_class: embedding object; a ``BertEmbedd`` is created when
            ``None``.
        :param dump: unless it is exactly ``False``, every (vocab, vector)
            pair is also written to the vocab and output files.
        :return: one embedding per word.

        The output files are closed before returning, whatever ``dump`` is.
        """
        if bert_class is None:
            bert_class = BertEmbedd()
        bert_embeddings = [
            self.get_bert_emb(
                " ".join(" ".join(pm) for pm in meaning if len(pm) != 0),
                bert_class)
            for meaning in self.get_natural_lang_meanings(lang)
        ]
        if dump is False:
            self.close_all()
            return bert_embeddings

        # dump the embeddings
        for embedding in bert_embeddings:
            for (vocab, vector) in embedding:
                vector = np.array(vector)
                self.vocab_file.write("{0}\n".format(json.dumps(vocab)))
                try:
                    serialized = json.dumps(vector)
                except TypeError:
                    # Not JSON serialisable: use the array's own dump.
                    serialized = vector.dumps()
                self.out_file.write("{0}\n".format(serialized))
        self.close_all()
        return bert_embeddings