コード例 #1
0
    def process(self, lexicon: Lexicon):
        """Build Princeton WordNet to DeriNet format.

        Creates one lexeme per harmonized entry, then adds the main
        derivational relation (with its semantic label), alternative
        parents, and references to families split during harmonization.
        Returns the populated lexicon.
        """
        # load harmonized data; `with` guarantees the file handle is
        # closed (pickle.load(open(...)) leaked it)
        with open(self.fname, 'rb') as f:
            harm = pickle.load(f)
        parse_pos = {'V': 'VERB', 'N': 'NOUN'}

        # add lemmas and morphological features
        for entry in harm:
            lexicon.create_lexeme(lemma=entry['form'],
                                  pos=parse_pos[entry['pos']])

        # add main derivational relations and semantic labels,
        # add other derivational relations and semantic labels,
        # add references to split families
        for entry in harm:
            c_pos = parse_pos[entry['pos']]
            chi_node = lexicon.get_lexemes(lemma=entry['form'], pos=c_pos)[0]

            # main parent: ((lemma_pos, ...), semantic_label)
            if entry['parent']:
                p_form, p_pos = entry['parent'][0][0].split('_')
                p_pos = parse_pos[p_pos]
                par_node = lexicon.get_lexemes(lemma=p_form, pos=p_pos)[0]

                lexicon.add_derivation(source=par_node, target=chi_node)

                label = entry['parent'][1].capitalize()
                chi_node.parent_relation.feats['SemanticLabel'] = label

            # alternative parents are stored in misc, skipping the one
            # already recorded as the main relation
            if entry['others']:  # TODO: change place to 9th colummn
                parents = list()
                for other in entry['others']:
                    p_form, p_pos = other[0][0].split('_')
                    label = other[1].capitalize()
                    p_pos = parse_pos[p_pos]
                    par_node = lexicon.get_lexemes(lemma=p_form, pos=p_pos)[0]

                    rl_par = chi_node.parent_relation
                    # equivalent to the original two-branch test:
                    # keep the parent unless it duplicates the main one
                    if not rl_par \
                       or par_node.lemid != rl_par.sources[0].lemid:
                        p = par_node.lemid + '&SemanticLabel=' + label
                        p += '&Type=Derivation'
                        parents.append(p)

                if parents:
                    chi_node.misc['other_parents'] = '|'.join(parents)

            # lexemes this entry used to share a family with
            if entry['ref_roots']:
                roots = list()
                for ref in entry['ref_roots']:
                    p_form, p_pos = ref.split('_')
                    p_pos = parse_pos[p_pos]
                    par_node = lexicon.get_lexemes(lemma=p_form, pos=p_pos)[0]
                    if par_node.lemid != chi_node.lemid:
                        roots.append(par_node.lemid)

                if roots:
                    chi_node.misc['was_in_family_with'] = '&'.join(roots)

        return lexicon
コード例 #2
0
    def process(self, lexicon: Lexicon):
        """Build DerivBaseHR to DeriNet format.

        Creates one lexeme per entry (with a `lemma#POS` lemid), then
        adds main derivational relations and references to families
        split during harmonization. Returns the populated lexicon.
        """
        # load harmonized data; `with` guarantees the file handle is
        # closed (pickle.load(open(...)) leaked it)
        with open(self.fname, 'rb') as f:
            harm = pickle.load(f)
        parse_pos = {'V': 'VERB', 'N': 'NOUN', 'A': 'ADJ'}

        # add lemmas and morphological features
        for entry in harm:
            lid = entry['form'] + '#' + parse_pos[entry['pos']]
            lexicon.create_lexeme(lemma=entry['form'],
                                  pos=parse_pos[entry['pos']],
                                  lemid=lid)

        # add main relations and rules,
        # add references to split families
        for entry in harm:
            c_pos = parse_pos[entry['pos']]
            c_lid = entry['form'] + '#' + c_pos
            chi_node = lexicon.get_lexemes(lemma=entry['form'],
                                           pos=c_pos,
                                           lemid=c_lid)[0]

            # main parent is stored as 'lemma_POS'
            if entry['parent']:
                p_form, p_pos = entry['parent'][0].split('_')
                p_pos = parse_pos[p_pos]
                p_lid = p_form + '#' + p_pos
                par_node = lexicon.get_lexemes(lemma=p_form,
                                               pos=p_pos,
                                               lemid=p_lid)[0]

                lexicon.add_derivation(source=par_node, target=chi_node)

            # lexemes this entry used to share a family with
            if entry['ref_roots']:
                roots = list()
                for ref in entry['ref_roots']:
                    p_form, p_pos = ref.split('_')
                    p_pos = parse_pos[p_pos]
                    p_lid = p_form + '#' + p_pos
                    par_node = lexicon.get_lexemes(lemma=p_form,
                                                   pos=p_pos,
                                                   lemid=p_lid)[0]

                    if par_node.lemid != chi_node.lemid:
                        roots.append(par_node.lemid)

                if roots:
                    chi_node.misc['was_in_family_with'] = '&'.join(roots)

        return lexicon
コード例 #3
0
    def process(self, lexicon: Lexicon):
        """Build Serbo-Croatian Etymological WordNet to DeriNet format.

        Creates one POS-less lexeme per entry, then adds the main
        derivational relation, alternative parents, and references to
        families split during harmonization. Returns the lexicon.
        """
        # load harmonized data; `with` guarantees the file handle is
        # closed (pickle.load(open(...)) leaked it)
        with open(self.fname, 'rb') as f:
            harm = pickle.load(f)

        # add lemmas and morphological features (no POS in this source)
        for entry in harm:
            lexicon.create_lexeme(lemma=entry['form'], pos='')

        # add main derivational relations,
        # add other derivational relations,
        # add references to split families
        for entry in harm:
            chi_node = lexicon.get_lexemes(lemma=entry['form'])[0]

            if entry['parent']:
                par_node = lexicon.get_lexemes(lemma=entry['parent'][0])[0]
                lexicon.add_derivation(source=par_node, target=chi_node)

            # alternative parents go to misc, skipping the main parent
            if entry['others']:  # TODO: change place to 9th colummn
                parents = list()
                for other in entry['others']:
                    par_node = lexicon.get_lexemes(lemma=other[0])[0]

                    rl_par = chi_node.parent_relation
                    # equivalent to the original two-branch test
                    if not rl_par \
                       or par_node.lemid != rl_par.sources[0].lemid:
                        parents.append(par_node.lemid + '&Type=Derivation')

                if parents:
                    chi_node.misc['other_parents'] = '|'.join(parents)

            # lexemes this entry used to share a family with
            if entry['ref_roots']:
                roots = list()
                for ref in entry['ref_roots']:
                    par_node = lexicon.get_lexemes(lemma=ref)[0]
                    if par_node.lemid != chi_node.lemid:
                        roots.append(par_node.lemid)

                if roots:
                    chi_node.misc['was_in_family_with'] = '&'.join(roots)

        return lexicon
コード例 #4
0
    def process(self, lexicon: Lexicon):
        """Build DerIvaTario to DeriNet format.

        Creates one lowercased lexeme per entry, adds main derivational
        relations, and copies the original resource's id and
        segmentation into misc. Returns the populated lexicon.
        """
        # load harmonized data; `with` guarantees the file handle is
        # closed (pickle.load(open(...)) leaked it)
        with open(self.fname, 'rb') as f:
            harm = pickle.load(f)
        parse_pos = {
            'V': 'VERB',
            'N': 'NOUN',
            'A': 'ADJ',
            'D': 'ADV',
            'E': 'X',
            'X': 'X'
        }

        # add lemmas and morphological features
        for entry in harm:
            lid = entry['form'].lower() + '#' + parse_pos[entry['pos']]
            lexicon.create_lexeme(lemma=entry['form'].lower(),
                                  pos=parse_pos[entry['pos']],
                                  lemid=lid)

        # add main relations,
        # add original features
        for entry in harm:
            c_pos = parse_pos[entry['pos']]
            c_lid = entry['form'].lower() + '#' + c_pos
            chi_node = lexicon.get_lexemes(lemma=entry['form'].lower(),
                                           pos=c_pos,
                                           lemid=c_lid)[0]

            if entry['parent']:
                p_form, p_pos = entry['parent'][0].split('_')
                p_pos = parse_pos[p_pos]
                p_lid = p_form.lower() + '#' + p_pos

                par_node = lexicon.get_lexemes(lemma=p_form.lower(),
                                               pos=p_pos,
                                               lemid=p_lid)[0]
                lexicon.add_derivation(source=par_node, target=chi_node)

            # original annotation: 'id;lemma;morph;...;last' -- keep the
            # numeric id and the non-empty segmentation morphs
            orig = entry['orig'].split(';')
            orig_id = int(orig[0])
            orig_sg = [i for i in orig[2:-1] if i]
            chi_node.misc['original_id'] = orig_id
            chi_node.misc['segmentation'] = orig_sg

        return lexicon
コード例 #5
0
    def process(self, lexicon: Lexicon):
        """Build Latin WFL to DeriNet format.

        Creates lexemes (deduplicated, since compounds can introduce the
        same lemma twice), then adds main relations with the affix used,
        alternative parents, split-family references, and compounding.
        Returns the populated lexicon.
        """
        def parse_lemmas(l_lem, l_pos):
            """Parse a 'POS_gender_?_wid' attribute string.

            Returns (universal POS, feature dict, lemid) where lemid is
            'lemma#POS#wid'.
            """
            parse_pos = {
                'A': 'ADJ',
                'N': 'NOUN',
                'V': 'VERB',
                'I': 'X',
                'P': 'PRON',
                'U': 'AUX'
            }
            gend_parse = {'m': 'Masc', 'f': 'Fem', 'n': 'Neut'}
            pos, gend, _, wid = l_pos.split('_')

            feat = {}
            if pos[0] == 'N':
                if gend in ('m', 'f', 'n'):
                    feat['Gender'] = gend_parse[gend]
                if len(pos) > 1:
                    feat['Declension'] = pos[1]
            elif pos[0] == 'A' and len(pos) > 1:
                feat['AdjClass'] = pos[1]
            elif pos[0] == 'V':
                if len(pos) <= 1:
                    pass
                elif pos[1] in ('1', '2', '3', '4', '5'):
                    feat['Conjugation'] = pos[1]
                elif pos[1] == 'A':
                    # 'VA' marks auxiliary verbs -> map to AUX
                    pos = 'U'

            lid = l_lem + '#' + parse_pos[pos[0]] + '#' + wid
            return parse_pos[pos[0]], feat, lid

        # load harmonized data; `with` guarantees the file handle is
        # closed (pickle.load(open(...)) leaked it)
        with open(self.fname, 'rb') as f:
            harm = pickle.load(f)

        # add lemmas, morphological features and segmentation
        for entry in harm:
            pos, feat, lid = parse_lemmas(entry['form'], entry['pos'])

            # check presence in the lexicon (due to compounds)
            present = lexicon.get_lexemes(lemma=entry['form'],
                                          pos=pos,
                                          lemid=lid)

            if len(present) == 0:
                lexicon.create_lexeme(lemma=entry['form'],
                                      pos=pos,
                                      feats=feat,
                                      lemid=lid)

        # add main relations and used afix,
        # add other derivational relations and used afix,
        # add references to split families,
        # add compounding
        for entry in harm:
            c_pos, _, c_lid = parse_lemmas(entry['form'], entry['pos'])
            chi_node = lexicon.get_lexemes(lemma=entry['form'],
                                           pos=c_pos,
                                           lemid=c_lid)[0]

            if entry['parent']:
                parse = entry['parent'][0][0].split('_')
                p_form = parse[0]
                p_pos, _, p_lid = parse_lemmas(p_form, '_'.join(parse[1:]))

                par_node = lexicon.get_lexemes(lemma=p_form,
                                               pos=p_pos,
                                               lemid=p_lid)[0]

                # relation metadata: affix used and relation type
                afix = entry['parent'][1][3]
                typ = entry['parent'][1][2].replace('Derivation_', '')
                if typ in ('Prefix', 'Suffix'):
                    lexicon.add_derivation(source=par_node, target=chi_node)
                    chi_node.parent_relation.feats[typ] = afix
                elif typ == 'Conversion':
                    lexicon.add_conversion(source=par_node, target=chi_node)

            # alternative parents go to misc, skipping the main parent
            if entry['others']:  # TODO: change place to 9th colummn;conversion
                parents = list()
                for other in entry['others']:
                    parse = other[0][0].split('_')
                    p_form = parse[0]
                    p_pos, _, p_lid = parse_lemmas(p_form, '_'.join(parse[1:]))

                    afix = other[1][3]
                    typ = other[1][2].replace('Derivation_', '')

                    par_node = lexicon.get_lexemes(lemma=p_form,
                                                   pos=p_pos,
                                                   lemid=p_lid)[0]

                    rl_par = chi_node.parent_relation
                    # equivalent to the original two-branch test
                    if not rl_par \
                       or par_node.lemid != rl_par.sources[0].lemid:
                        if typ in ('Prefix', 'Suffix'):
                            p = par_node.lemid + '&' + typ + '=' + afix
                            p += '&Type=Derivation'
                            parents.append(p)
                        else:
                            parents.append(par_node.lemid + '&Type=' + typ)

                if parents:
                    chi_node.misc['other_parents'] = '|'.join(parents)

            # lexemes this entry used to share a family with
            if entry['ref_roots']:
                roots = list()
                for ref in entry['ref_roots']:
                    parse = ref.split('_')
                    p_form = parse[0]
                    p_pos, _, p_lid = parse_lemmas(p_form, '_'.join(parse[1:]))

                    par_node = lexicon.get_lexemes(lemma=p_form,
                                                   pos=p_pos,
                                                   lemid=p_lid)[0]

                    if par_node.lemid != chi_node.lemid:
                        roots.append(par_node.lemid)

                if roots:
                    chi_node.misc['was_in_family_with'] = '&'.join(roots)

            # compounding: two parents, created on demand
            if entry['compounding']:
                p1_parse = entry['compounding'][0][0].split('_')
                p1_form = p1_parse[0]
                p1_attr = '_'.join(p1_parse[1:])
                p1_pos, p1_feat, p1_lid = parse_lemmas(p1_form, p1_attr)

                p1_node = lexicon.get_lexemes(lemma=p1_form,
                                              pos=p1_pos,
                                              lemid=p1_lid)
                if len(p1_node) == 0:
                    lexicon.create_lexeme(lemma=p1_form,
                                          pos=p1_pos,
                                          feats=p1_feat,
                                          lemid=p1_lid)

                p1_node = lexicon.get_lexemes(lemma=p1_form,
                                              pos=p1_pos,
                                              lemid=p1_lid)[0]

                p2_parse = entry['compounding'][1][0].split('_')
                p2_form = p2_parse[0]
                p2_attr = '_'.join(p2_parse[1:])
                p2_pos, p2_feat, p2_lid = parse_lemmas(p2_form, p2_attr)
                p2_node = lexicon.get_lexemes(lemma=p2_form,
                                              pos=p2_pos,
                                              lemid=p2_lid)
                if len(p2_node) == 0:
                    lexicon.create_lexeme(lemma=p2_form,
                                          pos=p2_pos,
                                          feats=p2_feat,
                                          lemid=p2_lid)

                p2_node = lexicon.get_lexemes(lemma=p2_form,
                                              pos=p2_pos,
                                              lemid=p2_lid)[0]

                # skip degenerate compounds (same parent twice, missing)
                if p1_node == p2_node or not p1_node or not p2_node:
                    continue
                lexicon.add_composition([p1_node, p2_node], p1_node, chi_node)

        return lexicon
コード例 #6
0
    def process(self, lexicon: Lexicon):
        """Build DerivBase to DeriNet format.

        Creates lexemes with gender-aware lemids, then adds main
        derivational relations (with the rule used), alternative
        parents, and split-family references. Returns the lexicon.
        """
        # load harmonized data; `with` guarantees the file handle is
        # closed (pickle.load(open(...)) leaked it)
        with open(self.fname, 'rb') as f:
            harm = pickle.load(f)
        # maps source POS tag -> (universal POS, optional gender feature)
        parse_pos = {
            'V': ('VERB', None),
            'Nn': ('NOUN', 'Gender:Neut'),
            'Nf': ('NOUN', 'Gender:Fem'),
            'Nm': ('NOUN', 'Gender:Masc'),
            'N': ('NOUN', None),
            'A': ('ADJ', None)
        }

        # add lemmas and morphological features
        for entry in harm:
            feat = {}
            if parse_pos[entry['pos']][1] is not None:
                for f in parse_pos[entry['pos']][1:]:
                    key, value = f.split(':')
                    feat[key] = value

            # lemid: 'lemma#POS' plus '#Gender' when gender is known
            lid = entry['form'] + '#' + parse_pos[entry['pos']][0]
            if parse_pos[entry['pos']][1]:
                lid += '#' + parse_pos[entry['pos']][1].replace('Gender:', '')

            lexicon.create_lexeme(lemma=entry['form'],
                                  pos=parse_pos[entry['pos']][0],
                                  feats=feat,
                                  lemid=lid)

        # add main relations and rules,
        # add other derivational relations and rules,
        # add references to split families
        for entry in harm:
            c_pos = parse_pos[entry['pos']]
            c_lid = entry['form'] + '#' + c_pos[0]
            if c_pos[1]:
                c_lid += '#' + c_pos[1].replace('Gender:', '')

            chi_node = lexicon.get_lexemes(lemma=entry['form'],
                                           pos=c_pos[0],
                                           lemid=c_lid)[0]

            if entry['parent']:
                p_form, p_pos = entry['parent'][0][0].split('_')
                p_pos = parse_pos[p_pos]
                p_lid = p_form + '#' + p_pos[0]
                if p_pos[1]:
                    p_lid += '#' + p_pos[1].replace('Gender:', '')

                par_node = lexicon.get_lexemes(lemma=p_form,
                                               pos=p_pos[0],
                                               lemid=p_lid)[0]

                lexicon.add_derivation(source=par_node, target=chi_node)

                rule = entry['parent'][1].replace('>', '')
                chi_node.parent_relation.feats['Rule'] = rule

            # alternative parents go to misc, skipping the main parent
            if entry['others']:  # TODO: change place to 9th colummn
                parents = list()
                for other in entry['others']:
                    p_form, p_pos = other[0][0].split('_')
                    rule = other[1].replace('>', '')
                    p_pos = parse_pos[p_pos]
                    p_lid = p_form + '#' + p_pos[0]
                    if p_pos[1]:
                        p_lid += '#' + p_pos[1].replace('Gender:', '')

                    par_node = lexicon.get_lexemes(lemma=p_form,
                                                   pos=p_pos[0],
                                                   lemid=p_lid)[0]

                    rl_par = chi_node.parent_relation
                    # equivalent to the original two-branch test
                    if not rl_par \
                       or par_node.lemid != rl_par.sources[0].lemid:
                        p = par_node.lemid + '&Rule=' + rule
                        p += '&Type=Derivation'
                        parents.append(p)

                if parents:
                    chi_node.misc['other_parents'] = '|'.join(parents)

            # lexemes this entry used to share a family with
            if entry['ref_roots']:
                roots = list()
                for ref in entry['ref_roots']:
                    p_form, p_pos = ref.split('_')
                    p_pos = parse_pos[p_pos]
                    p_lid = p_form + '#' + p_pos[0]
                    if p_pos[1]:
                        p_lid += '#' + p_pos[1].replace('Gender:', '')

                    par_node = lexicon.get_lexemes(lemma=p_form,
                                                   pos=p_pos[0],
                                                   lemid=p_lid)[0]

                    if par_node.lemid != chi_node.lemid:
                        roots.append(par_node.lemid)

                if roots:
                    chi_node.misc['was_in_family_with'] = '&'.join(roots)

        return lexicon
コード例 #7
0
    def process(self, lexicon: Lexicon):
        """Build GCelex to DeriNet format.

        Creates lexemes with 'lemma#POS#origid' lemids, then adds main
        derivational relations, original segmentation features, and
        compound parents (created on demand). Returns the lexicon.

        Bug fix vs. original: when the second compound parent was newly
        created, its segmentation features were looked up (and written)
        via the FIRST parent's lemma/pos/lemid; now p2's own identifiers
        are used.
        """
        # load harmonized data; `with` guarantees the file handle is
        # closed (pickle.load(open(...)) leaked it)
        with open(self.fname, 'rb') as f:
            harm = pickle.load(f)
        parse_pos = {
            'N': 'NOUN',
            'V': 'VERB',
            'A': 'ADJ',
            'D': 'ADV',
            'X': 'X',
            'C': 'NUM',
            'P': 'ADP'
        }

        # add lemmas and morphological features
        for entry in harm:
            oid, form = entry['form'].split('_')
            lid = form + '#' + parse_pos[entry['pos']] + '#' + oid
            lexicon.create_lexeme(lemma=form,
                                  pos=parse_pos[entry['pos']],
                                  lemid=lid)

        # add main relations,
        # add original features,
        # add compounds
        for entry in harm:
            c_pos = parse_pos[entry['pos']]
            oid, form = entry['form'].split('_')
            c_lid = form + '#' + c_pos + '#' + oid
            chi_node = lexicon.get_lexemes(lemma=form, pos=c_pos,
                                           lemid=c_lid)[0]

            if entry['parent']:
                p_oid, p_form, p_pos = entry['parent'][0].split('_')
                p_pos = parse_pos[p_pos]
                p_lid = p_form + '#' + p_pos + '#' + p_oid

                par_node = lexicon.get_lexemes(lemma=p_form,
                                               pos=p_pos,
                                               lemid=p_lid)[0]
                lexicon.add_derivation(source=par_node, target=chi_node)

            # features: 'hierarchical#flat#morpheme-order' segmentation
            orig = entry['orig'].split('#')
            if len(orig) > 0 and orig != ['']:
                chi_node.misc['segmentation_hierarch'] = orig[0]
            if len(orig) > 1:
                chi_node.misc['segmentation'] = orig[1]
            if len(orig) > 2:
                chi_node.misc['morpheme_order'] = orig[2]

            # compounds
            if entry['compounding']:
                # parent 1
                p1_oid, p1_form, p1_pos = entry['compounding'][0][0].split('_')
                p1_pos = parse_pos[p1_pos]
                p1_lid = '#'.join([p1_oid, p1_form, p1_pos])
                p1_node = lexicon.get_lexemes(lemma=p1_form,
                                              pos=p1_pos,
                                              lemid=p1_lid)
                if len(p1_node) == 0:
                    lexicon.create_lexeme(lemma=p1_form,
                                          pos=p1_pos,
                                          lemid=p1_lid)
                    # features for the freshly created parent 1
                    p1_node = lexicon.get_lexemes(lemma=p1_form,
                                                  pos=p1_pos,
                                                  lemid=p1_lid)[0]
                    orig = entry['compounding'][0][1].split('#')
                    if len(orig) > 0 and orig != ['']:
                        p1_node.misc['segmentation_hierarch'] = orig[0]
                    if len(orig) > 1:
                        p1_node.misc['segmentation'] = orig[1]
                    if len(orig) > 2:
                        p1_node.misc['morpheme_order'] = orig[2]

                p1_node = lexicon.get_lexemes(lemma=p1_form,
                                              pos=p1_pos,
                                              lemid=p1_lid)[0]

                # parent 2
                p2_oid, p2_form, p2_pos = entry['compounding'][1][0].split('_')
                p2_pos = parse_pos[p2_pos]
                p2_lid = '#'.join([p2_oid, p2_form, p2_pos])
                p2_node = lexicon.get_lexemes(lemma=p2_form,
                                              pos=p2_pos,
                                              lemid=p2_lid)
                if len(p2_node) == 0:
                    lexicon.create_lexeme(lemma=p2_form,
                                          pos=p2_pos,
                                          lemid=p2_lid)
                    # features for the freshly created parent 2
                    # (fixed: original fetched p1's node here)
                    p2_node = lexicon.get_lexemes(lemma=p2_form,
                                                  pos=p2_pos,
                                                  lemid=p2_lid)[0]
                    orig = entry['compounding'][1][1].split('#')
                    if len(orig) > 0 and orig != ['']:
                        p2_node.misc['segmentation_hierarch'] = orig[0]
                    if len(orig) > 1:
                        p2_node.misc['segmentation'] = orig[1]
                    if len(orig) > 2:
                        p2_node.misc['morpheme_order'] = orig[2]

                p2_node = lexicon.get_lexemes(lemma=p2_form,
                                              pos=p2_pos,
                                              lemid=p2_lid)[0]

                # skip degenerate compounds (same parent twice, missing)
                if p1_node == p2_node or not p1_node or not p2_node:
                    continue
                lexicon.add_composition([p1_node, p2_node], p1_node, chi_node)

        return lexicon
コード例 #8
0
    def process(self, lexicon: Lexicon):
        """Build Démonette to DeriNet format.

        Creates lexemes with morphological features and suffix
        segmentation, adds main relations with semantic labels,
        alternative parents, split-family references, and finally
        creates any paradigm lexemes not seen yet. Returns the lexicon.
        """
        # load harmonized data; `with` guarantees the file handle is
        # closed (pickle.load(open(...)) leaked it)
        with open(self.fname, 'rb') as f:
            harm = pickle.load(f)
        # maps source morphological tag -> (universal POS, features...)
        parse_pos = {'Vmn----': ('VERB', None),
                     'Ncms': ('NOUN', 'Gender:Masc', 'Number:Sing'),
                     'Ncmp': ('NOUN', 'Gender:Masc', 'Number:Plur'),
                     'Ncfs': ('NOUN', 'Gender:Fem', 'Number:Sing'),
                     'Ncfp': ('NOUN', 'Gender:Fem', 'Number:Plur'),
                     'Afpms': ('ADJ', 'Gender:Masc', 'Number:Sing',
                               'AdjType:Qualif', 'Degree:Pos')}

        # add lemmas, morphological features and segmentation
        for entry in harm:
            feat = {}
            if parse_pos[entry['pos']][1] is not None:
                for f in parse_pos[entry['pos']][1:]:
                    key, value = f.split(':')
                    feat[key] = value

            # lemid: 'lemma#POS', nouns also carry '#Gender'
            lid = entry['form'] + '#' + parse_pos[entry['pos']][0]
            if parse_pos[entry['pos']][0] == 'NOUN':
                lid += '#' + parse_pos[entry['pos']][1].replace('Gender:', '')

            lexicon.create_lexeme(lemma=entry['form'],
                                  pos=parse_pos[entry['pos']][0],
                                  feats=feat,
                                  lemid=lid)

            # suffix segmentation: '#'-separated 'pre|afix|post' triples
            if entry['seg'] != {''}:
                c_pos = parse_pos[entry['pos']]
                c_lid = entry['form'] + '#' + parse_pos[entry['pos']][0]
                if c_pos[0] == 'NOUN':
                    c_lid += '#' + c_pos[1].replace('Gender:', '')
                chi_node = lexicon.get_lexemes(lemma=entry['form'],
                                               pos=c_pos[0],
                                               lemid=c_lid)[0]

                segmentations = tuple(entry['seg'])[0].split('#')
                seg = list()
                for s in segmentations:
                    _, afix, _ = s.split('|')
                    if afix not in seg:
                        seg.append(afix)
                chi_node.misc['suffix'] = '|'.join(seg)

        # add main relations and semantic labels,
        # add other derivational relations and semantic labels,
        # add references to split families
        paradigm_lexemes = set()
        for entry in harm:
            c_pos = parse_pos[entry['pos']]
            c_lid = entry['form'] + '#' + parse_pos[entry['pos']][0]
            if c_pos[0] == 'NOUN':
                c_lid += '#' + c_pos[1].replace('Gender:', '')

            chi_node = lexicon.get_lexemes(lemma=entry['form'], pos=c_pos[0],
                                           lemid=c_lid)[0]

            if entry['parent']:
                p_form, p_pos = entry['parent'][0][0].split('_')
                p_pos = parse_pos[p_pos]
                p_lid = p_form + '#' + p_pos[0]
                if p_pos[0] == 'NOUN':
                    p_lid += '#' + p_pos[1].replace('Gender:', '')

                par_node = lexicon.get_lexemes(lemma=p_form, pos=p_pos[0],
                                               lemid=p_lid)[0]

                lexicon.add_derivation(source=par_node, target=chi_node)

                # second '#' field carries the semantic label;
                # '|'-separated alternatives are joined with '+'
                label = entry['parent'][1].replace('@', '').split('#')[1]
                label = label.replace('|', '+')
                chi_node.parent_relation.feats['SemanticLabel'] = label

            # alternative parents go to misc, skipping the main parent
            if entry['others']:  # TODO: change place to 9th colummn
                parents = list()
                for other in entry['others']:
                    p_form, p_pos = other[0][0].split('_')
                    p_pos = parse_pos[p_pos]
                    p_lid = p_form + '#' + p_pos[0]
                    if p_pos[0] == 'NOUN':
                        p_lid += '#' + p_pos[1].replace('Gender:', '')

                    par_node = lexicon.get_lexemes(lemma=p_form, pos=p_pos[0],
                                                   lemid=p_lid)[0]

                    label = other[1].replace('@', '').split('#')[1]
                    label = label.replace('|', '+')
                    rl_par = chi_node.parent_relation
                    # equivalent to the original two-branch test
                    if not rl_par \
                       or par_node.lemid != rl_par.sources[0].lemid:
                        p = par_node.lemid + '&SemanticLabel=' + label
                        p += '&Type=Derivation'
                        parents.append(p)

                if parents:
                    chi_node.misc['other_parents'] = '|'.join(parents)

            # lexemes this entry used to share a family with
            if entry['ref_roots']:
                roots = list()
                for ref in entry['ref_roots']:
                    p_form, p_pos = ref.split('_')
                    p_pos = parse_pos[p_pos]
                    p_lid = p_form + '#' + p_pos[0]
                    if p_pos[0] == 'NOUN':
                        p_lid += '#' + p_pos[1].replace('Gender:', '')

                    par_node = lexicon.get_lexemes(lemma=p_form, pos=p_pos[0],
                                                   lemid=p_lid)[0]

                    if par_node.lemid != chi_node.lemid:
                        roots.append(par_node.lemid)

                if roots:
                    chi_node.misc['was_in_family_with'] = '&'.join(roots)

            # collect paradigm members for the pass below
            # (was a side-effect list comprehension in the original)
            if entry['inparadigm'] != {''}:
                paradigm_lexemes.update(entry['inparadigm'])

        # add the rest of lexemes related without any direction
        for lemma in paradigm_lexemes:
            if not lemma:
                continue

            lemma, pos = lemma.split('_')

            feat = {}
            if parse_pos[pos][1] is not None:
                for f in parse_pos[pos][1:]:
                    key, value = f.split(':')
                    feat[key] = value

            lid = lemma + '#' + parse_pos[pos][0]
            if parse_pos[pos][0] == 'NOUN':
                lid += '#' + parse_pos[pos][1].replace('Gender:', '')

            lexemes = lexicon.get_lexemes(lemma=lemma, pos=parse_pos[pos][0])
            if len(lexemes) == 0:
                lexicon.create_lexeme(lemma=lemma,
                                      pos=parse_pos[pos][0],
                                      feats=feat,
                                      lemid=lid)

        return lexicon
コード例 #9
0
import sys
import argparse
from collections import defaultdict

# NOTE(review): `os` is used on the next line but not imported in this
# visible chunk -- confirm `import os` appears earlier in the file,
# otherwise this raises NameError at startup.
sys.path.append(os.path.realpath('../../../../tools/data-api/derinet2/'))
from derinet import Lexicon

# set argparse
parser = argparse.ArgumentParser()
parser.add_argument('--DeriNet', action='store', dest='csder', required=True)
parser.add_argument('--Cognates1', action='store', dest='cog1', required=True)
parser.add_argument('--Cognates2', action='store', dest='cog2', required=True)
par = parser.parse_args()

# load derinet
cs_derinet = Lexicon()
cs_derinet.load(par.csder)

# load list of cognates; defaultdict(bool) makes unknown lemmas look up
# as False without a membership test
cognates = defaultdict(bool)
for path in (par.cog1, par.cog2):
    with open(path, mode='r', encoding='U8') as f:
        for line in f:
            cognates[line.strip()] = True

# find families affected by cognates
# (loop body continues beyond this visible chunk)
affected_families = set()
for cognate in list(cognates):
    lexemes = cs_derinet.get_lexemes(lemma=cognate)
    if len(lexemes) == 0:
        continue
Code example #10
0
File: filter_marks.py  Project: vidraj/derinet
# NOTE(review): this fragment starts mid-script -- 'parser' (an
# argparse.ArgumentParser), 'OrderedDict' and 'Lexicon' are set up in
# the part of the original script that is not shown here.
parser.add_argument('--DeriNet', action='store', dest='csder', required=True)
parser.add_argument('--Loanwords', action='store', dest='loan', required=True)
parser.add_argument('--Output', action='store', dest='output', required=True)
par = parser.parse_args()


# load loanword marks from a TSV (lemma, tag, mark);
# bool(mark.replace('False', '')) maps 'False' -> '' -> False and any
# other non-empty mark to True
loanwords = OrderedDict()
with open(par.loan, mode='r', encoding='U8') as f:
    for line in f:
        lemma, tag, mark = line.rstrip('\n').split('\t')
        loanwords['_'.join([lemma, tag])] = bool(mark.replace('False', ''))


# load derinet
cs_derinet = Lexicon()
cs_derinet.load(par.csder)


# correct loanword marks
# BUG FIX: iterate over a snapshot of the items -- the subtree walk
# below may insert NEW keys into 'loanwords' (subtree members absent
# from the input file), which would otherwise raise
# "RuntimeError: dictionary changed size during iteration".
for lexeme, mark in list(loanwords.items()):
    # find the lexeme in derinet
    # NOTE(review): [0] assumes every listed lemma_tag pair exists in
    # the database -- raises IndexError otherwise.
    node = cs_derinet.get_lexemes(lemma=lexeme.split('_')[0],
                                  pos=lexeme.split('_')[1])[0]

    # proper nouns and their whole subtrees are marked False
    if node.lemma[0].isupper():
        loanwords['_'.join([node.lemma, node.pos])] = False
        for node_child in node.iter_subtree():
            loanwords['_'.join([node_child.lemma, node_child.pos])] = False
        continue
    # NOTE(review): loop body is truncated in this excerpt.
Code example #11
0
import sys
import os
import argparse
from collections import defaultdict


# Make the DeriNet 2 API importable from the tools directory.
# BUG FIX: 'sys' and 'os' are used below but were never imported in
# this fragment.
sys.path.append(os.path.realpath('../../../../tools/data-api/derinet2/'))
from derinet import Lexicon


# set up the command-line interface
parser = argparse.ArgumentParser()
parser.add_argument('--DeriNet', action='store', dest='csder', required=True)
par = parser.parse_args()


# load the Czech DeriNet database
cs_derinet = Lexicon()
cs_derinet.load(par.csder)

# find derivational families containing lexemes marked as Foreign
# NOTE(review): defaultdict() without a factory behaves like a plain
# dict here; the code below only assigns, never reads missing keys.
foreign_lexemes = defaultdict()
affected_families = set()
for lexeme in cs_derinet.iter_lexemes():
    if lexeme.feats.get('Foreign', False):
        foreign_lexemes[lexeme.lemma] = True
        affected_families.add(lexeme.get_tree_root())


# walk whole affected families and inspect their Loanword marks
for root in affected_families:
    for lexeme in root.iter_subtree():
        lexeme_mark = lexeme.feats.get('Loanword', False)
        # NOTE(review): loop body is truncated in this excerpt;
        # 'lexeme_mark' is presumably used in the missing part.
Code example #12
0
    def process(self, lexicon: Lexicon):
        """Build DerivBaseRU to DeriNet format.

        Creates one lexeme per harmonized entry, links each lexeme to
        its main derivational parent (annotated with rule ids and
        word-formation processes), stores alternative parents in the
        misc column, and cross-references roots of families that were
        split during harmonization.
        """
        # Load the pickled harmonized data.
        # BUG FIX: the original used pickle.load(open(...)) and never
        # closed the file handle; 'with' guarantees closure.
        with open(self.fname, 'rb') as pickled:
            harm = pickle.load(pickled)

        # DerivBaseRU part-of-speech codes -> universal POS tags.
        parse_pos = {
            'V': 'VERB',
            'N': 'NOUN',
            'D': 'ADV',
            'A': 'ADJ',
            'C': 'NUM'
        }

        def find_node(form, pos_code):
            """Return the (unique) lexeme for a form + DerivBaseRU POS
            code, using the FORM#POS lemid convention."""
            pos = parse_pos[pos_code]
            return lexicon.get_lexemes(lemma=form,
                                       pos=pos,
                                       lemid=form + '#' + pos)[0]

        def parse_annotation(annot):
            """Parse a '#'-separated 'ruleN&proc1,proc2' annotation and
            return (comma-joined rule ids, comma-joined unique
            processes)."""
            rules, procs = list(), list()
            for item in annot.split('#'):
                rule_part, proc_part = item.split('&')
                rules.append(re.search(r'rule([0-9]*)', rule_part).group(1))
                procs += proc_part.split(',')
            return ','.join(rules), ','.join(set(procs))

        # add lemmas (lemid = FORM#POS)
        for entry in harm:
            lid = entry['form'] + '#' + parse_pos[entry['pos']]
            lexicon.create_lexeme(lemma=entry['form'],
                                  pos=parse_pos[entry['pos']],
                                  lemid=lid)

        # add main relations and rules,
        # add other derivational relations and rules,
        # add references to splitted families
        for entry in harm:
            chi_node = find_node(entry['form'], entry['pos'])

            # main parent: create the derivation and annotate it
            if entry['parent']:
                p_form, p_pos = entry['parent'][0][0].split('_')
                par_node = find_node(p_form, p_pos)

                lexicon.add_derivation(source=par_node, target=chi_node)

                rules, procs = parse_annotation(entry['parent'][1])
                chi_node.parent_relation.feats['Rule'] = rules
                chi_node.parent_relation.feats['Process'] = procs

            # alternative parents, kept in misc (skipping the one that
            # is already the main parent)
            if entry['others']:  # TODO: change place to 9th colummn
                parents = list()
                for other in entry['others']:
                    p_form, p_pos = other[0][0].split('_')
                    par_node = find_node(p_form, p_pos)
                    rules, procs = parse_annotation(other[1])

                    rl_par = chi_node.parent_relation
                    if (rl_par and par_node.lemid != rl_par.sources[0].lemid) \
                       or not rl_par:
                        p = par_node.lemid + '&Rule=' + rules
                        p += '&Process=' + procs + '&Type=Derivation'
                        parents.append(p)

                if parents:
                    chi_node.misc['other_parents'] = ','.join(parents)

            # cross-references to roots of split-off families
            if entry['ref_roots']:
                roots = list()
                for ref in entry['ref_roots']:
                    p_form, p_pos = ref.split('_')
                    par_node = find_node(p_form, p_pos)

                    if par_node.lemid != chi_node.lemid:
                        roots.append(par_node.lemid)

                if roots:
                    chi_node.misc['was_in_family_with'] = '&'.join(roots)

        return lexicon