Code example #1
0
    def process(self, lexicon: Lexicon):
        """Build Latin WFL to DeriNet format.

        Loads the pickled, harmonized WFL data from ``self.fname`` and
        populates ``lexicon`` in two passes:
        1. create lexemes with morphological features and stable lemids;
        2. add derivation/conversion relations, secondary-parent references,
           split-family references, and compounding.

        Returns the (mutated) ``lexicon``.
        """
        def parse_lemmas(l_lem, l_pos):
            """Parse a WFL POS string into (UD pos, feature dict, lemid).

            ``l_pos`` is expected to contain four underscore-separated
            fields: POS tag, gender, an unused field, and a word id.
            """
            parse_pos = {
                'A': 'ADJ',
                'N': 'NOUN',
                'V': 'VERB',
                'I': 'X',
                'P': 'PRON',
                'U': 'AUX'
            }
            gend_parse = {'m': 'Masc', 'f': 'Fem', 'n': 'Neut'}
            pos, gend, _, wid = l_pos.split('_')

            feat = {}
            if pos[0] == 'N':
                if gend in ('m', 'f', 'n'):
                    feat['Gender'] = gend_parse[gend]
                if len(pos) > 1:
                    # second character of the noun tag encodes declension
                    feat['Declension'] = pos[1]
            elif pos[0] == 'A' and len(pos) > 1:
                feat['AdjClass'] = pos[1]
            elif pos[0] == 'V':
                if len(pos) <= 1:
                    pass
                elif pos[1] in ('1', '2', '3', '4', '5'):
                    feat['Conjugation'] = pos[1]
                elif pos[1] == 'A':
                    # 'VA' is remapped to 'U' so it resolves to AUX below
                    pos = 'U'

            lid = l_lem + '#' + parse_pos[pos[0]] + '#' + wid
            return parse_pos[pos[0]], feat, lid

        # load data; use a context manager so the file handle is closed
        # (the original `pickle.load(open(...))` leaked the handle)
        with open(self.fname, 'rb') as pickled:
            harm = pickle.load(pickled)

        # add lemmas, morphological features and segmentation
        for entry in harm:
            pos, feat, lid = parse_lemmas(entry['form'], entry['pos'])

            # check presence in the lexicon (due to compounds)
            present = lexicon.get_lexemes(lemma=entry['form'],
                                          pos=pos,
                                          lemid=lid)

            if len(present) == 0:
                lexicon.create_lexeme(lemma=entry['form'],
                                      pos=pos,
                                      feats=feat,
                                      lemid=lid)

        # add main relations and used afix,
        # add other derivational relations and used afix,
        # add references to splitted families,
        # add compounding
        for entry in harm:
            c_pos, _, c_lid = parse_lemmas(entry['form'], entry['pos'])
            chi_node = lexicon.get_lexemes(lemma=entry['form'],
                                           pos=c_pos,
                                           lemid=c_lid)[0]

            if entry['parent']:
                parse = entry['parent'][0][0].split('_')
                p_form = parse[0]
                p_pos, _, p_lid = parse_lemmas(p_form, '_'.join(parse[1:]))

                par_node = lexicon.get_lexemes(lemma=p_form,
                                               pos=p_pos,
                                               lemid=p_lid)[0]

                afix = entry['parent'][1][3]
                typ = entry['parent'][1][2].replace('Derivation_', '')
                if typ in ('Prefix', 'Suffix'):
                    lexicon.add_derivation(source=par_node, target=chi_node)
                    # record the affix used on the relation just created
                    chi_node.parent_relation.feats[typ] = afix
                elif typ == 'Conversion':
                    lexicon.add_conversion(source=par_node, target=chi_node)

            if entry['others']:  # TODO: change place to 9th colummn;conversion
                parents = list()
                for other in entry['others']:
                    parse = other[0][0].split('_')
                    p_form = parse[0]
                    p_pos, _, p_lid = parse_lemmas(p_form, '_'.join(parse[1:]))

                    afix = other[1][3]
                    typ = other[1][2].replace('Derivation_', '')

                    par_node = lexicon.get_lexemes(lemma=p_form,
                                                   pos=p_pos,
                                                   lemid=p_lid)[0]

                    # only record secondary parents that differ from the
                    # main parent relation (if any)
                    rl_par = chi_node.parent_relation
                    if (rl_par and par_node.lemid != rl_par.sources[0].lemid) \
                       or not rl_par:
                        if typ in ('Prefix', 'Suffix'):
                            p = par_node.lemid + '&' + typ + '=' + afix
                            p += '&Type=Derivation'
                            parents.append(p)
                        else:
                            parents.append(par_node.lemid + '&Type=' + typ)

                if parents:
                    chi_node.misc['other_parents'] = '|'.join(parents)

            if entry['ref_roots']:
                # references to lexemes this one used to share a family with
                roots = list()
                for ref in entry['ref_roots']:
                    parse = ref.split('_')
                    p_form = parse[0]
                    p_pos, _, p_lid = parse_lemmas(p_form, '_'.join(parse[1:]))

                    par_node = lexicon.get_lexemes(lemma=p_form,
                                                   pos=p_pos,
                                                   lemid=p_lid)[0]

                    if par_node.lemid != chi_node.lemid:
                        roots.append(par_node.lemid)

                if roots:
                    chi_node.misc['was_in_family_with'] = '&'.join(roots)

            if entry['compounding']:
                # first compound parent: create it on demand
                p1_parse = entry['compounding'][0][0].split('_')
                p1_form = p1_parse[0]
                p1_attr = '_'.join(p1_parse[1:])
                p1_pos, p1_feat, p1_lid = parse_lemmas(p1_form, p1_attr)

                p1_node = lexicon.get_lexemes(lemma=p1_form,
                                              pos=p1_pos,
                                              lemid=p1_lid)
                if len(p1_node) == 0:
                    lexicon.create_lexeme(lemma=p1_form,
                                          pos=p1_pos,
                                          feats=p1_feat,
                                          lemid=p1_lid)

                p1_node = lexicon.get_lexemes(lemma=p1_form,
                                              pos=p1_pos,
                                              lemid=p1_lid)[0]

                # second compound parent: create it on demand
                p2_parse = entry['compounding'][1][0].split('_')
                p2_form = p2_parse[0]
                p2_attr = '_'.join(p2_parse[1:])
                p2_pos, p2_feat, p2_lid = parse_lemmas(p2_form, p2_attr)
                p2_node = lexicon.get_lexemes(lemma=p2_form,
                                              pos=p2_pos,
                                              lemid=p2_lid)
                if len(p2_node) == 0:
                    lexicon.create_lexeme(lemma=p2_form,
                                          pos=p2_pos,
                                          feats=p2_feat,
                                          lemid=p2_lid)

                p2_node = lexicon.get_lexemes(lemma=p2_form,
                                              pos=p2_pos,
                                              lemid=p2_lid)[0]

                # skip degenerate compounds (identical or missing parents)
                if p1_node == p2_node or not p1_node or not p2_node:
                    continue
                lexicon.add_composition([p1_node, p2_node], p1_node, chi_node)

        return lexicon
Code example #2
0
    def process(self, lexicon: Lexicon):
        """Build GCelex to DeriNet format.

        Loads the pickled, harmonized GCelex data from ``self.fname`` and
        populates ``lexicon``: first all lexemes, then derivational
        relations, original segmentation features, and compounds.

        Returns the (mutated) ``lexicon``.

        Bug fixed vs. the previous version: when the second compound parent
        had to be created, its segmentation features were written onto the
        FIRST parent's node (p1 was looked up instead of p2).
        """
        # load data; use a context manager so the file handle is closed
        # (the original `pickle.load(open(...))` leaked the handle)
        with open(self.fname, 'rb') as pickled:
            harm = pickle.load(pickled)

        parse_pos = {
            'N': 'NOUN',
            'V': 'VERB',
            'A': 'ADJ',
            'D': 'ADV',
            'X': 'X',
            'C': 'NUM',
            'P': 'ADP'
        }

        def store_orig_feats(node, orig_field):
            """Copy segmentation features from the '#'-separated orig field."""
            orig = orig_field.split('#')
            if len(orig) > 0 and orig != ['']:
                node.misc['segmentation_hierarch'] = orig[0]
            if len(orig) > 1:
                node.misc['segmentation'] = orig[1]
            if len(orig) > 2:
                node.misc['morpheme_order'] = orig[2]

        def compound_parent(parent_entry):
            """Return the lexeme for one compound parent, creating it
            (with its segmentation features) if it does not exist yet.

            ``parent_entry`` is a pair: ('oid_form_pos', orig-features).
            """
            oid, form, pos = parent_entry[0].split('_')
            pos = parse_pos[pos]
            lid = '#'.join([oid, form, pos])
            nodes = lexicon.get_lexemes(lemma=form, pos=pos, lemid=lid)
            if len(nodes) == 0:
                lexicon.create_lexeme(lemma=form, pos=pos, lemid=lid)
                node = lexicon.get_lexemes(lemma=form, pos=pos,
                                           lemid=lid)[0]
                # features belong to the node just created — NOT to the
                # other parent (this was the original copy-paste bug)
                store_orig_feats(node, parent_entry[1])
            return lexicon.get_lexemes(lemma=form, pos=pos, lemid=lid)[0]

        # add lemmas and morphological features
        for entry in harm:
            oid, form = entry['form'].split('_')
            lid = form + '#' + parse_pos[entry['pos']] + '#' + oid
            lexicon.create_lexeme(lemma=form,
                                  pos=parse_pos[entry['pos']],
                                  lemid=lid)

        # add main relations,
        # add original features,
        # add compounds
        for entry in harm:
            c_pos = parse_pos[entry['pos']]
            oid, form = entry['form'].split('_')
            c_lid = form + '#' + c_pos + '#' + oid
            chi_node = lexicon.get_lexemes(lemma=form, pos=c_pos,
                                           lemid=c_lid)[0]

            if entry['parent']:
                p_oid, p_form, p_pos = entry['parent'][0].split('_')
                p_pos = parse_pos[p_pos]
                p_lid = p_form + '#' + p_pos + '#' + p_oid

                par_node = lexicon.get_lexemes(lemma=p_form,
                                               pos=p_pos,
                                               lemid=p_lid)[0]
                lexicon.add_derivation(source=par_node, target=chi_node)

            # features from the original resource
            store_orig_feats(chi_node, entry['orig'])

            # compounds: resolve both parents, then link
            if entry['compounding']:
                p1_node = compound_parent(entry['compounding'][0])
                p2_node = compound_parent(entry['compounding'][1])

                # skip degenerate compounds (identical or missing parents)
                if p1_node == p2_node or not p1_node or not p2_node:
                    continue
                lexicon.add_composition([p1_node, p2_node], p1_node, chi_node)

        return lexicon