def process(self, lexicon: Lexicon):
    """Build Princeton WordNet to DeriNet format.

    Loads the harmonised pickle from ``self.fname``, creates one lexeme
    per entry, then adds the main derivational relation (with a semantic
    label), other candidate parents (stored in ``misc``) and references
    to families that were split during harmonisation.

    Returns the populated ``lexicon``.
    """
    # load data; use a context manager so the file handle is closed
    # deterministically (the original leaked the anonymous file object)
    with open(self.fname, 'rb') as f:
        harm = pickle.load(f)
    parse_pos = {'V': 'VERB', 'N': 'NOUN'}

    # add lemmas and morphological features
    for entry in harm:
        lexicon.create_lexeme(lemma=entry['form'],
                              pos=parse_pos[entry['pos']])

    # add main derivational relations and semantic labels,
    # add other derivational relations and semantic labels,
    # add references to splitted families
    for entry in harm:
        c_pos = parse_pos[entry['pos']]
        chi_node = lexicon.get_lexemes(lemma=entry['form'], pos=c_pos)[0]

        if entry['parent']:
            # parent id looks like 'form_P' — TODO confirm against extractor
            p_form, p_pos = entry['parent'][0][0].split('_')
            p_pos = parse_pos[p_pos]
            par_node = lexicon.get_lexemes(lemma=p_form, pos=p_pos)[0]
            lexicon.add_derivation(source=par_node, target=chi_node)
            label = entry['parent'][1].capitalize()
            chi_node.parent_relation.feats['SemanticLabel'] = label

        if entry['others']:  # TODO: change place to 9th colummn
            parents = list()
            for other in entry['others']:
                p_form, p_pos = other[0][0].split('_')
                label = other[1].capitalize()
                p_pos = parse_pos[p_pos]
                par_node = lexicon.get_lexemes(lemma=p_form, pos=p_pos)[0]
                # record only parents that differ from the main parent
                rl_par = chi_node.parent_relation
                if (rl_par and par_node.lemid != rl_par.sources[0].lemid) \
                        or not rl_par:
                    p = par_node.lemid + '&SemanticLabel=' + label
                    p += '&Type=Derivation'
                    parents.append(p)
            if parents:
                chi_node.misc['other_parents'] = '|'.join(parents)

        if entry['ref_roots']:
            roots = list()
            for ref in entry['ref_roots']:
                p_form, p_pos = ref.split('_')
                p_pos = parse_pos[p_pos]
                par_node = lexicon.get_lexemes(lemma=p_form, pos=p_pos)[0]
                if par_node.lemid != chi_node.lemid:
                    roots.append(par_node.lemid)
            if roots:
                chi_node.misc['was_in_family_with'] = '&'.join(roots)

    return lexicon
def process(self, lexicon: Lexicon):
    """Build DerivBaseHR to DeriNet format.

    Loads the harmonised pickle from ``self.fname``, creates lexemes
    (lemid = ``form#POS``), adds the main derivational relations and
    references to families split during harmonisation.

    Returns the populated ``lexicon``.
    """
    # load data; close the handle deterministically instead of leaking
    # the anonymous file object
    with open(self.fname, 'rb') as f:
        harm = pickle.load(f)
    parse_pos = {'V': 'VERB', 'N': 'NOUN', 'A': 'ADJ'}

    # add lemmas and morphological features
    for entry in harm:
        lid = entry['form'] + '#' + parse_pos[entry['pos']]
        lexicon.create_lexeme(lemma=entry['form'],
                              pos=parse_pos[entry['pos']], lemid=lid)

    # add main relations and rules,
    # add other derivational relations and rules,
    # add references to splitted families
    for entry in harm:
        c_pos = parse_pos[entry['pos']]
        c_lid = entry['form'] + '#' + c_pos
        chi_node = lexicon.get_lexemes(lemma=entry['form'], pos=c_pos,
                                       lemid=c_lid)[0]

        if entry['parent']:
            p_form, p_pos = entry['parent'][0].split('_')
            p_pos = parse_pos[p_pos]
            p_lid = p_form + '#' + p_pos
            par_node = lexicon.get_lexemes(lemma=p_form, pos=p_pos,
                                           lemid=p_lid)[0]
            lexicon.add_derivation(source=par_node, target=chi_node)

        if entry['ref_roots']:
            roots = list()
            for ref in entry['ref_roots']:
                p_form, p_pos = ref.split('_')
                p_pos = parse_pos[p_pos]
                p_lid = p_form + '#' + p_pos
                par_node = lexicon.get_lexemes(lemma=p_form, pos=p_pos,
                                               lemid=p_lid)[0]
                if par_node.lemid != chi_node.lemid:
                    roots.append(par_node.lemid)
            if roots:
                chi_node.misc['was_in_family_with'] = '&'.join(roots)

    return lexicon
def process(self, lexicon: Lexicon):
    """Build Serbo-Croatian Etymological WordNet to DeriNet format.

    Entries carry no part-of-speech information, so lexemes are created
    with an empty ``pos`` and looked up by lemma only.

    Returns the populated ``lexicon``.
    """
    # load data; context manager closes the previously leaked file handle
    with open(self.fname, 'rb') as f:
        harm = pickle.load(f)

    # add lemmas and morphological features
    for entry in harm:
        lexicon.create_lexeme(lemma=entry['form'], pos='')

    # add main derivational relations,
    # add other derivational relations,
    # add references to splitted families
    for entry in harm:
        chi_node = lexicon.get_lexemes(lemma=entry['form'])[0]

        if entry['parent']:
            par_node = lexicon.get_lexemes(lemma=entry['parent'][0])[0]
            lexicon.add_derivation(source=par_node, target=chi_node)

        if entry['others']:  # TODO: change place to 9th colummn
            parents = list()
            for other in entry['others']:
                par_node = lexicon.get_lexemes(lemma=other[0])[0]
                # keep only candidates that differ from the main parent
                rl_par = chi_node.parent_relation
                if (rl_par and par_node.lemid != rl_par.sources[0].lemid) \
                        or not rl_par:
                    parents.append(par_node.lemid + '&Type=Derivation')
            if parents:
                chi_node.misc['other_parents'] = '|'.join(parents)

        if entry['ref_roots']:
            roots = list()
            for ref in entry['ref_roots']:
                par_node = lexicon.get_lexemes(lemma=ref)[0]
                if par_node.lemid != chi_node.lemid:
                    roots.append(par_node.lemid)
            if roots:
                chi_node.misc['was_in_family_with'] = '&'.join(roots)

    return lexicon
def process(self, lexicon: Lexicon):
    """Build DerIvaTario to DeriNet format.

    Lemmas are lower-cased on creation and lookup; lemid = ``form#POS``.
    Besides the main relations, the original resource's id and
    segmentation are preserved in ``misc``.

    Returns the populated ``lexicon``.
    """
    # load data; close the handle deterministically instead of leaking it
    with open(self.fname, 'rb') as f:
        harm = pickle.load(f)
    parse_pos = {
        'V': 'VERB',
        'N': 'NOUN',
        'A': 'ADJ',
        'D': 'ADV',
        'E': 'X',
        'X': 'X'
    }

    # add lemmas and morphological features
    for entry in harm:
        lid = entry['form'].lower() + '#' + parse_pos[entry['pos']]
        lexicon.create_lexeme(lemma=entry['form'].lower(),
                              pos=parse_pos[entry['pos']], lemid=lid)

    # add main relations,
    # add original features
    for entry in harm:
        c_pos = parse_pos[entry['pos']]
        c_lid = entry['form'].lower() + '#' + c_pos
        chi_node = lexicon.get_lexemes(lemma=entry['form'].lower(),
                                       pos=c_pos, lemid=c_lid)[0]

        if entry['parent']:
            p_form, p_pos = entry['parent'][0].split('_')
            p_pos = parse_pos[p_pos]
            p_lid = p_form.lower() + '#' + p_pos
            par_node = lexicon.get_lexemes(lemma=p_form.lower(), pos=p_pos,
                                           lemid=p_lid)[0]
            lexicon.add_derivation(source=par_node, target=chi_node)

        # original features; assumes entry['orig'] is ';'-separated with a
        # numeric id first and segments from field 2 on — TODO confirm
        orig = entry['orig'].split(';')
        orig_id = int(orig[0])
        orig_sg = [i for i in orig[2:-1] if i]
        chi_node.misc['original_id'] = orig_id
        chi_node.misc['segmentation'] = orig_sg

    return lexicon
def process(self, lexicon: Lexicon):
    """Build Latin WFL to DeriNet format.

    Creates lexemes with morphological features decoded from the WFL
    tag, then adds derivations/conversions (with the affix used), other
    candidate parents, references to split families, and compounding.

    Returns the populated ``lexicon``.
    """

    def parse_lemmas(l_lem, l_pos):
        """Decode a WFL tag 'POS_gender_?_wid' into (upos, feats, lemid)."""
        parse_pos = {
            'A': 'ADJ',
            'N': 'NOUN',
            'V': 'VERB',
            'I': 'X',
            'P': 'PRON',
            'U': 'AUX'
        }
        gend_parse = {'m': 'Masc', 'f': 'Fem', 'n': 'Neut'}
        pos, gend, _, wid = l_pos.split('_')
        feat = {}
        if pos[0] == 'N':
            if gend in ('m', 'f', 'n'):
                feat['Gender'] = gend_parse[gend]
            if len(pos) > 1:
                feat['Declension'] = pos[1]
        elif pos[0] == 'A' and len(pos) > 1:
            feat['AdjClass'] = pos[1]
        elif pos[0] == 'V':
            if len(pos) <= 1:
                pass
            elif pos[1] in ('1', '2', '3', '4', '5'):
                feat['Conjugation'] = pos[1]
            elif pos[1] == 'A':
                # 'VA' marks auxiliary verbs; remap to the AUX tag
                pos = 'U'
        lid = l_lem + '#' + parse_pos[pos[0]] + '#' + wid
        return parse_pos[pos[0]], feat, lid

    # load data; close the handle deterministically instead of leaking it
    with open(self.fname, 'rb') as f:
        harm = pickle.load(f)

    # add lemmas, morphological features and segmentation
    for entry in harm:
        pos, feat, lid = parse_lemmas(entry['form'], entry['pos'])
        # check presence in the lexicon (due to compounds)
        present = lexicon.get_lexemes(lemma=entry['form'], pos=pos, lemid=lid)
        if len(present) == 0:
            lexicon.create_lexeme(lemma=entry['form'], pos=pos,
                                  feats=feat, lemid=lid)

    # add main relations and used afix,
    # add other derivational relations and used afix,
    # add references to splitted families,
    # add compounding
    for entry in harm:
        c_pos, _, c_lid = parse_lemmas(entry['form'], entry['pos'])
        chi_node = lexicon.get_lexemes(lemma=entry['form'], pos=c_pos,
                                       lemid=c_lid)[0]

        if entry['parent']:
            parse = entry['parent'][0][0].split('_')
            p_form = parse[0]
            p_pos, _, p_lid = parse_lemmas(p_form, '_'.join(parse[1:]))
            par_node = lexicon.get_lexemes(lemma=p_form, pos=p_pos,
                                           lemid=p_lid)[0]
            afix = entry['parent'][1][3]
            typ = entry['parent'][1][2].replace('Derivation_', '')
            if typ in ('Prefix', 'Suffix'):
                lexicon.add_derivation(source=par_node, target=chi_node)
                chi_node.parent_relation.feats[typ] = afix
            elif typ == 'Conversion':
                lexicon.add_conversion(source=par_node, target=chi_node)

        if entry['others']:  # TODO: change place to 9th colummn;conversion
            parents = list()
            for other in entry['others']:
                parse = other[0][0].split('_')
                p_form = parse[0]
                p_pos, _, p_lid = parse_lemmas(p_form, '_'.join(parse[1:]))
                afix = other[1][3]
                typ = other[1][2].replace('Derivation_', '')
                par_node = lexicon.get_lexemes(lemma=p_form, pos=p_pos,
                                               lemid=p_lid)[0]
                # keep only candidates that differ from the main parent
                rl_par = chi_node.parent_relation
                if (rl_par and par_node.lemid != rl_par.sources[0].lemid) \
                        or not rl_par:
                    if typ in ('Prefix', 'Suffix'):
                        p = par_node.lemid + '&' + typ + '=' + afix
                        p += '&Type=Derivation'
                        parents.append(p)
                    else:
                        parents.append(par_node.lemid + '&Type=' + typ)
            if parents:
                chi_node.misc['other_parents'] = '|'.join(parents)

        if entry['ref_roots']:
            roots = list()
            for ref in entry['ref_roots']:
                parse = ref.split('_')
                p_form = parse[0]
                p_pos, _, p_lid = parse_lemmas(p_form, '_'.join(parse[1:]))
                par_node = lexicon.get_lexemes(lemma=p_form, pos=p_pos,
                                               lemid=p_lid)[0]
                if par_node.lemid != chi_node.lemid:
                    roots.append(par_node.lemid)
            if roots:
                chi_node.misc['was_in_family_with'] = '&'.join(roots)

        if entry['compounding']:
            # first compound parent: create it on demand
            p1_parse = entry['compounding'][0][0].split('_')
            p1_form = p1_parse[0]
            p1_attr = '_'.join(p1_parse[1:])
            p1_pos, p1_feat, p1_lid = parse_lemmas(p1_form, p1_attr)
            p1_node = lexicon.get_lexemes(lemma=p1_form, pos=p1_pos,
                                          lemid=p1_lid)
            if len(p1_node) == 0:
                lexicon.create_lexeme(lemma=p1_form, pos=p1_pos,
                                      feats=p1_feat, lemid=p1_lid)
            p1_node = lexicon.get_lexemes(lemma=p1_form, pos=p1_pos,
                                          lemid=p1_lid)[0]

            # second compound parent: create it on demand
            p2_parse = entry['compounding'][1][0].split('_')
            p2_form = p2_parse[0]
            p2_attr = '_'.join(p2_parse[1:])
            p2_pos, p2_feat, p2_lid = parse_lemmas(p2_form, p2_attr)
            p2_node = lexicon.get_lexemes(lemma=p2_form, pos=p2_pos,
                                          lemid=p2_lid)
            if len(p2_node) == 0:
                lexicon.create_lexeme(lemma=p2_form, pos=p2_pos,
                                      feats=p2_feat, lemid=p2_lid)
            p2_node = lexicon.get_lexemes(lemma=p2_form, pos=p2_pos,
                                          lemid=p2_lid)[0]

            # skip degenerate compounds (identical or missing parents)
            if p1_node == p2_node or not p1_node or not p2_node:
                continue
            lexicon.add_composition([p1_node, p2_node], p1_node, chi_node)

    return lexicon
def process(self, lexicon: Lexicon):
    """Build DerivBase to DeriNet format.

    The pos map yields ``(upos, feature-string…)`` tuples; noun genders
    become both a ``Gender`` feature and part of the lemid
    (``form#POS[#Gender]``). Adds main relations with the derivational
    rule, other candidate parents, and split-family references.

    Returns the populated ``lexicon``.
    """
    # load data; close the handle deterministically instead of leaking it
    # (named `fh` so the feature loop below can keep its original `f`)
    with open(self.fname, 'rb') as fh:
        harm = pickle.load(fh)
    parse_pos = {
        'V': ('VERB', None),
        'Nn': ('NOUN', 'Gender:Neut'),
        'Nf': ('NOUN', 'Gender:Fem'),
        'Nm': ('NOUN', 'Gender:Masc'),
        'N': ('NOUN', None),
        'A': ('ADJ', None)
    }

    # add lemmas and morphological features
    for entry in harm:
        feat = {}
        if parse_pos[entry['pos']][1] is not None:
            for f in parse_pos[entry['pos']][1:]:
                key, value = f.split(':')
                feat[key] = value
        lid = entry['form'] + '#' + parse_pos[entry['pos']][0]
        if parse_pos[entry['pos']][1]:
            lid += '#' + parse_pos[entry['pos']][1].replace('Gender:', '')
        lexicon.create_lexeme(lemma=entry['form'],
                              pos=parse_pos[entry['pos']][0],
                              feats=feat, lemid=lid)

    # add main relations and rules,
    # add other derivational relations and rules,
    # add references to splitted families
    for entry in harm:
        c_pos = parse_pos[entry['pos']]
        c_lid = entry['form'] + '#' + c_pos[0]
        if c_pos[1]:
            c_lid += '#' + c_pos[1].replace('Gender:', '')
        chi_node = lexicon.get_lexemes(lemma=entry['form'], pos=c_pos[0],
                                       lemid=c_lid)[0]

        if entry['parent']:
            p_form, p_pos = entry['parent'][0][0].split('_')
            p_pos = parse_pos[p_pos]
            p_lid = p_form + '#' + p_pos[0]
            if p_pos[1]:
                p_lid += '#' + p_pos[1].replace('Gender:', '')
            par_node = lexicon.get_lexemes(lemma=p_form, pos=p_pos[0],
                                           lemid=p_lid)[0]
            lexicon.add_derivation(source=par_node, target=chi_node)
            rule = entry['parent'][1].replace('>', '')
            chi_node.parent_relation.feats['Rule'] = rule

        if entry['others']:  # TODO: change place to 9th colummn
            parents = list()
            for other in entry['others']:
                p_form, p_pos = other[0][0].split('_')
                rule = other[1].replace('>', '')
                p_pos = parse_pos[p_pos]
                p_lid = p_form + '#' + p_pos[0]
                if p_pos[1]:
                    p_lid += '#' + p_pos[1].replace('Gender:', '')
                par_node = lexicon.get_lexemes(lemma=p_form, pos=p_pos[0],
                                               lemid=p_lid)[0]
                # keep only candidates that differ from the main parent
                rl_par = chi_node.parent_relation
                if (rl_par and par_node.lemid != rl_par.sources[0].lemid) \
                        or not rl_par:
                    p = par_node.lemid + '&Rule=' + rule
                    p += '&Type=Derivation'
                    parents.append(p)
            if parents:
                chi_node.misc['other_parents'] = '|'.join(parents)

        if entry['ref_roots']:
            roots = list()
            for ref in entry['ref_roots']:
                p_form, p_pos = ref.split('_')
                p_pos = parse_pos[p_pos]
                p_lid = p_form + '#' + p_pos[0]
                if p_pos[1]:
                    p_lid += '#' + p_pos[1].replace('Gender:', '')
                par_node = lexicon.get_lexemes(lemma=p_form, pos=p_pos[0],
                                               lemid=p_lid)[0]
                if par_node.lemid != chi_node.lemid:
                    roots.append(par_node.lemid)
            if roots:
                chi_node.misc['was_in_family_with'] = '&'.join(roots)

    return lexicon
def process(self, lexicon: Lexicon):
    """Build GCelex to DeriNet format.

    Entry forms look like ``oid_form``; lemid = ``form#POS#oid``. Adds
    main derivations, stores the original CELEX segmentation fields in
    ``misc``, and links compounds (creating missing compound parents).

    Fixes over the previous version:
    * the second compound parent's features were written onto the FIRST
      parent's lexeme (copy-paste ``p1_*`` in the ``p2`` lookup);
    * compound-parent lemids were built as ``oid#form#pos`` while the
      creation loop uses ``form#POS#oid``, so existing lexemes were never
      found and duplicates were created;
    * the pickle file handle was leaked.

    Returns the populated ``lexicon``.
    """

    def _store_orig_features(node, orig_field):
        """Store 'hierarch#flat#morphs' segmentation fields in node.misc."""
        orig = orig_field.split('#')
        if len(orig) > 0 and orig != ['']:
            node.misc['segmentation_hierarch'] = orig[0]
        if len(orig) > 1:
            node.misc['segmentation'] = orig[1]
        if len(orig) > 2:
            node.misc['morpheme_order'] = orig[2]

    def _get_or_create_parent(comp):
        """Return the lexeme for one compound parent, creating it if absent."""
        oid, form, pos = comp[0].split('_')
        pos = parse_pos[pos]
        # lemid format must match the creation loop: form#POS#oid
        lid = form + '#' + pos + '#' + oid
        found = lexicon.get_lexemes(lemma=form, pos=pos, lemid=lid)
        if len(found) == 0:
            lexicon.create_lexeme(lemma=form, pos=pos, lemid=lid)
        node = lexicon.get_lexemes(lemma=form, pos=pos, lemid=lid)[0]
        _store_orig_features(node, comp[1])
        return node

    # load data; close the handle deterministically instead of leaking it
    with open(self.fname, 'rb') as f:
        harm = pickle.load(f)
    parse_pos = {
        'N': 'NOUN',
        'V': 'VERB',
        'A': 'ADJ',
        'D': 'ADV',
        'X': 'X',
        'C': 'NUM',
        'P': 'ADP'
    }

    # add lemmas and morphological features
    for entry in harm:
        oid, form = entry['form'].split('_')
        lid = form + '#' + parse_pos[entry['pos']] + '#' + oid
        lexicon.create_lexeme(lemma=form, pos=parse_pos[entry['pos']],
                              lemid=lid)

    # add main relations,
    # add original features,
    # add compounds
    for entry in harm:
        c_pos = parse_pos[entry['pos']]
        oid, form = entry['form'].split('_')
        c_lid = form + '#' + c_pos + '#' + oid
        chi_node = lexicon.get_lexemes(lemma=form, pos=c_pos,
                                       lemid=c_lid)[0]

        if entry['parent']:
            p_oid, p_form, p_pos = entry['parent'][0].split('_')
            p_pos = parse_pos[p_pos]
            p_lid = p_form + '#' + p_pos + '#' + p_oid
            par_node = lexicon.get_lexemes(lemma=p_form, pos=p_pos,
                                           lemid=p_lid)[0]
            lexicon.add_derivation(source=par_node, target=chi_node)

        # features
        _store_orig_features(chi_node, entry['orig'])

        # compounds
        if entry['compounding']:
            p1_node = _get_or_create_parent(entry['compounding'][0])
            p2_node = _get_or_create_parent(entry['compounding'][1])
            # skip degenerate compounds (identical or missing parents)
            if p1_node == p2_node or not p1_node or not p2_node:
                continue
            lexicon.add_composition([p1_node, p2_node], p1_node, chi_node)

    return lexicon
def process(self, lexicon: Lexicon):
    """Build Démonette to DeriNet format.

    Creates lexemes with features decoded from the Démonette tag
    (lemid = ``form#POS[#Gender]`` for nouns), stores suffix
    segmentation in ``misc``, adds main and other derivational
    relations with semantic labels, split-family references, and
    finally creates any paradigm-related lexemes not yet present.

    Returns the populated ``lexicon``.
    """
    # load data; close the handle deterministically instead of leaking it
    # (named `fh` so the feature loops below can keep their original `f`)
    with open(self.fname, 'rb') as fh:
        harm = pickle.load(fh)
    parse_pos = {'Vmn----': ('VERB', None),
                 'Ncms': ('NOUN', 'Gender:Masc', 'Number:Sing'),
                 'Ncmp': ('NOUN', 'Gender:Masc', 'Number:Plur'),
                 'Ncfs': ('NOUN', 'Gender:Fem', 'Number:Sing'),
                 'Ncfp': ('NOUN', 'Gender:Fem', 'Number:Plur'),
                 'Afpms': ('ADJ', 'Gender:Masc', 'Number:Sing',
                           'AdjType:Qualif', 'Degree:Pos')}

    # add lemmas, morphological features and segmentation
    for entry in harm:
        feat = {}
        if parse_pos[entry['pos']][1] is not None:
            for f in parse_pos[entry['pos']][1:]:
                key, value = f.split(':')
                feat[key] = value
        lid = entry['form'] + '#' + parse_pos[entry['pos']][0]
        if parse_pos[entry['pos']][0] == 'NOUN':
            lid += '#' + parse_pos[entry['pos']][1].replace('Gender:', '')
        lexicon.create_lexeme(lemma=entry['form'],
                              pos=parse_pos[entry['pos']][0],
                              feats=feat, lemid=lid)

        if entry['seg'] != {''}:
            c_pos = parse_pos[entry['pos']]
            c_lid = entry['form'] + '#' + parse_pos[entry['pos']][0]
            if c_pos[0] == 'NOUN':
                c_lid += '#' + c_pos[1].replace('Gender:', '')
            chi_node = lexicon.get_lexemes(lemma=entry['form'], pos=c_pos[0],
                                           lemid=c_lid)[0]
            # each segmentation item looks like 'x|afix|y' — keep unique
            # affixes in order of first appearance
            segmentations = tuple(entry['seg'])[0].split('#')
            seg = list()
            for s in segmentations:
                _, afix, _ = s.split('|')
                if afix not in seg:
                    seg.append(afix)
            chi_node.misc['suffix'] = '|'.join(seg)

    # add main relations and semantic labels,
    # add other derivational relations and semantic labels,
    # add references to splitted families
    paradigm_lexemes = set()
    for entry in harm:
        c_pos = parse_pos[entry['pos']]
        c_lid = entry['form'] + '#' + parse_pos[entry['pos']][0]
        if c_pos[0] == 'NOUN':
            c_lid += '#' + c_pos[1].replace('Gender:', '')
        chi_node = lexicon.get_lexemes(lemma=entry['form'], pos=c_pos[0],
                                       lemid=c_lid)[0]

        if entry['parent']:
            p_form, p_pos = entry['parent'][0][0].split('_')
            p_pos = parse_pos[p_pos]
            p_lid = p_form + '#' + p_pos[0]
            if p_pos[0] == 'NOUN':
                p_lid += '#' + p_pos[1].replace('Gender:', '')
            par_node = lexicon.get_lexemes(lemma=p_form, pos=p_pos[0],
                                           lemid=p_lid)[0]
            lexicon.add_derivation(source=par_node, target=chi_node)
            label = entry['parent'][1].replace('@', '').split('#')[1]
            label = label.replace('|', '+')
            chi_node.parent_relation.feats['SemanticLabel'] = label

        if entry['others']:  # TODO: change place to 9th colummn
            parents = list()
            for other in entry['others']:
                p_form, p_pos = other[0][0].split('_')
                p_pos = parse_pos[p_pos]
                p_lid = p_form + '#' + p_pos[0]
                if p_pos[0] == 'NOUN':
                    p_lid += '#' + p_pos[1].replace('Gender:', '')
                par_node = lexicon.get_lexemes(lemma=p_form, pos=p_pos[0],
                                               lemid=p_lid)[0]
                label = other[1].replace('@', '').split('#')[1]
                label = label.replace('|', '+')
                # keep only candidates that differ from the main parent
                rl_par = chi_node.parent_relation
                if (rl_par and par_node.lemid != rl_par.sources[0].lemid) \
                        or not rl_par:
                    p = par_node.lemid + '&SemanticLabel=' + label
                    p += '&Type=Derivation'
                    parents.append(p)
            if parents:
                chi_node.misc['other_parents'] = '|'.join(parents)

        if entry['ref_roots']:
            roots = list()
            for ref in entry['ref_roots']:
                p_form, p_pos = ref.split('_')
                p_pos = parse_pos[p_pos]
                p_lid = p_form + '#' + p_pos[0]
                if p_pos[0] == 'NOUN':
                    p_lid += '#' + p_pos[1].replace('Gender:', '')
                par_node = lexicon.get_lexemes(lemma=p_form, pos=p_pos[0],
                                               lemid=p_lid)[0]
                if par_node.lemid != chi_node.lemid:
                    roots.append(par_node.lemid)
            if roots:
                chi_node.misc['was_in_family_with'] = '&'.join(roots)

        if entry['inparadigm'] != {''}:
            # collect paradigm members; set.update replaces the previous
            # side-effect-only list comprehension
            paradigm_lexemes.update(entry['inparadigm'])
            # paradigm = list()
            # for pdg in tuple(entry['inparadigm'])[0].split('|'):
            #     p_form, p_pos = pdg.split('_')
            #     p_pos = parse_pos[p_pos]
            #     p_lid = p_form + '#' + p_pos[0]
            #     if p_pos[0] == 'NOUN':
            #         p_lid += '#' + p_pos[1].replace('Gender:', '')
            #     par_node = lexicon.get_lexemes(lemma=p_form, pos=p_pos[0],
            #                                    lemid=p_lid)
            #     if par_node:
            #         paradigm.append(par_node[0].lemid)
            # chi_node.misc['in_subparadigm_with'] = '&'.join(paradigm)

    # add the rest of lexemes related without any direction
    for lemma in paradigm_lexemes:
        if not lemma:
            continue
        lemma, pos = lemma.split('_')
        feat = {}
        if parse_pos[pos][1] is not None:
            for f in parse_pos[pos][1:]:
                key, value = f.split(':')
                feat[key] = value
        lid = lemma + '#' + parse_pos[pos][0]
        if parse_pos[pos][0] == 'NOUN':
            lid += '#' + parse_pos[pos][1].replace('Gender:', '')
        lexemes = lexicon.get_lexemes(lemma=lemma, pos=parse_pos[pos][0])
        if len(lexemes) == 0:
            lexicon.create_lexeme(lemma=lemma, pos=parse_pos[pos][0],
                                  feats=feat, lemid=lid)

    return lexicon
# Mark cognate families in a Czech DeriNet lexicon.
# NOTE(review): this chunk is truncated — the loop at the bottom continues
# beyond the visible text, and `os` is used below without a visible import
# (presumably imported in an unseen part of the file — confirm).
import sys
import argparse
from collections import defaultdict

sys.path.append(os.path.realpath('../../../../tools/data-api/derinet2/'))
from derinet import Lexicon

# set argparse
parser = argparse.ArgumentParser()
parser.add_argument('--DeriNet', action='store', dest='csder', required=True)
parser.add_argument('--Cognates1', action='store', dest='cog1', required=True)
parser.add_argument('--Cognates2', action='store', dest='cog2', required=True)
par = parser.parse_args()

# load derinet
cs_derinet = Lexicon()
cs_derinet.load(par.csder)

# load list of cognates
# defaultdict(bool) means unknown lemmas default to False on lookup
cognates = defaultdict(bool)
for path in (par.cog1, par.cog2):
    with open(path, mode='r', encoding='U8') as f:
        for line in f:
            cognates[line.strip()] = True

# find families affected by cognates
affected_families = set()
for cognate in list(cognates):
    # skip cognates whose lemma is not present in the lexicon
    lexemes = cs_derinet.get_lexemes(lemma=cognate)
    if len(lexemes) == 0:
        continue
# Correct loanword marks against a Czech DeriNet lexicon.
# NOTE(review): this chunk is truncated at both ends — parser creation,
# imports (argparse, OrderedDict, Lexicon) and the rest of the loop body
# are outside the visible span.
parser.add_argument('--DeriNet', action='store', dest='csder', required=True)
parser.add_argument('--Loanwords', action='store', dest='loan', required=True)
parser.add_argument('--Output', action='store', dest='output', required=True)
par = parser.parse_args()

# load loanword marks
# input format: 'lemma<TAB>tag<TAB>mark' per line; mark 'False' -> False,
# any other non-empty string -> True
loanwords = OrderedDict()
with open(par.loan, mode='r', encoding='U8') as f:
    for line in f:
        lemma, tag, mark = line.rstrip('\n').split('\t')
        loanwords['_'.join([lemma, tag])] = bool(mark.replace('False', ''))

# load derinet
cs_derinet = Lexicon()
cs_derinet.load(par.csder)

# correct loanword marks
# NOTE(review): the subtree loop below may insert NEW keys into
# `loanwords` while iterating `.items()`, which raises RuntimeError if a
# subtree lexeme is not already present — confirm against the input data
for lexeme, mark in loanwords.items():
    # find lexeme in derinet
    node = cs_derinet.get_lexemes(lemma=lexeme.split('_')[0],
                                  pos=lexeme.split('_')[1])[0]

    # propriums and their subtrees are FALSE
    if node.lemma[0].isupper():
        loanwords['_'.join([node.lemma, node.pos])] = False
        for node_child in node.iter_subtree():
            loanwords['_'.join([node_child.lemma, node_child.pos])] = False
        continue
# Collect derivational families containing lexemes marked Foreign.
# NOTE(review): this chunk is truncated — `sys` and `os` are used below
# without visible imports (presumably imported in an unseen part of the
# file — confirm), and the final loop body continues past the visible text.
import argparse
from collections import defaultdict

sys.path.append(os.path.realpath('../../../../tools/data-api/derinet2/'))
from derinet import Lexicon

# set argparse
parser = argparse.ArgumentParser()
parser.add_argument('--DeriNet', action='store', dest='csder', required=True)
par = parser.parse_args()

# load derinet
cs_derinet = Lexicon()
cs_derinet.load(par.csder)

# find families of Foreign
# NOTE(review): defaultdict() with no factory behaves like a plain dict
foreign_lexemes = defaultdict()
affected_families = set()
for lexeme in cs_derinet.iter_lexemes():
    if lexeme.feats.get('Foreign', False):
        foreign_lexemes[lexeme.lemma] = True
        # remember the whole family via its tree root
        affected_families.add(lexeme.get_tree_root())

# print families of relevant cognates
for root in affected_families:
    for lexeme in root.iter_subtree():
        lexeme_mark = lexeme.feats.get('Loanword', False)
def process(self, lexicon: Lexicon):
    """Build DerivBaseRU to DeriNet format.

    Creates lexemes (lemid = ``form#POS``), adds main derivations with
    their rule numbers and processes, other candidate parents, and
    split-family references.

    Fixes over the previous version: ``misc['other_parents']`` is now
    joined with ``'|'`` like in every other harmonisation module — the
    former ``','`` join was ambiguous because the ``Rule=…`` and
    ``Process=…`` values themselves contain commas.  The pickle file
    handle is also closed deterministically now.

    Returns the populated ``lexicon``.
    """
    # load data
    with open(self.fname, 'rb') as f:
        harm = pickle.load(f)
    parse_pos = {
        'V': 'VERB',
        'N': 'NOUN',
        'D': 'ADV',
        'A': 'ADJ',
        'C': 'NUM'
    }

    # add lemmas and morphological features
    for entry in harm:
        lid = entry['form'] + '#' + parse_pos[entry['pos']]
        lexicon.create_lexeme(lemma=entry['form'],
                              pos=parse_pos[entry['pos']], lemid=lid)

    # add main relations and rules,
    # add other derivational relations and rules,
    # add references to splitted families
    for entry in harm:
        c_pos = parse_pos[entry['pos']]
        c_lid = entry['form'] + '#' + c_pos
        chi_node = lexicon.get_lexemes(lemma=entry['form'], pos=c_pos,
                                       lemid=c_lid)[0]

        if entry['parent']:
            p_form, p_pos = entry['parent'][0][0].split('_')
            p_pos = parse_pos[p_pos]
            p_lid = p_form + '#' + p_pos
            par_node = lexicon.get_lexemes(lemma=p_form, pos=p_pos,
                                           lemid=p_lid)[0]
            lexicon.add_derivation(source=par_node, target=chi_node)

            # add rules; each '#'-separated item is 'ruleNNN&proc1,proc2'
            rules, proc = list(), list()
            for item in entry['parent'][1].split('#'):
                rul, pr = item.split('&')
                rules.append(re.search(r'rule([0-9]*)', rul).group(1))
                proc += pr.split(',')
            chi_node.parent_relation.feats['Rule'] = ','.join(rules)
            # NOTE: set() deduplicates but makes the order unspecified
            chi_node.parent_relation.feats['Process'] = ','.join(set(proc))

        if entry['others']:  # TODO: change place to 9th colummn
            parents = list()
            for other in entry['others']:
                p_form, p_pos = other[0][0].split('_')
                p_pos = parse_pos[p_pos]
                p_lid = p_form + '#' + p_pos
                par_node = lexicon.get_lexemes(lemma=p_form, pos=p_pos,
                                               lemid=p_lid)[0]
                rules, proc = list(), list()
                for item in other[1].split('#'):
                    rul, pr = item.split('&')
                    rules.append(re.search(r'rule([0-9]*)', rul).group(1))
                    proc += pr.split(',')
                rules = ','.join(rules)
                proc = ','.join(set(proc))
                # keep only candidates that differ from the main parent
                rl_par = chi_node.parent_relation
                if (rl_par and par_node.lemid != rl_par.sources[0].lemid) \
                        or not rl_par:
                    p = par_node.lemid + '&Rule=' + rules
                    p += '&Process=' + proc + '&Type=Derivation'
                    parents.append(p)
            if parents:
                # '|' separator for consistency with the other modules
                chi_node.misc['other_parents'] = '|'.join(parents)

        if entry['ref_roots']:
            roots = list()
            for ref in entry['ref_roots']:
                p_form, p_pos = ref.split('_')
                p_pos = parse_pos[p_pos]
                p_lid = p_form + '#' + p_pos
                par_node = lexicon.get_lexemes(lemma=p_form, pos=p_pos,
                                               lemid=p_lid)[0]
                if par_node.lemid != chi_node.lemid:
                    roots.append(par_node.lemid)
            if roots:
                chi_node.misc['was_in_family_with'] = '&'.join(roots)

    return lexicon