def get_feats_noun(self):
    '''
    Finds features for all nouns and proper names
    # TODO: add full proper name support (not found in DMII)
    '''
    tag = self.tag
    token = self.token
    UD_tag = self.UD_tag
    tag_name, tag_info = self.tag_name, self.tag_info
    # Case and Number come straight from the IcePaHC tag pieces
    self.features.Case = self.feats[UD_tag]['Case'][tag_info] if tag_info else None
    self.features.Number = self.feats[UD_tag]['Number'][tag_name]
    try:
        # Gender and Definiteness require a DMII (BÍN) lookup
        entry = DMII_data.check_DMII(DMII_no, token, self.lemma)
        self.features.Gender = self.feats[UD_tag]['Gender'][entry[1]]
        # BÍN marks a suffixed definite article with a 'gr' suffix
        self.features.Definite = 'Def' if entry[0].endswith('gr') else 'Ind'
    except (TypeError, IndexError, KeyError) as err:
        # Word not found in DMII or entry malformed: record the error
        self.features.ERROR = error_info(err)
    return self
def get_feats_num(self):
    '''
    Finds features for numerals.

    Gender and Number are extracted from the DMII (BÍN) numeral entry,
    whose key has the form '<gender>_<mark>'. NumType is always derived
    from the IcePaHC tag name (trailing 'P' vs. anything else), whether
    or not the DMII lookup succeeds.

    Returns:
        self, with self.features updated in place.
    '''
    UD_tag = self.UD_tag
    try:
        ID = DMII_data.check_DMII(DMII_to, self.token, self.lemma)[0]
        parts = ID.split('_')
        # parts[1] raises IndexError for entries without '_'; caught below
        # (previously this IndexError escaped uncaught)
        self.features.Gender = self.feats[UD_tag]['Gender'][parts[0]]
        self.features.Number = self.feats[UD_tag]['Number'][parts[1][-2:]]
    except (TypeError, KeyError, IndexError):
        # Word not found in DMII (or unexpected entry shape):
        # fall through and still assign NumType from the tag below
        pass
    # NumType from the IcePaHC tag name; identical on success and fallback
    if self.tag_name[-1] == 'P':
        self.features.NumType = self.feats[UD_tag]['NumType']['P']
    else:
        self.features.NumType = self.feats[UD_tag]['NumType']['O']
    return self
def feats_verb_else(self):
    ''' Finds features for all 'other' verbs '''
    tag = self.tag
    token = self.token
    UD_tag = self.UD_tag
    try:
        ID = DMII_data.check_DMII(DMII_so, token, self.lemma)[0]
        if ID.startswith('OP'):
            # Strip the impersonal-inflection marker before splitting
            ID = re.sub('OP-', '', ID)
        # BÍN entry layout: voice-mood-tense-person-number
        parts = ID.split('-')
        self.features.Tense = self.feats[UD_tag]['Tense'][parts[2]]
        self.features.Mood = self.feats[UD_tag]['Mood'][parts[1]]
        self.features.Voice = self.feats[UD_tag]['Voice'][parts[0]]
        self.features.Person = self.feats[UD_tag]['Person'][parts[3]]
        self.features.Number = self.feats[UD_tag]['Number'][parts[4]]
        return self
    except (TypeError, KeyError, IndexError):
        # Word not found in BÍN: fall back to the IcePaHC tag information
        tense_key = {'D': 'ÞT', 'P': 'NT'}.get(tag[2])
        if tense_key:
            self.features.Tense = self.feats[UD_tag]['Tense'][tense_key]
        mood_key = {'I': 'FH', 'S': 'VH'}.get(tag[3])
        if mood_key:
            self.features.Mood = self.feats[UD_tag]['Mood'][mood_key]
        return self
def process_DMII():
    '''
    Serializes each per-word-class DMII dataset (plus the combined set)
    to a JSON file in the output directory.
    '''
    make_out_dir()
    for flag in ('no', 'lo', 'fn', 'to', 'ao', 'so', 'combined'):
        # print('Processing DMII data for {0}.csv...'.format(flag))
        data = DMII_data.DMII_data(flag)
        json_filename = 'DMII_{0}.json'.format(flag)
        print('Writing data to ' + json_filename)
        write_json(data, os.path.join(json_dir, json_filename))
    print('Finished!')
def word_info(tree):
    '''
    Takes in a nltk Tree object and returns an approximation of the tree
    sentence in the CONLLU format for UD:
        ID: Word index, integer starting at 1 for each new sentence.
        FORM: Word form or punctuation symbol.
        LEMMA: Lemma or stem of word form.
        UPOS: Universal part-of-speech tag.
        XPOS: Language-specific part-of-speech tag; underscore if not available.
        FEATS: List of morphological features from the universal feature
            inventory or from a defined language-specific extension;
            underscore if not available.
        HEAD: Head of the current word, which is either a value of ID or zero (0).
        DEPREL: Universal dependency relation to the HEAD (root iff HEAD = 0)
            or a defined language-specific subtype of one.
        DEPS: Enhanced dependency graph in the form of a list of head-deprel pairs.
        MISC: Any other annotation.
    '''
    sentence = []
    runner = 0
    # print(tree.leaves())
    sentence.append(sent_text(tree))
    sentence.append(['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS',
                     'HEAD', 'DEPREL', 'DEPS', 'MISC'])
    for leaf in tree.pos():
        runner += 1  # runner updated for counting
        ID = str(runner)  # ID: Word index, starting at 1 for each new sentence
        if '-' in leaf[0]:
            # Token and lemma present, e.g. 'tók-taka'. Split on the FIRST
            # hyphen only, so lemmas containing '-' survive intact
            # (fix: was split('-') + [1], truncating multi-hyphen lemmas;
            # now consistent with create_dependency_graph)
            FORM, LEMMA = leaf[0].split('-', 1)
        elif leaf[0] == '<dash/>' or leaf[0] == '<dash>' or leaf[0] == '</dash>':
            # Escaped dash tokens become a literal '-' token and lemma
            FORM = '-'
            LEMMA = '-'
            token_lemma = str(FORM+'-'+LEMMA)
            tag = leaf[1]
            leaf = token_lemma, tag
        else:  # if no lemma present
            FORM = leaf[0]
            if FORM[0] not in ['*', '0']:
                # Look the lemma up in the combined DMII data
                # DMII_combined = DMII_data.DMII_data('combined')
                LEMMA = DMII_data.get_lemma(DMII_combined, FORM)
                if LEMMA == None:
                    LEMMA = '_'
                token_lemma = str(FORM+'-'+LEMMA)
                tag = leaf[1]
                leaf = token_lemma, tag
        if FORM[0] in ['*', '0']:
            # Trace / empty elements are not emitted in the output
            continue
        XPOS = leaf[1]  # XPOS: Language-specific part-of-speech tag (IcePaHC)
        UPOS = features.get_UD_tag(XPOS, LEMMA)  # UPOS: Universal POS tag
        FEATS = features.get_feats(leaf)  # FEATS: morphological features
        HEAD = '_'  # HEAD: not determined here
        DEPREL = '_'  # DEPREL: not determined here
        DEPS = '_'  # DEPS: not determined here
        MISC = '_'  # MISC: Any other annotation
        line = [str(runner), FORM, LEMMA, UPOS, XPOS, FEATS,
                HEAD, DEPREL, DEPS, MISC]
        sentence.append(line)
    return sentence
def get_feats_adj(self):
    ''' Finds features for all adjectives '''
    tag = self.tag
    token = self.token
    UD_tag = self.UD_tag
    try:
        # Degree from the final letter of the IcePaHC tag name:
        # R = comparative, S = superlative, anything else = positive
        degree_key = self.tag_name[-1]
        if degree_key not in ('R', 'S'):
            degree_key = 'P'
        self.features.Degree = self.feats[UD_tag]['Degree'][degree_key]
        entry = DMII_data.check_DMII(DMII_lo, self.token, self.lemma)[0]
        self.features.Gender = self.feats[UD_tag]['Gender'][entry.split('-')[1]]
        self.features.Number = self.feats[UD_tag]['Number'][entry.split('-')[2][-2:]]
        return self
    except KeyError as err:
        self.features.ERROR = error_info(err)
        return self
    except TypeError as err:
        # Handles mismatch between word class analysis in IcePaHC and BÍN:
        # quantifiers tagged as ADJ in UD; WIP for pronouns tagged as ADJ in UD
        try:
            entry = DMII_data.check_DMII(DMII_fn, self.token, self.lemma)[0]
            if '-' in entry:
                self.features.Gender = self.feats[UD_tag]['Gender'][entry.split('-')[0]]
                self.features.Number = self.feats[UD_tag]['Number'][entry.split('-')[1][-2:]]
            elif '_' in entry:
                self.features.Gender = self.feats[UD_tag]['Gender'][entry.split('_')[1]]
                self.features.Number = self.feats[UD_tag]['Number'][entry.split('_')[2][-2:]]
            else:
                self.features.Number = self.feats[UD_tag]['Number'][entry[-2:]]
                # Only Number could be recovered; record the original TypeError
                self.features.ERROR = error_info(err)
            return self
        except (TypeError, KeyError) as err:
            self.features.ERROR = error_info(err)
            return self
from lib import DMII_data
from lib.depender import Converter
import nltk  # requires nltk >= 3.4.5 (fix: version note was fused into the import, a SyntaxError)
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import CategorizedBracketParseCorpusReader
from nltk.data import path
from nltk.tree import *
from collections import defaultdict
import time
import re
import string

path.extend(['./testing/'])

DMII_combined = DMII_data.load_json('combined') # TODO: Move to features script

# Lazily-loaded IcePaHC corpus, categorized by genre from the filename
icepahc = LazyCorpusLoader(
    'icepahc-v0.9/psd/', CategorizedBracketParseCorpusReader,
    r'.*\.psd', cat_pattern=r'.*(nar|rel|sci|bio|law)\-.*'
)

if __name__ == '__main__':
    fileids = icepahc.fileids()  # leave uncommented for whole corpus use
    #fileids = ['1350.bandamennM.nar-sag.psd'] # For debug use only
    c = Converter()  # Creates instance of Converter class
    total_sents = 0
    file_num = 1
    # f = open('homilia_conllu.conllu', 'w+')
Hinrik Hafsteinsson ([email protected]) Þórunn Arnardóttir ([email protected]) 2019 Part of UniTree project for IcePaHC ''' from lib import DMII_data from lib.rules import tags, feats, cconj import string import re import nltk >= 3.4.5 from nltk.tree import Tree from collections import OrderedDict import inspect DMII_no = DMII_data.DMII_data('no') DMII_lo = DMII_data.DMII_data('lo') DMII_fn = DMII_data.DMII_data('fn') DMII_to = DMII_data.DMII_data('to') DMII_ao = DMII_data.DMII_data('ao') DMII_so = DMII_data.DMII_data('so') def error_info(err): ''' Debug method Returns the current line number in our program and type of exception. ''' err = str(type(err).__name__) return err+' Loc: '+str(inspect.currentframe().f_back.f_lineno) class IcelandicUDFeatures:
def create_dependency_graph(self, tree):
    """Create a dependency graph from a phrase structure tree.

    Walks every tree position of the bracketed-parse string: terminal
    tokens become numbered nodes in a UniversalDependencyGraph (with
    lemma, UPOS, XPOS and FEATS filled in via the feature classes),
    constituents are collected and later assigned heads bottom-up via
    self._select_head / self._relation.

    Args:
        tree: a bracketed parse as a string, parsed with
            IndexedTree.fromstring.

    Returns:
        The populated UniversalDependencyGraph (also kept on self.dg).
    """
    const = []      # positions of constituent (non-terminal) subtrees
    tag_list = {}   # node number -> IcePaHC tag of the enclosing label
    nr = 1          # running node address, starting at 1
    # Tree item read in as string and transferred to UD graph instance
    t = IndexedTree.fromstring(tree)
    self.dg = UniversalDependencyGraph()
    for i in t.treepositions():
        if isinstance(t[i], Tree):
            if len(t[i]) == 1:
                # If terminal node with label
                # e.g. (VBDI tók-taka) or (NP-SBJ (PRO-N hann-hann))
                tag_list[nr] = t[i].label()
                t[i].set_id(nr)
                # print(t[i])
            else:
                # If constituent / complex phrase
                # e.g. (ADVP (ADV smám-smám) (ADV saman-saman))
                t[i].set_id(0)
                const.append(i)
        else:
            # If trace node, skip (preliminary, may result in errors)
            # e.g. *T*-3 etc.
            if t[i][0] in {'0', '*', '{'}:
                #if t[1].pos()[0][0] in {'0', '*'}:
                continue
            # If terminal node with no label (token-lemma)
            # e.g. tók-taka
            if '-' in t[i]:
                # Split on the first hyphen only: lemma may itself contain '-'
                FORM, LEMMA = t[i].split('-', 1)
                tag = tag_list[nr]
            # If <dash/>, <dash> or </dash>
            # NOTE(review): t[i][0] is the FIRST CHARACTER of the leaf
            # string, which can never equal a multi-char marker like
            # '<dash/>' — this branch looks unreachable; confirm whether
            # t[i] (whole string) was intended, as in word_info
            elif t[i][0] in {'<dash/>', '<dash>', '</dash>'}:
                FORM = LEMMA = '-'
                tag = tag_list[nr]
            else:
                # If no lemma present: look it up in the combined DMII data
                # NOTE(review): FORM = t[i][0] takes only the first character
                # of the leaf string — verify this is intentional
                FORM = t[i][0]
                #DMII_combined = f.DMII_data('combined')
                LEMMA = DMII_data.get_lemma(DMII_combined, FORM)
                # LEMMA = '_'
                if LEMMA == None:
                    LEMMA = '_'
                token_lemma = str(FORM+'-'+LEMMA)
                tag = tag_list[nr]
            # Compound tags like 'X+Y' keep only the final part
            if '+' in tag:
                tag = re.sub('\w+\+', '', tag)
            token_lemma = str(FORM+'-'+LEMMA)
            leaf = token_lemma, tag
            XPOS = tag
            # Feature Classes called here
            leaf = f.Word(leaf).getinfo()
            UPOS = leaf.UD_tag
            FEATS = leaf.features.featString()
            self.dg.add_node({'address': nr,
                              'word': FORM,
                              'lemma': LEMMA,
                              'ctag': UPOS,  # upostag
                              'tag': XPOS,   # xpostag
                              'feats': FEATS,
                              'deps': defaultdict(list),
                              'rel': None})
            nr += 1
    # go through the constituencies (bottom up) and find their heads
    const.sort(key=lambda x: len(x), reverse=True)
    for i in const:
        self._select_head(t[i])
    for i in const:
        head_tag = t[i].label()
        head_nr = t[i].id()
        for child in t[i]:
            mod_tag = child.label()
            mod_nr = child.id()
            # if head_nr == mod_nr and re.match("NP-PRD", head_tag):
            # note: does this work correctly? A way of making the copular
            # predicate (sagnfylling) the root
            # self.dg.get_by_address(mod_nr).update({'head': 0, 'rel': 'root'})
            # self.dg.root = self.dg.get_by_address(mod_nr)
            if child:
                if head_nr == mod_nr and re.match(
                        "IP-MAT.*", head_tag):
                    #todo root phrase types from config
                    self.dg.get_by_address(mod_nr).update({'head': 0, 'rel': 'root'})
                    #todo copula not a head
                    self.dg.root = self.dg.get_by_address(mod_nr)
                elif child[0] == '0' or '*' in child[0] or '{' in child[0] or '<' in child[0] or mod_tag == 'CODE':
                    # Traces, empty elements and CODE spans get no relation
                    continue
                else:
                    self.dg.get_by_address(mod_nr).update({'head': head_nr, 'rel': self._relation(mod_tag, head_tag)})
                    if head_nr != mod_nr:
                        self.dg.add_arc(head_nr, mod_nr)
    return self.dg
# CONLLU_log.write(line) # CONLLU_log.write('\n') # else: # print('\t'.join(line)) # CONLLU_log.write('\t'.join(line)) # CONLLU_log.write('\n') # CONLLU_log.write('\n') total_sents += 1 file_sents += 1 except Exception as ex: # raise error_num += 1 print('\n# sent_id = ', treeID) print('Failure - {0}. Arguments:\n{1!r}'.format(type(ex).__name__, ex.args)) print(tree) # CONLLU_log.write('# sent_id = ' + treeID) # leave commented # CONLLU_log.write('\nFailure - {0}. Arguments:\n{1!r}\n\n'.format(type(ex).__name__, ex.args)) total_sents += 1 file_sents += 1 # continue end = time.time() duration = '%.2f' % float(end - start) print('Finished! Time elapsed: {0} seconds'.format(duration)) print('Number of sentences in file: {0}'.format(file_sents)) print('Number of failed sentences: {0}'.format(error_num)) DMII_combined = DMII_data.load_json('combined') if __name__ == '__main__': print_data()