Esempio n. 1
0
 def get_feats_noun(self):
     '''
     Finds features for all nouns and proper names.

     Sets Case (only when extra tag info is present), Number, and — when the
     token is found in the DMII noun data — Gender and Definite. A failed
     lookup is recorded on self.features.ERROR instead of propagating.
     Returns self in all cases.
     # TODO: add full proper name support (not found in DMII)
     '''
     token = self.token
     UD_tag = self.UD_tag
     tag_name, tag_info = self.tag_name, self.tag_info
     # Case is only encoded when the IcePaHC tag carries extra info
     if tag_info:
         self.features.Case = self.feats[UD_tag]['Case'][tag_info]
     else:
         self.features.Case = None
     self.features.Number = self.feats[UD_tag]['Number'][tag_name]
     try:
         ID = DMII_data.check_DMII(DMII_no, token, self.lemma)
         self.features.Gender = self.feats[UD_tag]['Gender'][ID[1]]
         # DMII form strings end in 'gr' for definite (suffixed-article) forms
         if ID[0].endswith('gr'):
             self.features.Definite = 'Def'
         else:
             self.features.Definite = 'Ind'
         return self
     except (TypeError, IndexError, KeyError) as err:
         # word not found in DMII, or entry had an unexpected shape
         self.features.ERROR = error_info(err)
         return self
Esempio n. 2
0
 def get_feats_num(self):
     '''
     Finds features for numerals.

     Gender and Number come from a DMII numeral lookup; NumType is derived
     from the IcePaHC tag name and is set whether or not the lookup succeeds.
     Returns self in all cases.
     '''
     token = self.token
     UD_tag = self.UD_tag
     try:
         ID = DMII_data.check_DMII(DMII_to, token, self.lemma)[0]
         mark = ID.split('_')[1]
         self.features.Gender = self.feats[UD_tag]['Gender'][ID.split('_')[0]]
         self.features.Number = self.feats[UD_tag]['Number'][mark[-2:]]
     except (TypeError, KeyError):   # if the word is not found in DMII
         pass
     # NumType is independent of the DMII lookup: a tag name ending in 'P'
     # marks one type, everything else the other ('O')
     if self.tag_name[-1] == 'P':
         self.features.NumType = self.feats[UD_tag]['NumType']['P']
     else:
         self.features.NumType = self.feats[UD_tag]['NumType']['O']
     return self
Esempio n. 3
0
 def feats_verb_else(self):
     '''
     Finds features for all 'other' verbs.

     On a successful DMII lookup sets Tense, Mood, Voice, Person and Number
     from the hyphen-separated DMII verb ID. If the word is not found in
     BÍN, falls back to Tense/Mood information from the IcePaHC tag.
     Returns self in all cases.
     '''
     tag = self.tag
     token = self.token
     UD_tag = self.UD_tag
     try:
         ID = DMII_data.check_DMII(DMII_so, token, self.lemma)[0]
         if ID.startswith('OP'):     # strip the impersonal-inflection marker
             ID = re.sub('OP-', '', ID)
         # DMII verb ID layout (hyphen-separated):
         # [0] voice, [1] mood, [2] tense, [3] person, [4] number
         parts = ID.split('-')
         self.features.Tense = self.feats[UD_tag]['Tense'][parts[2]]
         self.features.Mood = self.feats[UD_tag]['Mood'][parts[1]]
         self.features.Voice = self.feats[UD_tag]['Voice'][parts[0]]
         self.features.Person = self.feats[UD_tag]['Person'][parts[3]]
         self.features.Number = self.feats[UD_tag]['Number'][parts[4]]
         return self
     except (TypeError, KeyError, IndexError):   # word not in BÍN: use IcePaHC tag info instead
         if tag[2] == 'D':
             self.features.Tense = self.feats[UD_tag]['Tense']['ÞT']
         elif tag[2] == 'P':
             self.features.Tense = self.feats[UD_tag]['Tense']['NT']
         if tag[3] == 'I':
             self.features.Mood = self.feats[UD_tag]['Mood']['FH']
         elif tag[3] == 'S':
             self.features.Mood = self.feats[UD_tag]['Mood']['VH']
         return self
Esempio n. 4
0
def process_DMII():
    '''
    Serialise every DMII word-class data set to a JSON file.

    Ensures the output directory exists, then loads each DMII category
    and writes it to DMII_<category>.json inside json_dir.
    '''
    make_out_dir()
    for category in ('no', 'lo', 'fn', 'to', 'ao', 'so', 'combined'):
        parsed = DMII_data.DMII_data(category)
        out_name = 'DMII_{0}.json'.format(category)
        out_path = os.path.join(json_dir, out_name)
        print('Writing data to ' + out_name)
        write_json(parsed, out_path)
        print('Finished!')
Esempio n. 5
0
def word_info(tree):
    '''
    Takes in a nltk Tree object and returns an approximation of the tree sentence
    in the CONLLU format for UD:
        ID: Word index, integer starting at 1 for each new sentence.
        FORM: Word form or punctuation symbol.
        LEMMA: Lemma or stem of word form.
        UPOS: Universal part-of-speech tag.
        XPOS: Language-specific part-of-speech tag; underscore if not available.
        FEATS: List of morphological features from the universal feature inventory or from a defined language-specific extension; underscore if not available.
        HEAD: Head of the current word, which is either a value of ID or zero (0).
        DEPREL: Universal dependency relation to the HEAD (root iff HEAD = 0) or a defined language-specific subtype of one.
        DEPS: Enhanced dependency graph in the form of a list of head-deprel pairs.
        MISC: Any other annotation.
    '''
    sentence = []
    runner = 0
    sentence.append(sent_text(tree))
    sentence.append(['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC'])
    for leaf in tree.pos():
        runner += 1  # 1-based word index
        ID = str(runner)
        if '-' in leaf[0]:  # token and lemma present, e.g. 'tók-taka'
            word = leaf[0].split('-')
            FORM = word[0]  # FORM: word form or punctuation symbol (token)
            LEMMA = word[1]
        elif leaf[0] in ('<dash/>', '<dash>', '</dash>'):
            # dash placeholder tokens are normalised to a literal '-'
            FORM = '-'
            LEMMA = '-'
            token_lemma = str(FORM + '-' + LEMMA)
            leaf = token_lemma, leaf[1]
        else:   # no lemma present: look one up in the combined DMII data
            FORM = leaf[0]
            if FORM[0] not in ['*', '0']:
                LEMMA = DMII_data.get_lemma(DMII_combined, FORM)
                if LEMMA is None:
                    LEMMA = '_'
                token_lemma = str(FORM + '-' + LEMMA)
                leaf = token_lemma, leaf[1]
        if FORM[0] in ['*', '0']:
            continue  # skip traces and empty elements
        XPOS = leaf[1]  # XPOS: language-specific part-of-speech tag (IcePaHC)
        UPOS = features.get_UD_tag(XPOS, LEMMA)  # UPOS: universal part-of-speech tag
        FEATS = features.get_feats(leaf)  # FEATS: morphological feature string
        HEAD = '_'    # HEAD: not assigned here
        DEPREL = '_'  # DEPREL: not assigned here
        DEPS = '_'    # DEPS: not assigned here
        MISC = '_'    # MISC: not assigned here
        line = [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
        sentence.append(line)
    return sentence
Esempio n. 6
0
 def get_feats_adj(self):
     '''
     Finds features for all adjectives.

     Degree comes from the last letter of the IcePaHC tag name (R =
     comparative, S = superlative, otherwise positive). Gender and Number
     come from a DMII adjective lookup, falling back to the DMII pronoun
     data on a TypeError (word-class mismatch between IcePaHC and BÍN).
     Lookup failures are recorded on self.features.ERROR.
     Returns self in all cases.
     '''
     UD_tag = self.UD_tag
     try:
         if self.tag_name[-1] == 'R':
             self.features.Degree = self.feats[UD_tag]['Degree']['R']
         elif self.tag_name[-1] == 'S':
             self.features.Degree = self.feats[UD_tag]['Degree']['S']
         else:
             self.features.Degree = self.feats[UD_tag]['Degree']['P']
         ID = DMII_data.check_DMII(DMII_lo, self.token, self.lemma)[0]
         self.features.Gender = self.feats[UD_tag]['Gender'][ID.split('-')[1]]
         self.features.Number = self.feats[UD_tag]['Number'][ID.split('-')[2][-2:]]
         return self
     except KeyError as err:
         self.features.ERROR = error_info(err)
         return self
     except TypeError as err:   # handles mismatch between word class analysis in IcePaHC and BÍN; quantifiers tagged as ADJ in UD, WIP for pronouns tagged as ADJ in UD?
         try:
             # fall back to the pronoun data; the entry's separator tells
             # us how to slice out gender and number
             ID = DMII_data.check_DMII(DMII_fn, self.token, self.lemma)[0]
             if '-' in ID:
                 self.features.Gender = self.feats[UD_tag]['Gender'][ID.split('-')[0]]
                 self.features.Number = self.feats[UD_tag]['Number'][(ID.split('-')[1])[-2:]]
                 return self
             elif '_' in ID:
                 self.features.Gender = self.feats[UD_tag]['Gender'][ID.split('_')[1]]
                 self.features.Number = self.feats[UD_tag]['Number'][(ID.split('_')[2])[-2:]]
                 return self
             else:
                 # no separator: only Number is recoverable; keep the
                 # original TypeError for the record
                 self.features.Number = self.feats[UD_tag]['Number'][ID[-2:]]
                 self.features.ERROR = error_info(err)
                 return self
         except (TypeError, KeyError) as err:
             self.features.ERROR = error_info(err)
             return self
Esempio n. 7
0
from lib import DMII_data
from lib.depender import Converter

# NOTE: requires nltk >= 3.4.5 (the version spec had been fused into the
# import statement, which is a syntax error)
import nltk
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import CategorizedBracketParseCorpusReader
from nltk.data import path
from nltk.tree import *
from collections import defaultdict
import time
import re
import string

# make the local corpus directory visible to nltk's data loader
path.extend(['./testing/'])

DMII_combined = DMII_data.load_json('combined') # TODO: Move to features script

# lazily-loaded IcePaHC corpus, categorised by genre (nar/rel/sci/bio/law)
icepahc = LazyCorpusLoader(
    'icepahc-v0.9/psd/', CategorizedBracketParseCorpusReader,
    r'.*\.psd', cat_pattern=r'.*(nar|rel|sci|bio|law)\-.*'
)

if __name__ == '__main__':
    fileids = icepahc.fileids() # leave uncommented for whole corpus use
    #fileids = ['1350.bandamennM.nar-sag.psd'] # For debug use only
    c = Converter() # Creates instance of Converter class
    total_sents = 0
    file_num = 1

    # f = open('homilia_conllu.conllu', 'w+')
Esempio n. 8
0
Hinrik Hafsteinsson ([email protected])
Þórunn Arnardóttir ([email protected])
2019
Part of UniTree project for IcePaHC
'''

from lib import DMII_data
from lib.rules import tags, feats, cconj
import string
import re
# NOTE: requires nltk >= 3.4.5 (the version spec had been fused into the
# import statement, which is a syntax error)
import nltk
from nltk.tree import Tree
from collections import OrderedDict
import inspect

# DMII inflection data per word class: nouns ('no'), adjectives ('lo'),
# pronouns ('fn'), numerals ('to'), adverbs ('ao' — presumably; verify
# against DMII docs) and verbs ('so')
DMII_no = DMII_data.DMII_data('no')
DMII_lo = DMII_data.DMII_data('lo')
DMII_fn = DMII_data.DMII_data('fn')
DMII_to = DMII_data.DMII_data('to')
DMII_ao = DMII_data.DMII_data('ao')
DMII_so = DMII_data.DMII_data('so')

def error_info(err):
    '''
    Debug method
    Returns the current line number in our program and type of exception.
    '''
    caller_line = inspect.currentframe().f_back.f_lineno
    return '{0} Loc: {1}'.format(type(err).__name__, caller_line)

class IcelandicUDFeatures:
Esempio n. 9
0
    def create_dependency_graph(self, tree):
        """Create a dependency graph from a phrase structure tree.

        Args:
            tree: bracketed phrase-structure string (IcePaHC .psd style).

        Returns:
            The UniversalDependencyGraph built on self.dg, with one node
            per terminal and arcs chosen by per-constituent head selection.
        """
        const = []
        tag_list = {}
        nr = 1
        # Tree item read in as string and transferred to UD graph instance
        t = IndexedTree.fromstring(tree)
        self.dg = UniversalDependencyGraph()

        for i in t.treepositions():
            if isinstance(t[i], Tree):
                if len(t[i]) == 1:
                    # If terminal node with label
                    # e.g. (VBDI tók-taka) or (NP-SBJ (PRO-N hann-hann))
                    tag_list[nr] = t[i].label()
                    t[i].set_id(nr)
                else:
                    # If constituent / complex phrase
                    # e.g. (ADVP (ADV smám-smám) (ADV saman-saman))
                    t[i].set_id(0)
                    const.append(i)
            else:
                # If trace node, skip (preliminary, may result in errors)
                # e.g. *T*-3 etc.
                if t[i][0] in {'0', '*', '{'}:
                    continue
                # If terminal node with no label (token-lemma)
                # e.g. tók-taka
                if '-' in t[i]:
                    FORM, LEMMA = t[i].split('-', 1)
                    tag = tag_list[nr]
                # If <dash/>, <dash> or </dash>
                # BUGFIX: compare the whole leaf string, not its first
                # character — t[i][0] is a single char and could never
                # equal a multi-character dash token (cf. word_info)
                elif t[i] in {'<dash/>', '<dash>', '</dash>'}:
                    FORM = LEMMA = '-'
                    tag = tag_list[nr]
                else: # If no lemma present, look one up in the combined DMII data
                    FORM = t[i][0]
                    LEMMA = DMII_data.get_lemma(DMII_combined, FORM)
                    if LEMMA is None:
                        LEMMA = '_'
                    token_lemma = str(FORM+'-'+LEMMA)
                    tag = tag_list[nr]
                if '+' in tag:
                    # strip compound-tag prefixes, e.g. 'X+Y' -> 'Y'
                    tag = re.sub(r'\w+\+', '', tag)
                token_lemma = str(FORM+'-'+LEMMA)
                leaf = token_lemma, tag
                XPOS = tag
                # Feature Classes called here
                leaf = f.Word(leaf).getinfo()
                UPOS = leaf.UD_tag
                FEATS = leaf.features.featString()
                self.dg.add_node({'address': nr,
                                  'word': FORM,
                                  'lemma': LEMMA,
                                  'ctag': UPOS, # upostag
                                  'tag': XPOS,   # xpostag
                                  'feats': FEATS,
                                  'deps': defaultdict(list),
                                  'rel': None})
                nr += 1

        # go through the constituencies (bottom up) and find their heads
        const.sort(key=lambda x: len(x), reverse=True)

        for i in const:
            self._select_head(t[i])

        for i in const:
            head_tag = t[i].label()
            head_nr = t[i].id()
            for child in t[i]:
                mod_tag = child.label()
                mod_nr = child.id()
                if child:
                    # IP-MAT phrases become the sentence root
                    if head_nr == mod_nr and re.match("IP-MAT.*", head_tag):  # TODO: root phrase types from config
                        self.dg.get_by_address(mod_nr).update({'head': 0, 'rel': 'root'})  # TODO: copula not a head
                        self.dg.root = self.dg.get_by_address(mod_nr)
                    elif child[0] == '0' or '*' in child[0] or '{' in child[0] or '<' in child[0] or mod_tag == 'CODE':
                        # skip empty elements, traces, markup and CODE spans
                        continue
                    else:
                        self.dg.get_by_address(mod_nr).update({'head': head_nr, 'rel': self._relation(mod_tag, head_tag)})
                    if head_nr != mod_nr:
                        self.dg.add_arc(head_nr, mod_nr)

        return self.dg
Esempio n. 10
0
                        # CONLLU_log.write(line)
                        # CONLLU_log.write('\n')
                    # else:
                        # print('\t'.join(line))
                        # CONLLU_log.write('\t'.join(line))
                        # CONLLU_log.write('\n')
                # CONLLU_log.write('\n')
                total_sents += 1
                file_sents += 1
            except Exception as ex:
                # raise
                error_num += 1
                print('\n# sent_id = ',  treeID)
                print('Failure - {0}. Arguments:\n{1!r}'.format(type(ex).__name__, ex.args))
                print(tree)
                # CONLLU_log.write('# sent_id = ' + treeID) # leave commented
                # CONLLU_log.write('\nFailure - {0}. Arguments:\n{1!r}\n\n'.format(type(ex).__name__, ex.args))
                total_sents += 1
                file_sents += 1
                # continue
        end = time.time()
        duration = '%.2f' % float(end - start)
        print('Finished! Time elapsed: {0} seconds'.format(duration))
        print('Number of sentences in file: {0}'.format(file_sents))
        print('Number of failed sentences: {0}'.format(error_num))

# Combined DMII lookup table, used for lemma lookup of tokens that carry
# no lemma of their own
DMII_combined = DMII_data.load_json('combined')

if __name__ == '__main__':
    print_data()