Beispiel #1
0
def _is_quoted_candidate(token):
    """Return True if slot 0 starts with '"_' and slot 1 is 5 to 19 characters long."""
    return token[0].startswith('\"_') and 4 < len(token[1]) < 20


def _is_recommendation(token, recommendations):
    """Return True if the token matches the recommendation list or is a quoted candidate."""
    return (token[1].lower() in recommendations
            or token[0].lower() in recommendations
            or _is_quoted_candidate(token))


def _relabel(token, suffix, tag='kA'):
    """Copy slot 0 into slot 1, store `tag` in slot 2 and append `suffix` to slot 0."""
    token[1] = token[0]
    token[2] = tag
    token[0] += suffix


def _relabel_actor(token, not_actors):
    """Like `_relabel` for '_ACTOR', but skip the suffix for words listed in `not_actors`."""
    token[1] = token[0]
    token[2] = 'kA'
    if token[0] not in not_actors:
        token[0] += '_ACTOR'


def _relabel_state(token):
    """Strip the two-character quote wrapper (if present) and label the token '_STATE'."""
    if token[0].startswith('\"_'):
        token[0] = token[0][2:-2]
    _relabel(token, '_STATE')


def executeNER(buffered_sentences):
    """Mark actors, recommendations and prices in the given sentences.

    Each sentence is first passed through PipelineUtils.connectQuotes and
    PipelineUtils.connectParenthesis. Every token of the result is then tested
    against an ordered chain of rules; the first matching rule rewrites the
    token in place and appends '_ACTOR', '_STATE' or '_PRICE' to slot 0.

    Tokens are indexed as mutable sequences of at least three items. Slot 2 is
    the tag; slots 0 and 1 are presumably the surface form and the lemma
    (NOTE(review): confirm against PipelineUtils.bufferSentences).

    Args:
        buffered_sentences: iterable of sentences as accepted by
            PipelineUtils.connectQuotes.

    Returns:
        A list with one connectTokens(...) result per input sentence.
    """
    new_sentences = []

    # Load both lookup lists; the files are closed as soon as they are read.
    # loadFromFile must return a list, since its result is concatenated with
    # the module-level lists below.
    with open('ner_data/named_entities.data', 'r') as entities_file:
        named_entities = loadFromFile(entities_file)
    with open('ner_data/recommendations.data', 'r') as recommendations_file:
        recommendations = loadFromFile(recommendations_file)

    # Words that receive the 'kA' tag but never the '_ACTOR' suffix.
    # Loop-invariant, so it is built once instead of once per token.
    not_actors = NER_STOPWORDS + NUM_FOLLOW + recommendations

    for sentence in buffered_sentences:
        # connect parenthesis and quotes; name 's' for brevity sake
        s = PipelineUtils.connectParenthesis(PipelineUtils.connectQuotes(sentence))

        # Rules look at neighbours (s[i-1], s[i+1]), hence the index loop.
        for i in range(len(s)):
            tok = s[i]

            # ----- NAMED ENTITIES -----
            # listed in the named entities file
            if tok[0] in named_entities:
                _relabel_actor(tok, not_actors)
            # "Komerční banka" exception: mark both tokens as one actor
            elif i > 0 and s[i-1][1] in ['komerční', 'Komerční'] and tok[1] == 'banka':
                if not s[i-1][0].endswith('_ACTOR'):
                    s[i-1][0] += '_ACTOR'
                tok[0] = 'banka_ACTOR'
                # copy tag value
                s[i-1][2] = tok[2][:]
            # not sentence-initial: first letter upper case, second lower case.
            # The length test comes first so an empty token cannot raise
            # IndexError on tok[0][0].
            elif (i > 0 and len(tok[0]) > 2 and tok[0][0].isupper()
                  and tok[0][1].islower()
                  and tok[0].lower() not in recommendations):
                _relabel_actor(tok, not_actors)
            # at least two characters, all cased characters upper case
            elif len(tok[0]) > 1 and tok[0].isupper():
                _relabel_actor(tok, not_actors)
            # first letter lower case, second upper case
            elif len(tok[0]) > 1 and tok[0][0].islower() and tok[0][1].isupper():
                _relabel(tok, '_ACTOR')
            # first token followed by '-' / ':' or by a parenthesised token
            # NOTE(review): `in '-:'` is a substring test, so an empty slot 1
            # also matches -- confirm whether that is intended.
            elif (i == 0 and len(s) > 1
                  and (s[1][1] in '-:' or s[1][0].startswith('('))
                  and tok[1].lower() != 'akcie'):
                _relabel(tok, '_ACTOR')
            # parenthesised token: actor only if the part before '_' is upper case
            elif tok[0].startswith('('):
                tok[1] = tok[0]
                if tok[1].split('_')[0].isupper():
                    tok[0] += '_ACTOR'

            # ----- RECOMMENDATIONS -----
            # preposition/conjunction/comma + recommendation
            elif (_is_recommendation(tok, recommendations) and i > 0
                  and ('k7' in s[i-1][2] or 'k8' in s[i-1][2] or s[i-1][1] in ',;:')):
                _relabel_state(tok)
            # recommendation prefix + recommendation
            elif (_is_recommendation(tok, recommendations) and i > 0
                  and s[i-1][1].lower() in RECOMMENDATION_PREFIX):
                _relabel_state(tok)
            # 'nákupní', 'prodejní' + doporučení
            elif (i + 1 < len(s) and tok[1].lower() in ['nákupní', 'prodejní']
                  and s[i+1][1].lower() == 'doporučení'):
                _relabel(tok, '_STATE', tag=s[i+1][2])
                s[i+1][0] += '_STATE'
            # in recommendation list and second-to-last token of the sentence
            elif i == len(s) - 2 and (tok[0] in recommendations or _is_quoted_candidate(tok)):
                _relabel_state(tok)

            # ----- PRICE -----
            # number that is not preceded by a NON_PRICE word
            elif REAL_NUMBER_PATTERN.match(tok[0]) and (i == 0 or s[i-1][1].lower() not in NON_PRICE):
                _relabel(tok, '_PRICE', tag='k4')

        # add to new sentences
        new_sentences.append(connectTokens(s))

    return new_sentences
Beispiel #2
0
#!/usr/bin/env python
# script for running NER
import sys

from lib.pipeline_utils import PipelineUtils
from lib.ner.ner import executeNER, changePOSTags

# Read the whole input up front: from a fixed development file when any
# command-line argument is given, otherwise from standard input.
if len(sys.argv) > 1:
    # for development purposes on my PC
    with open('../data/desamb_out_3', 'r') as f:
        lines = f.readlines()
else:
    lines = sys.stdin.readlines()

# Pipeline: buffer sentences -> NER -> change POS tags -> format output.
PipelineUtils.formatDesambOutput(changePOSTags(executeNER(PipelineUtils.bufferSentences(lines))))