def _mark_actor(token, blocked=()):
    """Tag ``token`` in place as a named entity ('kA').

    Slot 1 is overwritten with the current content of slot 0, slot 2 is set
    to 'kA', and '_ACTOR' is appended to slot 0 unless slot 0 is found in
    ``blocked``.
    """
    token[1] = token[0]
    token[2] = 'kA'
    if token[0] not in blocked:
        token[0] += '_ACTOR'


def _mark_state(token):
    """Tag ``token`` in place as a recommendation ('_STATE' suffix, tag 'kA').

    A token whose slot 0 starts with '"_' first has its two leading and two
    trailing characters removed (presumably the joined quote marks produced
    by PipelineUtils.connectQuotes -- confirm against that helper).
    """
    if token[0].startswith('\"_'):
        token[0] = token[0][2:-2]
    token[1] = token[0]
    token[2] = 'kA'
    token[0] += '_STATE'


def _is_quoted_phrase(token):
    """Return True for a '"_'-prefixed token whose slot 1 is 5-19 chars long."""
    return token[0].startswith('\"_') and 4 < len(token[1]) < 20


def _is_state_candidate(token, recommendations):
    """Return True if slot 1 or slot 0 (lower-cased) is a known recommendation,
    or the token is a quoted phrase (see ``_is_quoted_phrase``)."""
    return (token[1].lower() in recommendations
            or token[0].lower() in recommendations
            or _is_quoted_phrase(token))


def executeNER(buffered_sentences):
    """Mark actors, recommendations and prices in every buffered sentence.

    Each sentence is passed through PipelineUtils.connectQuotes and
    PipelineUtils.connectParenthesis and is then treated as a sequence of
    mutable tokens with at least three slots. Slot 2 holds the tag; slots 0
    and 1 look like word form and lemma (assumption -- confirm with the
    tokenizer). Tokens are modified in place: recognised tokens get an
    '_ACTOR', '_STATE' or '_PRICE' suffix on slot 0 and a 'kA' / 'k4' tag.
    The first matching rule wins for each token.

    Returns a list with ``connectTokens(sentence)`` for every input sentence,
    in input order.
    """
    new_sentences = []

    # Load the lookup data with context managers so the file handles are
    # closed again. This relies on loadFromFile reading eagerly; the result
    # for recommendations must already be a concrete sequence because it is
    # concatenated with ``+`` below.
    with open('ner_data/named_entities.data', 'r') as entity_file:
        named_entities = loadFromFile(entity_file)
    with open('ner_data/recommendations.data', 'r') as recommendation_file:
        recommendations = loadFromFile(recommendation_file)

    # Words that may be tagged 'kA' but never receive the '_ACTOR' suffix.
    # Built once instead of once per matching token.
    non_actor_words = NER_STOPWORDS + NUM_FOLLOW + recommendations

    for sentence in buffered_sentences:
        # connect parenthesis and quotes; name 's' for brevity sake
        s = PipelineUtils.connectParenthesis(PipelineUtils.connectQuotes(sentence))

        for i, tok in enumerate(s):
            # Conditions only read the token; it is mutated only inside the
            # selected branch, so caching the word for the tests is safe.
            word = tok[0]
            prev = s[i - 1] if i > 0 else None

            # ----- NAMED ENTITIES -----
            # listed in the named entities data
            if word in named_entities:
                _mark_actor(tok, non_actor_words)

            # komercni banka exception: tag both words as one actor
            elif (prev is not None
                  and prev[1] in ['komerční', 'Komerční']
                  and tok[1] == 'banka'):
                if not prev[0].endswith('_ACTOR'):
                    prev[0] += '_ACTOR'
                tok[0] = 'banka_ACTOR'
                # copy tag value
                prev[2] = tok[2][:]

            # upper case first, second lower case, not at sentence start;
            # the length test comes first so an empty token cannot raise
            # IndexError on the character look-ups
            elif (prev is not None
                  and len(word) > 2
                  and word[0].isupper()
                  and word[1].islower()
                  and word.lower() not in recommendations):
                _mark_actor(tok, non_actor_words)

            # at least two characters, all cased ones upper case
            elif len(word) > 1 and word.isupper():
                _mark_actor(tok, non_actor_words)

            # first small, next upper
            elif len(word) > 1 and word[0].islower() and word[1].isupper():
                _mark_actor(tok)

            # first token followed by '-' / ':' or by a parenthesised token.
            # NOTE(review): ``in '-:'`` is a substring test, so an empty
            # slot 1 and the literal '-:' match as well; kept as in the
            # original.
            elif (i == 0
                  and len(s) > 1
                  and (s[1][1] in '-:' or s[1][0].startswith('('))
                  and s[0][1].lower() != 'akcie'):
                _mark_actor(tok)

            # starts with '(' and the part before the first '_' is upper case
            elif word.startswith('('):
                tok[1] = tok[0]
                # The original wrapped this test in a bare ``except: pass``;
                # split()/isupper() cannot fail here because startswith()
                # already succeeded on the same value.
                if tok[1].split('_')[0].isupper():
                    tok[0] += '_ACTOR'

            # ----- RECOMMENDATIONS -----
            # recommendation preceded by a preposition / conjunction tag
            # ('k7' / 'k8'), by ',', ';' or ':', or by a recommendation prefix
            elif (_is_state_candidate(tok, recommendations)
                  and prev is not None
                  and ('k7' in prev[2]
                       or 'k8' in prev[2]
                       or prev[1] in ',;:'
                       or prev[1].lower() in RECOMMENDATION_PREFIX)):
                _mark_state(tok)

            # 'nákupní', 'prodejní' + doporučení
            elif (i + 1 < len(s)
                  and tok[1].lower() in ['nákupní', 'prodejní']
                  and s[i + 1][1].lower() == 'doporučení'):
                following = s[i + 1]
                tok[1] = tok[0]
                tok[2] = following[2]
                tok[0] += '_STATE'
                following[0] += '_STATE'

            # recommendation in the second-to-last position of the sentence
            elif (i == len(s) - 2
                  and (word in recommendations or _is_quoted_phrase(tok))):
                _mark_state(tok)

            # ----- PRICE -----
            # real number that does not follow a NON_PRICE word
            elif (REAL_NUMBER_PATTERN.match(word)
                  and (prev is None or prev[1].lower() not in NON_PRICE)):
                tok[1] = tok[0]
                tok[2] = 'k4'
                tok[0] += '_PRICE'

        # add to new sentences
        new_sentences.append(connectTokens(s))

    return new_sentences
#!/usr/bin/env python
# script for running NER
#
# Reads desamb output, runs executeNER and changePOSTags over the buffered
# sentences and hands the result to PipelineUtils.formatDesambOutput.
# Input comes from stdin; passing any command-line argument switches to a
# fixed development file instead.

import sys

from lib.pipeline_utils import PipelineUtils
from lib.ner.ner import executeNER, changePOSTags

if len(sys.argv) > 1:
    # for development purposes on my PC
    # (context manager so the file handle is closed after reading)
    with open('../data/desamb_out_3', 'r') as f:
        lines = f.readlines()
else:
    lines = sys.stdin.readlines()

PipelineUtils.formatDesambOutput(
    changePOSTags(executeNER(PipelineUtils.bufferSentences(lines))))