def eventextraction_finance_v1():
    ''' Fire event extraction service, v1.2 '''
    json_data = request.get_json()
    # print(json_data)
    result = {}
    # Validate input parameters
    if 'app_key' in json_data:
        if json_data['app_key'] != 'masweb_demo':
            result['code'] = settings.CODE_ERROR
            result['msg'] = settings.MSG_ERROR_PARSE + \
                ': app_key is {}.'.format(json_data['app_key'])
            result['time'] = str(int(time.time()))
            return jsonify(result)
    else:
        result['code'] = settings.CODE_ERROR
        result['msg'] = settings.MSG_NO_PARSE + ': app_key'
        result['time'] = str(int(time.time()))
        return jsonify(result)
    if 'func' in json_data:
        for func in json_data['func']:
            if func not in settings.FUNC_LIST:
                print(func)
                result['code'] = settings.CODE_ERROR
                result['msg'] = settings.MSG_ERROR_PARSE + \
                    ': {} in func'.format(func)
                result['time'] = str(int(time.time()))
                return jsonify(result)
    else:
        result['code'] = settings.CODE_ERROR
        result['msg'] = settings.MSG_NO_PARSE + ': func'
        result['time'] = str(int(time.time()))
        return jsonify(result)
    news = json_data['body']['text']
    print(type(news))
    # Parameters passed validation, so the call succeeds
    result['code'] = settings.CODE_SUCCESS
    result['msg'] = settings.MSG_SUCCESS
    result['timestamp'] = str(int(time.time()))
    result['body'] = {}
    nlp = StanfordNER(news)
    print(nlp)
    # Build the response body according to the requested functions
    if 'ner' in json_data['func']:
        result['body']['ner'] = NER(nlp).ner
    if 'event' in json_data['func']:
        event = EventExtraction(news, nlp)
        result['body']['event_extraction'] = event.event
    if 'graph' in json_data['func']:
        result['body']['graph'] = DataToGraph(event).graph
    return jsonify(result)
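# A minimal request sketch for the handler above. The route and port are
# assumptions (the @app.route decorator is not shown); the payload keys mirror
# exactly what the handler reads: 'app_key', 'func', and 'body' -> 'text'.
# The sample sentence is illustrative only.
import requests

payload = {
    "app_key": "masweb_demo",            # must match the expected key
    "func": ["ner", "event", "graph"],   # each entry must appear in settings.FUNC_LIST
    "body": {"text": "A fire broke out in a warehouse on Monday evening."},
}
resp = requests.post("http://localhost:5000/eventextraction", json=payload)
print(resp.json())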
def main(argv):
    if len(argv) < 3:
        usage(argv)
    dic = False
    freq = False
    own_tag = False
    # Parse optional flags: -d (dictionaries), -f (frequency tagging), -t (own tags)
    if len(argv) >= 4:
        if argv[3] == "-d":
            dic = True
        if argv[3] == "-f":
            freq = True
        if argv[3] == "-t":
            own_tag = True
    if len(argv) >= 5:
        if argv[4] == "-f":
            freq = True
        if argv[4] == "-t":
            own_tag = True
    if len(argv) >= 6:
        if argv[5] == "-t":
            own_tag = True
    ex = Util.read_file(argv[1])
    ex = Util.transform_text(ex)
    models = ["data/location.txt", "data/person.txt", "data/organisation.txt"]
    # Lexical analysis
    lexer = Lexer(ex, own_tag)
    lexer.lex()
    # Syntactic analysis
    parser = Parser(lexer.get_tokenized_text(), own_tag)
    parser.parse()
    # Semantic analysis + named-entity recognition
    ner = NER(ex, parser.get_parsed_text())
    if dic:
        ner.gen_models(models)
    ner.apply()
    # Tag the text
    tagger = Tagger(ner.get_ner(), ex)
    if freq:
        tagger.freq_tag(argv[2])
    else:
        tagger.tag(argv[2])
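# A hedged entry-point sketch for main() above: the script is presumably run
# from the command line with an input file, an output file, and the optional
# -d/-f/-t flags. The module name is a placeholder.
import sys

if __name__ == '__main__':
    # e.g.  python main.py input.txt output.txt -d -f -t
    main(sys.argv)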
def dispmsg(self):
    name_label2 = ttk.Label(
        self.window,
        text="File with the queried intents is downloaded at " + str(self.name_var.get()),
        font=('Times New Roman', 10, 'normal'))
    name_label2.grid(row=10, column=1, padx=5, pady=10)
    if str(self.name_var1.get()) != '':
        learning = 'active'
        Data, UserFnames = Read_Files(str(self.name_var.get()),
                                      learning=learning,
                                      vertical=str(self.vertical.get()).lower())
        Data_Frame = pd.DataFrame(Data, columns=['FileName', 'FilePath', 'Text'])
        Data_Frame = NER(Data_Frame)
        kf = []
        for ind in Data_Frame.index:
            text = Data_Frame['Text'][ind]
            tr4w = TextRank4Keyword()
            tr4w.analyze(text, candidate_pos=['NOUN', 'PROPN'], window_size=4, lower=False)
            kf.append(tr4w.get_keywords(100))
        Data_Frame['KeyPhrases'] = kf
        name = str(self.vertical.get()).lower()
        endpoint = "https://<EndPoint>.search.windows.net"
        key = "<Cognitive search key>"
        if name == 'default':
            create_index(name, endpoint, key)
        upload_docs(Data_Frame=Data_Frame, index_name=name, endpoint=endpoint, key=key)
        result = search(rootdir=str(self.name_var.get()),
                        Query=str(self.name_var1.get()),
                        index_name=name,
                        endpoint=endpoint,
                        key=key,
                        fnames=UserFnames,
                        vertical=str(self.vertical.get()).lower())
        if name == 'default':
            from azure.search.documents.indexes import SearchIndexClient
            from azure.core.credentials import AzureKeyCredential
            client = SearchIndexClient(endpoint, AzureKeyCredential(key))
            client.delete_index(name)
    elif str(self.name_var1.get()) == '' and str(self.classes.get()) != 'None':
        learning = 'passive'
        Data, UserFnames = Read_Files(str(self.name_var.get()), learning=learning, vertical=None)
        Data_Frame = pd.DataFrame(Data, columns=['FileName', 'FilePath', 'Text'])
        result = classifier(dataframe=Data_Frame,
                            classs=str(self.classes.get()),
                            rootdir=str(self.name_var.get()))
    else:
        pass
def analyze(self):
    logging.info('*******************************************************')
    result_dict = {}
    result_dict['source'] = self.source.strip().lower()
    result_dict['q_type'] = self.s_type.strip().lower()
    res = model.predict(sentence=self.sentence)
    root_dict = res['hierplane_tree']['root']
    logging.info('sentence {} parsed as {}'.format(self.sentence, root_dict))
    emb = elmo(batch_to_ids([self.sentence.split()]))['elmo_representations'][0].detach().numpy()
    parse_tree = ParseTree(root_dict, self.sentence)
    # logging.info('ParseTree type is: {}'.format(parse_tree.get_type()))
    # parse_tree.iterate()
    logging.info(
        'Now it\'s time to check the string representation \n{}'.format(
            str(parse_tree.root)))
    # parse_tree.analyze()
    logging.info('extracting information')
    all_nodes = set()
    all_intent_nodes = set()
    all_desc_nodes = set()
    toponyms = NER.extract_place_names(self.sentence)
    result_dict['pnames'] = toponyms
    topo_nodes = set()
    for t in toponyms:
        logging.info('\ttoponym:\t{}'.format(t))
        nodes = parse_tree.find(t)
        if nodes is None:
            logging.info('An error in finding nodes')
        else:
            for n in nodes:
                n.role = 'n'
                topo_nodes.add(n)
    for t_node in topo_nodes:
        logging.info('\t**Found Node: {} and index {}'.format(
            t_node.word, t_node.index))
    all_nodes = all_nodes.union(topo_nodes)
    all_desc_nodes = all_desc_nodes.union(topo_nodes)
    dates = NER.extract_dates(self.sentence)
    result_dict['dates'] = dates
    dates_nodes = set()
    for d in dates:
        logging.info('\tdate:\t{}'.format(d))
        nodes = parse_tree.find(d)
        if nodes is None:
            logging.info('An error in finding nodes')
        else:
            for n in nodes:
                n.role = 'd'
                dates_nodes.add(n)
    for d_node in dates_nodes:
        logging.info('\t**Found Node: {} and index {}'.format(
            d_node.word, d_node.index))
    all_nodes = all_nodes.union(dates_nodes)
    all_desc_nodes = all_desc_nodes.union(dates_nodes)
    whs_nodes = parse_tree.get_intent()
    whs = []
    for wh_node in whs_nodes:
        wh_node.role = intent_encoding(wh_node, PRONOUN)
        whs.append(wh_node.word)
    for w in whs:
        logging.info('intent is: {}'.format(w))
    all_nodes = all_nodes.union(whs_nodes)
    all_intent_nodes = all_intent_nodes.union(whs_nodes)
    result_dict['intents'] = whs
    a_entities_set = set()
    a_entities_nodes = set()
    a_types = []
    a_types_nodes = set()
    for whs_node in whs_nodes:
        wh_nouns = whs_node.iterate_nouns()
        wh_nouns.sort(key=sort_function, reverse=True)
        for n in wh_nouns:
            if not is_inside(n.word, toponyms) and not is_inside(
                    n.word, dates) and not is_left_inside(
                        n.word, a_types) and is_a_new_one(a_types_nodes, n):
                if is_left_inside(n.word.lower().strip(),
                                  pt_set) or is_left_inside(
                                      n.word.lower().strip(), pt_dict.keys()):
                    a_types.append(n.word)
                    n.role = 't'
                    a_types_nodes.add(n)
                elif ' ' not in n.word.strip() and len(n.word) > 2:
                    a_entities_set.add(n.word)
                    n.role = 'o'
                    a_entities_nodes.add(n)
    for t in a_types:
        logging.info('\ttype in intent:\t{}'.format(t))
    a_entities = list(a_entities_set)
    for e in a_entities:
        logging.info('\tentity in intent:\t{}'.format(e))
    all_nodes = all_nodes.union(a_types_nodes)
    all_intent_nodes = all_intent_nodes.union(a_types_nodes)
    all_nodes = all_nodes.union(a_entities_nodes)
    all_intent_nodes = all_intent_nodes.union(a_entities_nodes)
    result_dict['i_objects'] = a_entities
    result_dict['i_ptypes'] = a_types
    nouns = parse_tree.get_nouns()
    nouns.sort(key=sort_function, reverse=True)
    types = []
    types_nodes = set()
    entities_set = set()
    entities_nodes = set()
    for n in nouns:
        if not is_inside(n.word, toponyms) and not is_inside(
                n.word, dates) and not is_inside(
                    n.word, whs) and not is_left_inside(
                        n.word, types) and is_a_new_one(types_nodes, n):
            if is_left_inside(n.word.lower().strip(),
                              pt_set) or is_left_inside(
                                  n.word.lower().strip(), pt_dict.keys()):
                types.append(n.word)
                n.role = 't'
                types_nodes.add(n)
            elif ' ' not in n.word.strip() and len(n.word) > 2:
                entities_set.add(n.word)
                n.role = 'o'
                entities_nodes.add(n)
    for t in types:
        logging.info('\ttype:\t{}'.format(t))
    entities = list(entities_set)
    for e in entities:
        logging.info('\tentity:\t{}'.format(e))
    all_nodes = all_nodes.union(types_nodes)
    all_desc_nodes = all_desc_nodes.union(types_nodes)
    all_nodes = all_nodes.union(entities_nodes)
    all_desc_nodes = all_desc_nodes.union(entities_nodes)
    result_dict['objects'] = entities
    result_dict['ptypes'] = types
    verbs = parse_tree.get_verbs()
    situations = []
    situations_nodes = set()
    activities = []
    activities_nodes = set()
    unknowns = []
    unknowns_nodes = set()
    for v in verbs:
        v_index = self.sentence.split().index(v.word)
        v_emb = [emb[0][v_index]]
        logging.debug('verb is {} and len of emb is {}'.format(
            v.word, len(v_emb)))
        decision = verb_encoding(v_emb, actv_emb, stav_emb)
        if decision == "a":
            activities.append(v.word)
            v.role = 'a'
            activities_nodes.add(v)
        elif decision == "s":
            situations.append(v.word)
            v.role = 's'
            situations_nodes.add(v)
        else:
            unknowns.append(v.word)
            unknowns_nodes.add(v)
    for s in situations:
        logging.info('\tsituation: {}'.format(s))
    for a in activities:
        logging.info('\tactivities: {}'.format(a))
    for u in unknowns:
        logging.info('\tunknown: {}'.format(u))
    all_nodes = all_nodes.union(activities_nodes)
    all_desc_nodes = all_desc_nodes.union(activities_nodes)
    all_nodes = all_nodes.union(situations_nodes)
    all_desc_nodes = all_desc_nodes.union(situations_nodes)
    result_dict['situations'] = situations
    result_dict['activities'] = activities
    pps = parse_tree.get_pps()
    relations = []
    relation_nodes = set()
    for pp in pps:
        for n in toponyms:
            if 'with' in pp.word.lower():
                is_within = is_within_phrase(pp.word)
                if is_within is not None:
                    in_pp = pp.get_in_in_pp()
                    if in_pp is not None:
                        relations.append(in_pp.word)
                        in_pp.role = 'r'
                        relation_nodes.add(in_pp)
            if n in pp.word and not is_inside_right(
                    pp.word, entities) and not is_inside_right(
                        pp.word, a_entities):
                in_pp = pp.get_in_in_pp()
                if in_pp is not None:
                    relations.append(in_pp.word)
                    in_pp.role = 'r'
                    relation_nodes.add(in_pp)
                break
        for t in types:
            if t in pp.word:
                in_pp = pp.get_in_in_pp()
                if in_pp is not None:
                    relations.append(in_pp.word)
                    in_pp.role = 'r'
                    relation_nodes.add(in_pp)
                break
    all_nodes = all_nodes.union(relation_nodes)
    all_desc_nodes = all_desc_nodes.union(relation_nodes)
    for relation in relations:
        logging.info('\trelation: {}'.format(relation))
    result_dict['relations'] = relations
    adjs = parse_tree.get_adjectives()
    qualities = []
    qualities_nodes = set()
    object_qualities = []
    object_qualities_nodes = set()
    for adj in adjs:
        siblings = adj.get_siblings()
        for sibling in siblings:
            if is_inside(sibling.word, toponyms) or is_inside(
                    sibling.word, types) or is_inside(sibling.word, a_types):
                if not is_inside(adj.word, types) and not is_inside(
                        adj.word, a_types):
                    qualities.append(adj.word)
                    adj.role = 'q'
                    qualities_nodes.add(adj)
                break
            elif is_inside(sibling.word, entities) or is_inside(
                    sibling.word, a_entities):
                object_qualities.append(adj.word)
                adj.role = 'p'
                object_qualities_nodes.add(adj)
                break
    all_nodes = all_nodes.union(qualities_nodes)
    all_desc_nodes = all_desc_nodes.union(qualities_nodes)
    all_nodes = all_nodes.union(object_qualities_nodes)
    all_desc_nodes = all_desc_nodes.union(object_qualities_nodes)
    for q in qualities:
        logging.info('\tquality: {}'.format(q))
    for oq in object_qualities:
        logging.info('\tobject quality: {}'.format(oq))
    result_dict['pqualities'] = qualities
    result_dict['oqualities'] = object_qualities
    # coding schema: where: 1, what: 2, which: 3, why: 4, how: 5, how+adj: 6 etc. make it complete... other: 0...
    # ...activity: a, situation: s, quality: q, object_quality: p, relation: r, toponym: n, type: t, date: d
    ignored_nodes = []
    leaves = parse_tree.get_leaves()
    for leaf in leaves:
        if leaf.is_unknown():
            ignored_nodes.append(leaf)
    temp = []
    for leaf in ignored_nodes:
        flag = True
        for n in all_nodes:
            if n.is_fuzzy_matched:
                if leaf.word in n.word:
                    flag = False
                    break
            else:
                if n.is_your_child(leaf):
                    flag = False
                    break
        if flag:
            temp.append(leaf)
            all_nodes.add(leaf)
    # ignored_nodes = temp
    all_list = list(all_nodes)
    intent_list = list(all_intent_nodes)
    description_list = list(all_desc_nodes)
    all_list.sort(key=lambda x: x.index, reverse=False)
    intent_list.sort(key=lambda x: x.index, reverse=False)
    description_list.sort(key=lambda x: x.index, reverse=False)
    intent_code = ''
    intent_info = []
    for node in intent_list:
        intent_code += node.role
        if node.is_fuzzy_matched:
            intent_info.append({'tag': node.role, 'value': node.fuzzy_word})
        else:
            intent_info.append({'tag': node.role, 'value': node.word})
    desc_code = ''
    desc_info = []
    for node in description_list:
        desc_code += node.role
        if node.is_fuzzy_matched:
            desc_info.append({'tag': node.role, 'value': node.fuzzy_word})
        else:
            desc_info.append({'tag': node.role, 'value': node.word})
    if Sentence.is_ambiguous(intent_list, intent_code):
        logging.info(
            'the intention is ambiguous, code: {}'.format(intent_code))
        resolved = Sentence.resolving_intent(desc_info)
        result_dict['resolved_intent'] = resolved
        if resolved['code'] != '':
            intent_code += resolved['code']
            intent_info.extend(resolved['list'])
            desc_temp_list = []
            for d in desc_info:
                if d not in resolved['list']:
                    desc_temp_list.append(d)
                else:
                    logging.debug('found!')
            desc_code = desc_code.replace(resolved['code'], '', 1)
            desc_info = desc_temp_list
            logging.debug('updated...')
    result_dict['intent_code'] = intent_code
    result_dict['intent_info'] = intent_info
    result_dict['desc_code'] = desc_code
    result_dict['desc_info'] = desc_info
    all_code = ''
    all_info = []
    for node in all_list:
        all_code += node.role
        if node.is_fuzzy_matched:
            all_info.append({'tag': node.role, 'value': node.fuzzy_word})
        else:
            all_info.append({'tag': node.role, 'value': node.word})
    result_dict['all_code'] = all_code
    result_dict['all_info'] = all_info
    logging.info('\tintent code is: {}'.format(intent_code))
    logging.info('\tdesc code is: {}'.format(desc_code))
    logging.info('\tall code is: {}'.format(all_code))
    logging.info('*******************************************************')
    return result_dict
from ner import NER
from flask import Flask
from flask import request
from flask import jsonify
import os
import json

app = Flask(__name__)
ner_model = NER(os.environ['SPACY_MODEL'])


@app.route('/')
def hello():
    return "Hello World!"


@app.route("/ner", methods=["GET", "POST"])
def ner_request():
    if request.method == "POST":
        req = request.get_json()
        if req["version"] == 1:
            entity_mentions = ner_model.spacy_ner(req["content"])
            return jsonify({"entities": entity_mentions})
        else:
            return "The current version is not supported."
    elif request.method == "GET":
        return "Hello World from NER!"


if __name__ == '__main__':
    app.run(host="0.0.0.0")  # JSON
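# A minimal client sketch for the service above, assuming it runs on Flask's
# default port 5000. The payload mirrors the fields ner_request() reads
# ('version' and 'content'); the sample sentence is illustrative only.
import requests

resp = requests.post(
    "http://localhost:5000/ner",
    json={"version": 1, "content": "Angela Merkel visited Paris in March 2019."},
)
print(resp.json())  # expected shape: {"entities": [...]}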
def lambda_handler(event, context):
    # Recognize the words in the incoming reviews
    recognized_reviews = NER(event, False).recognize_words()
    # Return the top words with their sentiment scores
    final_json = Sentiment(recognized_reviews, 0.5, 0.5, 0.2).calculate_final_json()
    return final_json
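# A hedged local-invocation sketch: the handler above does not show the shape
# of `event`, so the sample payload below is only an assumption for ad-hoc testing.
if __name__ == "__main__":
    sample_event = {"reviews": ["Great battery life.", "The screen scratches far too easily."]}
    print(lambda_handler(sample_event, context=None))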
import utils
import time
import re
import os
from ner import NER
from email_sender import send_mail

extractor = NER()
pd = utils.pd
import run_api

xls = pd.ExcelFile('strings stems.xlsx')
first = pd.read_excel(xls, 'first').dropna(axis=1, how='all').dropna(axis=0, how='all')
parent_second = pd.read_excel(xls, 'parent second').dropna(axis=1, how='all').dropna(axis=0, how='all')
tutoring = pd.read_excel(xls, 'Tutoring').dropna(axis=1, how='all').dropna(axis=0, how='all')
bad_keywords = pd.read_excel(xls, 'bad keywords').dropna(axis=1, how='all').dropna(axis=0, how='all')
at_least_another = pd.read_excel(xls, 'at least another').dropna(axis=1, how='all').dropna(axis=0, how='all')
payments = pd.read_excel(xls, 'payments').dropna(axis=1, how='all').dropna(axis=0, how='all')
contract = pd.read_excel(xls, 'contract').dropna(axis=1, how='all').dropna(axis=0, how='all')
def namedEntityRecognition(self):
    ne = NER(self.original_query)
    self.entities = ne.performNER()
    self.named_entities = [entity.lower() for entity in self.entities]
def find_events(question):
    return NER.extract_events(question)


def find_toponyms(question):
    return NER.extract_place_names(question)


def find_dates(question):
    return NER.extract_dates(question)
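# A small usage sketch for the three wrappers above, assuming NER exposes the
# static extractors they call; the question and the expected outputs noted in
# the comments are illustrative only.
question = "Which parks near Amsterdam hosted festivals in July 2019?"
print(find_toponyms(question))  # e.g. place names such as "Amsterdam"
print(find_dates(question))     # e.g. "July 2019"
print(find_events(question))    # e.g. event mentions, depending on the extractor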
def read(self, textFile, mentionsFile):
    """Read files containing the text and mentions, returning an object of them
    having 'text' and 'cluster list' as attributes"""
    ls = []
    text = ""
    mentions = ""
    tuples = []
    mentionClustersList = []
    clusterCount = 0
    mentionToClusterMap = {}
    with open(self.__path + textFile) as f_text:
        text = f_text.read()
    temp = text.splitlines()
    if len(temp[-1]) == 0:
        temp.pop()
    text = " ".join(temp)
    self.__parser.process(text)
    dependenciesList = self.__parser.getDependencies()
    print "Index Map to be used when creating mentions file:"
    for i, j in enumerate(text):
        print i, j
    raw_input("\nPlease enter the indices of the mentions in the mentions file: <Press enter to continue process>")
    with open(self.__path + mentionsFile) as f_mention:
        mentions = f_mention.read()
    ls = mentions.splitlines()
    if len(ls[-1]) == 0:
        ls.pop()
    for line in ls:
        line = line.split()
        new_tuple = int(line[0]), int(line[1])
        tuples.append(new_tuple)
    for element in tuples:
        left = element[0]
        right = element[1]
        mentions = text[left:right]
        mentionClustersList.append(Cluster(mentions, element))
        mentionToClusterMap[element] = mentionClustersList[-1]
        clusterCount = clusterCount + 1
    n = NER()
    print "\nThis may take some time. Please wait...\n"
    n.process(self.__path + textFile)
    NERMap = n.getNERRelations()
    NESet = set(NERMap.keys())
    feature = Features(dependenciesList)
    self.__document.setText(text)
    self.__document.setTextPath(self.__path + textFile)
    self.__document.setMentionClustersList(mentionClustersList)
    self.__document.setFeatures(feature.getAppositiveRelations(),
                                feature.getCopulativeRelations())
    self.__document.setMentionToClusterMap(mentionToClusterMap)
    self.__document.setParserObject(self.__parser)
    self.__document.setNERMap(NERMap)
    self.__document.setNESet(NESet)
    return self.__document
from config import FoundationTrilogy
from preprocess import preprocess
from ner import NER
from entity_connections import LINK_ENTITIES
from normalization import normalize_list

if __name__ == '__main__':
    text = FoundationTrilogy
    parsed_list = preprocess(text)  # vector of preprocessed sentences
    predicted = NER(parsed_list)
    people_links, location_links, events = LINK_ENTITIES(parsed_list, predicted)
    people_links = normalize_list(people_links)
class Classification:
    nlp = None
    ner = None
    topicModel = None
    abreviacoesList = [['próx', 'próximo'], ['próx.', 'próximo'], ['prox', 'próximo'],
                       ['prox.', 'próximo'], ['px', 'próximo'], ['px.', 'próximo'],
                       ['av', 'Avenida'], ['av.', 'Avenida'], ['pça', 'Praça'],
                       ['sent', 'sentido'], ['sent.', 'sentido'], ['dª', 'Dona'],
                       ['dª.', 'Dona'], ['d.ª', 'Dona'], ['sta', 'Santa'],
                       ['vdt', 'Viaduto'], ['vdt.', 'Viaduto'], ['c\\', 'com'],
                       ['p\\', 'para'], ['nº', 'número'], ['ref', 'referência'],
                       ['ref.', 'referência'], ['elv', 'Elevado'], ['sra', 'Senhora'],
                       ['gde', 'Grande'], ['prof', 'Professor'], ['prof.', 'Professor'],
                       ['vtr', 'viatura'], ['vtr.', 'viatura'], ['r.', 'Rua']]

    def __init__(self, gazetteer, datasetFile, annotatedEntities, vocabularyFile):
        self.nlp = NLP()
        self.ner = NER(gazetteer, annotatedEntities)
        self.topicModel = TopicClassification(datasetFile, vocabularyFile)

    def preprocessing(self, sentence):
        newSentence = sentence
        newSentence = re.sub('R\.', 'Rua', newSentence)
        # Remove terms such as 'RT', 'km/h', 'km', 'mm', '13h40', '30 min'
        newSentence = re.sub('\d+\s*km/h', ' ', newSentence, flags=re.I)
        newSentence = re.sub('\d+\s*km', ' ', newSentence, flags=re.I)
        newSentence = re.sub('\d+h\d+', ' ', newSentence, flags=re.I)
        newSentence = re.sub('\d+h ', ' ', newSentence, flags=re.I)
        newSentence = re.sub('\d+hrs ', ' ', newSentence, flags=re.I)
        newSentence = re.sub('\d+\s*mm', ' ', newSentence)
        newSentence = re.sub('\s*RT ', ' ', newSentence)
        newSentence = re.sub('\d+\s*min\s', ' ', newSentence)
        newSentence = re.sub(r'\s(\w+)…', ' ', newSentence)
        # Normalize 'BR 040' to 'BR040'
        p = re.compile('BR\s*\d+')
        lista = p.findall(newSentence)
        for item in lista:
            newSentence = newSentence.replace(item, item.replace(' ', ''))
        # Remove URLs
        newSentence = re.sub(
            r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ', newSentence)
        # Remove hashtags and @user mentions
        newSentence = re.sub(r"(?:\@|https?\://)\S+", " ", newSentence)
        newSentence = re.sub(r"(#.*?)\s", " ", newSentence)
        # Remove special characters
        p = re.compile('(ª|º)')
        newSentence = p.sub(' ', newSentence)
        newSentence = re.sub('\W', ' ', newSentence)
        # Remove numbers
        newSentence = re.sub(' \d+', ' ', newSentence).lstrip()
        # Remove punctuation
        for pontuacao in string.punctuation:
            newSentence = newSentence.replace(pontuacao, ' ')
        # Expand abbreviations
        wordsList = newSentence.lower().split(" ")
        for word in self.abreviacoesList:
            if (word[0] in wordsList):
                newSentence = re.sub(word[0], word[1], newSentence, flags=re.I)
        # Remove extra whitespace
        newSentence = re.sub(' +', ' ', newSentence)
        return newSentence

    def classify(self, sentence):
        newSentence = self.preprocessing(sentence)
        sentenceTokens = self.nlp.tokenization(newSentence)
        results = self.ner.dictionaryNER(sentenceTokens)
        labelIOB = results[0]
        coordinate = results[1]
        topic = self.topicModel.predictTopic(newSentence)
        labelIOBnew = self.mergeResults(sentence, newSentence, labelIOB)
        return ([labelIOB, labelIOBnew, coordinate, topic])

    def mergeResults(self, sentence, preprocessedSentence, labelIOB):
        # print("\n" + sentence)
        # print(preprocessedSentence)
        # sentenceTokens = self.nlp.tokenization(sentence)
        newSentenceTokens = sentence.split(" ")
        preprocessedSentenceTokens = self.nlp.tokenization(preprocessedSentence)
        abreviacoes = list(map(operator.itemgetter(0), self.abreviacoesList))
        # newSentenceTokens = sentenceTokens
        #~ for idx, token in enumerate(sentenceTokens):
        #~     if (token == '.' and idx-1 > 0 and sentenceTokens[idx-1].lower() == 'av'):
        #~         newSentenceTokens.pop(idx)
        #~         newSentenceTokens[idx-1] = newSentenceTokens[idx-1]+'.'
        index = 0
        newLabelIOB = ""
        skipLoop = False
        for idx, item in enumerate(newSentenceTokens):
            if (skipLoop):
                skipLoop = False
                continue
            newItem = item
            for pontuacao in string.punctuation:
                newItem = newItem.replace(pontuacao, '')
            # print(newItem)
            if (index < len(preprocessedSentenceTokens)
                    and item not in string.punctuation):
                # print(newItem + " vs " + preprocessedSentenceTokens[index])
                if (newItem == preprocessedSentenceTokens[index]
                        or item.lower() in abreviacoes):
                    # print("<>> " + labelIOB[index])
                    newLabelIOB = newLabelIOB + labelIOB[index]
                    index += 1
                # Compound words
                elif (index + 1 < len(preprocessedSentenceTokens)
                      and item.find(preprocessedSentenceTokens[index]) != -1
                      and item.find(preprocessedSentenceTokens[index + 1]) != -1):
                    # print(">>>>>" + labelIOB[index] + " vs " + labelIOB[index+1] + " = "
                    #       + self.ner.andOperationIOB(labelIOB[index], labelIOB[index+1]))
                    newLabelIOB = newLabelIOB + self.ner.andOperationIOB(
                        labelIOB[index], labelIOB[index + 1])
                    index += 2
                # 'BR 262' versus 'BR262'
                elif (idx + 1 < len(newSentenceTokens)
                      and preprocessedSentenceTokens[index].find(item) != -1
                      and preprocessedSentenceTokens[index].find(
                          newSentenceTokens[idx + 1]) != -1):
                    # print(">>>>>" + labelIOB[index] + " = " + labelIOB[index])
                    if (labelIOB[index] == 'B'):
                        newLabelIOB = newLabelIOB + labelIOB[index] + 'I'
                    else:
                        newLabelIOB = newLabelIOB + labelIOB[index] + labelIOB[index]
                    index += 1
                    skipLoop = True
                else:
                    # print('O')
                    newLabelIOB = newLabelIOB + 'O'
            else:
                # print('O')
                newLabelIOB = newLabelIOB + 'O'
        if (len(newSentenceTokens) != len(newLabelIOB)):
            print("::::ERROR::: sentence size: " + str(len(newSentenceTokens)) +
                  " / label size: " + str(len(newLabelIOB)))
            newLabelIOB = self.excessoes(newLabelIOB, newSentenceTokens)
        # print(self.extractAnnotatedEntities(labelIOB, preprocessedSentenceTokens))
        # print(self.extractAnnotatedEntities(newLabelIOB, newSentenceTokens))
        # print(preprocessedSentenceTokens)
        # print(newSentenceTokens)
        return newLabelIOB

    def excessoes(self, newLabelIOB, sentenceTokens):
        lastB = 0
        for idx, token in enumerate(sentenceTokens):
            if (newLabelIOB[idx] == 'B'):
                lastB = idx
            # BOI sequence caused by punctuation
            if (idx + 1 < len(newLabelIOB) and token in string.punctuation
                    and (idx - lastB) <= 2 and newLabelIOB[idx + 1] == "I"):
                newLabelIOB = newLabelIOB[0:idx] + "I" + newLabelIOB[idx + 1:len(newLabelIOB)]
            # BOOI sequence caused by terms removed during preprocessing
            elif (newLabelIOB[idx] == "I" and (idx - lastB) > 2
                  and newLabelIOB[idx - 1] == 'O'):
                newLabelIOB = newLabelIOB[0:lastB] + "O" + newLabelIOB[lastB + 1:len(newLabelIOB)]
                newLabelIOB = newLabelIOB[0:idx] + "O" + newLabelIOB[idx + 1:len(newLabelIOB)]
        return newLabelIOB

    def extractAnnotatedEntities(self, patternLabel, sentenceTokens):
        occurrences = re.findall('(BI*)', patternLabel)
        entities = []
        newPattern = patternLabel
        indices = []
        for indx, occurrence in enumerate(occurrences):
            indexStart = newPattern.find(occurrences[indx])
            indices.append([indexStart, indexStart + len(occurrences[indx])])
            subs = ""
            for i in range(len(occurrences[indx])):
                subs = subs + 'X'
            newPattern = newPattern.replace(occurrences[indx], subs, 1)
        termo = []
        for i, idx in enumerate(indices):
            for position in range(idx[0], idx[1]):
                termo.append(sentenceTokens[position])
            entities.append(" ".join(termo).upper())
            termo = []
        return entities

    def teste(self):
        print(self.classify(
            '20h37 (+) / R. Pitangui / R. Alabastro / Av. Silviano Brandão / Av. Flávio dos Santos.'))
        # return
        print(self.classify('@g1 era porcelanato pelo menos?'))
        print(self.classify(
            'RT @g1: Luta contra leucemia vai exigir que aluna faça #Enem2016 no hospital https://t.co/aZV9zIvp1l #G1 https://t.co/WDwwzbk5h4'))
        print(self.classify(
            'Operação especial na rodoviária (TERGIP), de 5/2 a 13/2, para o feriado do Carnaval 2016. https://t.co/vVIl36tG6A'))
        print(self.classify(
            'RT @defesacivilbh: 15h46 - Risco de chuva (20 a 40 mm), raios e ventos (até 50 km/h). Até 23h59 de terça (16). @Bombeiros_MG #BH https://t.…'))
        print(self.classify(
            'RT @Bombeiros_MG: 21h - Árvore de grande porte caída na BR 262 (Anel Rod), px à PUC São Gabriel, pista obstruída. Risco de colisões. 1 vtr …'))
        print(self.classify(
            '@diih__campos Boa Tarde! O Quadro de Horários de segunda-feira corresponde ao de dia atípico e terça ao de feriado.'))
        print(self.classify(
            'RT @defesacivilbh: Acréscimo de 20 a 30mm do alerta emitido totalizando 70mm até 7h de terça (24) raios e rajadas vento de até 50 km/h. htt…'))
        print(self.classify(
            'Criação da Linha 825 (Estação São Gabriel / Vitória II via UPA Nordeste) a partir de domingo, dia 21/2. Confira: https://t.co/PV2OQkx10H'))
        print(self.classify(
            'RT @PRF191MG: 10h30 - RETIFICAÇÃO: BR040 negociação ficou decidido pistas principais ficarão liberadas por 30 min e depois serão fechadas …'))
        print(self.classify(
            'Participe da 5ª Reunião do Observatório da Mobilidade Urbana de BH! Inscrições pelo link https://t.co/bMsvjwaLZZ'))
        print(self.classify(
            '@dannymendes10 Boa Noite! Nossa equipe esteve no local e constatou a presença da CEMIG. Local sem energia elétrica.'))
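# A hypothetical wiring of the class above: the constructor arguments are
# placeholder file paths, not paths from the original project, and the sample
# tweet is taken from teste().
clf = Classification('data/gazetteer.txt', 'data/dataset.csv',
                     'data/annotated_entities.txt', 'data/vocabulary.txt')
labelIOB, labelIOBnew, coordinate, topic = clf.classify(
    'RT @Bombeiros_MG: 21h - Árvore de grande porte caída na BR 262 (Anel Rod), '
    'px à PUC São Gabriel, pista obstruída. Risco de colisões. 1 vtr …')
print(labelIOBnew, topic)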