def eventextraction_finance_v1():
    '''Fire event extraction service (v1.2).
    '''
    json_data = request.get_json()
    # print(json_data)
    result = {}

    # Validate the request parameters
    if 'app_key' in json_data:
        if json_data['app_key'] != 'masweb_demo':
            result['code'] = settings.CODE_ERROR
            result['msg'] = settings.MSG_ERROR_PARSE + \
                            ': app_key is {}.'.format(json_data['app_key'])
            result['time'] = str(int(time.time()))
            return jsonify(result)
    else:
        result['code'] = settings.CODE_ERROR
        result['msg'] = settings.MSG_NO_PARSE + ': app_key'
        result['time'] = str(int(time.time()))
        return jsonify(result)

    if 'func' in json_data:
        for func in json_data['func']:
            if func not in settings.FUNC_LIST:
                result['code'] = settings.CODE_ERROR
                result['msg'] = settings.MSG_ERROR_PARSE + \
                                ': {} in func'.format(func)
                result['time'] = str(int(time.time()))
                return jsonify(result)
    else:
        result['code'] = settings.CODE_ERROR
        result['msg'] = settings.MSG_NO_PARSE + ': func'
        result['time'] = str(int(time.time()))
        return jsonify(result)

    news = json_data['body']['text']
    # All parameters passed validation; report success
    result['code'] = settings.CODE_SUCCESS
    result['msg'] = settings.MSG_SUCCESS
    result['timestamp'] = str(int(time.time()))

    result['body'] = {}

    nlp = StanfordNER(news)
    # Build the response body according to the requested func values
    if 'ner' in json_data['func']:
        result['body']['ner'] = NER(nlp).ner

    if 'event' in json_data['func']:
        event = EventExtraction(news, nlp)
        result['body']['event_extraction'] = event.event
        if 'graph' in json_data['func']:
            result['body']['graph'] = DataToGraph(event).graph

    return jsonify(result)
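
For reference, a minimal client call for this handler might look like the sketch below; the URL and route are assumptions, and the payload fields simply mirror the checks above (assuming 'ner', 'event' and 'graph' all appear in settings.FUNC_LIST).

import requests

# Hypothetical endpoint URL; only the payload shape is taken from the handler above.
payload = {
    "app_key": "masweb_demo",
    "func": ["ner", "event", "graph"],
    "body": {"text": "A fire broke out at a chemical warehouse on Monday evening."},
}
response = requests.post("http://localhost:5000/eventextraction_finance_v1", json=payload)
print(response.json())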
Example #2
def main(argv):
    if len(argv) < 3:
        usage(argv)

    # Optional flags (after the two positional arguments):
    #   -d load entity models, -f frequency tagging, -t use own tags
    dic = False
    freq = False
    own_tag = False
    for arg in argv[3:6]:
        if arg == "-d":
            dic = True
        elif arg == "-f":
            freq = True
        elif arg == "-t":
            own_tag = True

    ex = Util.read_file(argv[1])
    ex = Util.transform_text(ex)
    models = ["data/location.txt", "data/person.txt", "data/organisation.txt"]

    # Lexical analysis
    lexer = Lexer(ex, own_tag)
    lexer.lex()

    # Syntactic analysis
    parser = Parser(lexer.get_tokenized_text(), own_tag)
    parser.parse()

    # Semantic analysis + named-entity recognition
    ner = NER(ex, parser.get_parsed_text())
    if dic: ner.gen_models(models)
    ner.apply()

    # Text tagging
    tagger = Tagger(ner.get_ner(), ex)
    if freq:
        tagger.freq_tag(argv[2])
    else:
        tagger.tag(argv[2])
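
A hedged sketch of how this entry point might be invoked; the script and file names below are placeholders, not taken from the original project.

import sys

# python main.py input.txt output.xml -d -f
#   argv[1]: input text file, argv[2]: output file
#   optional flags: -d load entity models, -f frequency tagging, -t use own tags
if __name__ == "__main__":
    main(sys.argv)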
Example #3
    def dispmsg(self):
        name_label2 = ttk.Label(self.window, 
                                text = "File with the queried intents is downloaded at " + str(self.name_var.get()), 
                                font=('Times New Roman', 10, 'normal'))
        name_label2.grid(row=10,column=1,padx = 5, pady = 10)

        if str(self.name_var1.get()) != '':
            learning = 'active'
            Data, UserFnames = Read_Files(str(self.name_var.get()), learning=learning,
                                          vertical=str(self.vertical.get()).lower())
            Data_Frame = pd.DataFrame(Data, columns=['FileName', 'FilePath', 'Text'])
            Data_Frame = NER(Data_Frame)
            kf = []
            for ind in Data_Frame.index:
                text = Data_Frame['Text'][ind]
                tr4w = TextRank4Keyword()
                tr4w.analyze(text, candidate_pos=['NOUN', 'PROPN'], window_size=4, lower=False)
                kf.append(tr4w.get_keywords(100))
            Data_Frame['KeyPhrases'] = kf
            name = str(self.vertical.get()).lower()
            endpoint = "https://<EndPoint>.search.windows.net"
            key = "<Cognitive search key>"
            if name == 'default':
                create_index(name, endpoint, key)
            upload_docs(Data_Frame=Data_Frame, index_name=name, endpoint=endpoint, key=key)
            result = search(rootdir=str(self.name_var.get()),
                            Query=str(self.name_var1.get()), index_name=name,
                            endpoint=endpoint, key=key, fnames=UserFnames,
                            vertical=str(self.vertical.get()).lower())
            if name == 'default':
                from azure.search.documents.indexes import SearchIndexClient
                from azure.core.credentials import AzureKeyCredential
                client = SearchIndexClient(endpoint, AzureKeyCredential(key))
                client.delete_index(name)
        elif str(self.name_var1.get()) == '' and str(self.classes.get()) != 'None':
            learning = 'passive'
            Data, UserFnames = Read_Files(str(self.name_var.get()), learning=learning, vertical=None)
            Data_Frame = pd.DataFrame(Data, columns=['FileName', 'FilePath', 'Text'])
            result = classifier(dataframe=Data_Frame, classs=str(self.classes.get()),
                                rootdir=str(self.name_var.get()))
        else:
            pass
Example #4

    def analyze(self):
        logging.info('*******************************************************')
        result_dict = {}
        result_dict['source'] = self.source.strip().lower()
        result_dict['q_type'] = self.s_type.strip().lower()
        res = model.predict(sentence=self.sentence)
        root_dict = res['hierplane_tree']['root']
        logging.info('sentence {} parsed as {}'.format(self.sentence,
                                                       root_dict))

        emb = elmo(batch_to_ids([
            self.sentence.split()
        ]))['elmo_representations'][0].detach().numpy()

        parse_tree = ParseTree(root_dict, self.sentence)
        # logging.info('ParseTree type is: {}'.format(parse_tree.get_type()))
        # parse_tree.iterate()
        logging.info(
            'Now it\'s time to check the string representation \n{}'.format(
                str(parse_tree.root)))
        # parse_tree.analyze()
        logging.info('extracting information')
        all_nodes = set()
        all_intent_nodes = set()
        all_desc_nodes = set()
        toponyms = NER.extract_place_names(self.sentence)
        result_dict['pnames'] = toponyms
        topo_nodes = set()
        for t in toponyms:
            logging.info('\ttoponym:\t{}'.format(t))
            nodes = parse_tree.find(t)
            if nodes is None:
                logging.info('An error in finding nodes')
            else:
                for n in nodes:
                    n.role = 'n'
                    topo_nodes.add(n)
        for t_node in topo_nodes:
            logging.info('\t**Found Node: {} and index {}'.format(
                t_node.word, t_node.index))
        all_nodes = all_nodes.union(topo_nodes)
        all_desc_nodes = all_desc_nodes.union(topo_nodes)

        dates = NER.extract_dates(self.sentence)
        result_dict['dates'] = dates
        dates_nodes = set()
        for d in dates:
            logging.info('\tdate:\t{}'.format(d))
            nodes = parse_tree.find(d)
            if nodes is None:
                logging.info('An error in finding nodes')
            else:
                for n in nodes:
                    n.role = 'd'
                    dates_nodes.add(n)

        for d_node in dates_nodes:
            logging.info('\t**Found Node: {} and index {}'.format(
                d_node.word, d_node.index))
        all_nodes = all_nodes.union(dates_nodes)
        all_desc_nodes = all_desc_nodes.union(dates_nodes)

        whs_nodes = parse_tree.get_intent()
        whs = []
        for wh_node in whs_nodes:
            wh_node.role = intent_encoding(wh_node, PRONOUN)
            whs.append(wh_node.word)
        for w in whs:
            logging.info('intent is: {}'.format(w))
        all_nodes = all_nodes.union(whs_nodes)
        all_intent_nodes = all_intent_nodes.union(whs_nodes)
        result_dict['intents'] = whs
        a_entities_set = set()
        a_entities_nodes = set()
        a_types = []
        a_types_nodes = set()
        for whs_node in whs_nodes:
            wh_nouns = whs_node.iterate_nouns()
            wh_nouns.sort(key=sort_function, reverse=True)
            for n in wh_nouns:
                if not is_inside(n.word, toponyms) and not is_inside(
                        n.word, dates) and not is_left_inside(
                            n.word, a_types) and is_a_new_one(
                                a_types_nodes, n):
                    if is_left_inside(
                            n.word.lower().strip(), pt_set) or is_left_inside(
                                n.word.lower().strip(), pt_dict.keys()):
                        a_types.append(n.word)
                        n.role = 't'
                        a_types_nodes.add(n)
                    elif ' ' not in n.word.strip() and len(n.word) > 2:
                        a_entities_set.add(n.word)
                        n.role = 'o'
                        a_entities_nodes.add(n)
        for t in a_types:
            logging.info('\ttype in intent:\t{}'.format(t))
        a_entities = list(a_entities_set)
        for e in a_entities:
            logging.info('\tentity in intent:\t{}'.format(e))
        all_nodes = all_nodes.union(a_types_nodes)
        all_intent_nodes = all_intent_nodes.union(a_types_nodes)
        all_nodes = all_nodes.union(a_entities_nodes)
        all_intent_nodes = all_intent_nodes.union(a_entities_nodes)
        result_dict['i_objects'] = a_entities
        result_dict['i_ptypes'] = a_types
        nouns = parse_tree.get_nouns()
        nouns.sort(key=sort_function, reverse=True)
        types = []
        types_nodes = set()
        entities_set = set()
        entities_nodes = set()
        for n in nouns:
            if not is_inside(n.word, toponyms) and not is_inside(
                    n.word, dates) and not is_inside(
                        n.word, whs) and not is_left_inside(
                            n.word, types) and is_a_new_one(types_nodes, n):
                if is_left_inside(n.word.lower().strip(),
                                  pt_set) or is_left_inside(
                                      n.word.lower().strip(), pt_dict.keys()):
                    types.append(n.word)
                    n.role = 't'
                    types_nodes.add(n)
                elif ' ' not in n.word.strip() and len(n.word) > 2:
                    entities_set.add(n.word)
                    n.role = 'o'
                    entities_nodes.add(n)
        for t in types:
            logging.info('\ttype:\t{}'.format(t))
        entities = list(entities_set)
        for e in entities:
            logging.info('\tentity:\t{}'.format(e))
        all_nodes = all_nodes.union(types_nodes)
        all_desc_nodes = all_desc_nodes.union(types_nodes)
        all_nodes = all_nodes.union(entities_nodes)
        all_desc_nodes = all_desc_nodes.union(entities_nodes)
        result_dict['objects'] = entities
        result_dict['ptypes'] = types
        verbs = parse_tree.get_verbs()
        situations = []
        situations_nodes = set()
        activities = []
        activities_nodes = set()
        unknowns = []
        unknowns_nodes = set()
        for v in verbs:
            v_index = self.sentence.split().index(v.word)
            v_emb = [emb[0][v_index]]
            logging.debug('verb is {} and len of emb is {}'.format(
                v.word, len(v_emb)))
            decision = verb_encoding(v_emb, actv_emb, stav_emb)
            if decision == "a":
                activities.append(v.word)
                v.role = 'a'
                activities_nodes.add(v)
            elif decision == "s":
                situations.append(v.word)
                v.role = 's'
                situations_nodes.add(v)
            else:
                unknowns.append(v.word)
                unknowns_nodes.add(v)
        for s in situations:
            logging.info('\tsituation: {}'.format(s))
        for a in activities:
            logging.info('\tactivities: {}'.format(a))
        for u in unknowns:
            logging.info('\tunknown: {}'.format(u))
        all_nodes = all_nodes.union(activities_nodes)
        all_desc_nodes = all_desc_nodes.union(activities_nodes)
        all_nodes = all_nodes.union(situations_nodes)
        all_desc_nodes = all_desc_nodes.union(situations_nodes)
        result_dict['situations'] = situations
        result_dict['activities'] = activities
        pps = parse_tree.get_pps()
        relations = []
        relation_nodes = set()
        for pp in pps:
            for n in toponyms:
                if 'with' in pp.word.lower():
                    is_within = is_within_phrase(pp.word)
                    if is_within is not None:
                        in_pp = pp.get_in_in_pp()
                        if in_pp is not None:
                            relations.append(in_pp.word)
                            in_pp.role = 'r'
                            relation_nodes.add(in_pp)
                if n in pp.word and not is_inside_right(
                        pp.word, entities) and not is_inside_right(
                            pp.word, a_entities):
                    in_pp = pp.get_in_in_pp()
                    if in_pp is not None:
                        relations.append(in_pp.word)
                        in_pp.role = 'r'
                        relation_nodes.add(in_pp)
                        break
            for t in types:
                if t in pp.word:
                    in_pp = pp.get_in_in_pp()
                    if in_pp is not None:
                        relations.append(in_pp.word)
                        in_pp.role = 'r'
                        relation_nodes.add(in_pp)
                        break
        all_nodes = all_nodes.union(relation_nodes)
        all_desc_nodes = all_desc_nodes.union(relation_nodes)
        for relation in relations:
            logging.info('\trelation: {}'.format(relation))
        result_dict['relations'] = relations

        adjs = parse_tree.get_adjectives()
        qualities = []
        qualities_nodes = set()
        object_qualities = []
        object_qualities_nodes = set()
        for adj in adjs:
            siblings = adj.get_siblings()
            for sibling in siblings:
                if is_inside(sibling.word, toponyms) or is_inside(
                        sibling.word, types) or is_inside(
                            sibling.word, a_types):
                    if not is_inside(adj.word, types) and not is_inside(
                            adj.word, a_types):
                        qualities.append(adj.word)
                        adj.role = 'q'
                        qualities_nodes.add(adj)
                        break
                elif is_inside(sibling.word, entities) or is_inside(
                        sibling.word, a_entities):
                    object_qualities.append(adj.word)
                    adj.role = 'p'
                    object_qualities_nodes.add(adj)
                    break
        all_nodes = all_nodes.union(qualities_nodes)
        all_desc_nodes = all_desc_nodes.union(qualities_nodes)
        all_nodes = all_nodes.union(object_qualities_nodes)
        all_desc_nodes = all_desc_nodes.union(object_qualities_nodes)
        for q in qualities:
            logging.info('\tquality: {}'.format(q))
        for oq in object_qualities:
            logging.info('\tobject quality: {}'.format(oq))
        result_dict['pqualities'] = qualities
        result_dict['oqualities'] = object_qualities
        # coding schema: where: 1, what: 2, which: 3, why: 4, how: 5, how+adj: 6 etc. make it complete... other:0...
        # ...activity: a, situation: s, quality: q, object_quality: p, relation: r, toponym: n, type: t, date: d
        ignored_nodes = []
        leaves = parse_tree.get_leaves()
        for leaf in leaves:
            if leaf.is_unknown():
                ignored_nodes.append(leaf)

        temp = []

        for leaf in ignored_nodes:
            flag = True
            for n in all_nodes:
                if n.is_fuzzy_matched:
                    if leaf.word in n.word:
                        flag = False
                        break
                else:
                    if n.is_your_child(leaf):
                        flag = False
                        break
            if flag:
                temp.append(leaf)
                all_nodes.add(leaf)
        # ignored_nodes = temp

        all_list = list(all_nodes)
        intent_list = list(all_intent_nodes)
        description_list = list(all_desc_nodes)
        all_list.sort(key=lambda x: x.index, reverse=False)
        intent_list.sort(key=lambda x: x.index, reverse=False)
        description_list.sort(key=lambda x: x.index, reverse=False)
        intent_code = ''
        intent_info = []
        for node in intent_list:
            intent_code += node.role
            if node.is_fuzzy_matched:
                intent_info.append({
                    'tag': node.role,
                    'value': node.fuzzy_word
                })
            else:
                intent_info.append({'tag': node.role, 'value': node.word})

        desc_code = ''
        desc_info = []
        for node in description_list:
            desc_code += node.role
            if node.is_fuzzy_matched:
                desc_info.append({'tag': node.role, 'value': node.fuzzy_word})
            else:
                desc_info.append({'tag': node.role, 'value': node.word})

        if Sentence.is_ambiguous(intent_list, intent_code):
            logging.info(
                'the intention is ambiguous, code: {}'.format(intent_code))
            resolved = Sentence.resolving_intent(desc_info)
            result_dict['resolved_intent'] = resolved
            if resolved['code'] != '':
                intent_code += resolved['code']
                intent_info.extend(resolved['list'])
                desc_temp_list = []
                for d in desc_info:
                    if d not in resolved['list']:
                        desc_temp_list.append(d)
                    else:
                        logging.debug('found!')
                desc_code = desc_code.replace(resolved['code'], '', 1)
                desc_info = desc_temp_list
                logging.debug('updated...')

        result_dict['intent_code'] = intent_code
        result_dict['intent_info'] = intent_info
        result_dict['desc_code'] = desc_code
        result_dict['desc_info'] = desc_info
        all_code = ''
        all_info = []
        for node in all_list:
            all_code += node.role
            if node.is_fuzzy_matched:
                all_info.append({'tag': node.role, 'value': node.fuzzy_word})
            else:
                all_info.append({'tag': node.role, 'value': node.word})
        result_dict['all_code'] = all_code
        result_dict['all_info'] = all_info
        logging.info('\tintent code is: {}'.format(intent_code))
        logging.info('\tdesc code is: {}'.format(desc_code))
        logging.info('\tall code is: {}'.format(all_code))
        logging.info('*******************************************************')
        return result_dict
Example #5
from ner import NER
from flask import Flask
from flask import request
from flask import jsonify
import os
import json

app = Flask(__name__)
ner_model = NER(os.environ['SPACY_MODEL'])

@app.route('/')
def hello():
    return "Hello World!"

@app.route("/ner", methods=["GET","POST"])
def ner_request():
    if request.method == "POST":
        req = request.get_json()
        if(req["version"]==1):
            entity_mentions=ner_model.spacy_ner(req["content"])
            return jsonify({"entities": entity_mentions})
        else:
            return "The current version is not supported."
    elif request.method == "GET":
        return "Hello World from NER!"

if __name__ == '__main__':
    app.run(host="0.0.0.0")
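
A quick way to exercise the /ner route above, assuming the service is running on the default Flask port; the sample sentence is arbitrary.

import requests

# POST a version-1 request to the /ner endpoint defined above.
resp = requests.post("http://localhost:5000/ner",
                     json={"version": 1, "content": "Barack Obama visited Paris in 2016."})
print(resp.json())  # expected shape: {"entities": [...]}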


Example #6
def lambda_handler(event, context):
  recognized_reviews = NER(event, False).recognize_words() # recognize the words
  final_json = Sentiment(recognized_reviews, 0.5,0.5, 0.2).calculate_final_json() # return the top words
  return final_json
Example #7
import utils
import time
import re
import os
from ner import NER
from email_sender import send_mail

extractor = NER()
pd = utils.pd

import run_api
xls = pd.ExcelFile('strings stems.xlsx')
first = pd.read_excel(xls, 'first').dropna(axis=1, how='all').dropna(axis=0, how='all')
parent_second = pd.read_excel(xls, 'parent second').dropna(axis=1, how='all').dropna(axis=0, how='all')
tutoring = pd.read_excel(xls, 'Tutoring').dropna(axis=1, how='all').dropna(axis=0, how='all')
bad_keywords = pd.read_excel(xls, 'bad keywords').dropna(axis=1, how='all').dropna(axis=0, how='all')
at_least_another = pd.read_excel(xls, 'at least another').dropna(axis=1, how='all').dropna(axis=0, how='all')

payments = pd.read_excel(xls, 'payments').dropna(axis=1, how='all').dropna(axis=0, how='all')
contract = pd.read_excel(xls, 'contract').dropna(axis=1, how='all').dropna(axis=0, how='all')
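
The same read-and-clean chain repeats for every sheet; it could be factored into a small helper along these lines (the helper name is mine, not from the original module).

def load_sheet(xls_file, sheet_name):
    """Read one worksheet and drop columns and rows that are entirely empty."""
    sheet = pd.read_excel(xls_file, sheet_name)
    return sheet.dropna(axis=1, how='all').dropna(axis=0, how='all')

# e.g. first = load_sheet(xls, 'first')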
Example #8
    def namedEntityRecognition(self):
        ne = NER(self.original_query)
        self.entities = ne.performNER()
        self.named_entities = [entity.lower() for entity in self.entities]
Example #10
def find_events(question):
    return NER.extract_events(question)
Example #11
def find_toponyms(question):
    return NER.extract_place_names(question)
Example #12
def find_dates(question):
    return NER.extract_dates(question)
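
A hedged illustration of how these three helpers could be combined; the question string is arbitrary and the values in the comments only show the kind of output one would expect.

question = "Which museums in Amsterdam were open last Sunday?"
print(find_toponyms(question))  # e.g. ['Amsterdam']
print(find_dates(question))     # e.g. ['last Sunday']
print(find_events(question))    # whatever events NER.extract_events detects, if any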
Example #13

    def read(self, textFile, mentionsFile):
        """Read files containing the text and mentions, returning an object of them having 'text' and 'cluster list' as attributes"""
 
        ls = []
        text = ""
        mentions = ""
        tuples = []
        mentionClustersList = []
        clusterCount = 0
        mentionToClusterMap = {}
        with open(self.__path + textFile) as f_text:
            text = f_text.read()
        temp = text.splitlines()

        if len(temp[-1]) == 0:
            temp.pop()

        text = " ".join(temp)
        self.__parser.process(text)
        dependenciesList = self.__parser.getDependencies()
        
        print "Index Map to be used when creating mentions file:"
        for i, j in enumerate(text):
            print i, j
        raw_input("\nPlease enter the indices of the mentions in the mentions file: <Press enter to continue process>")

        with open(self.__path + mentionsFile) as f_mention:
            mentions = f_mention.read()
        ls = mentions.splitlines()

        if len(ls[-1]) == 0:
            ls.pop()
        
        for line in ls:
            line = line.split()
            new_tuple = int(line[0]), int(line[1])
            tuples.append(new_tuple)

        for element in tuples:
            left = element[0]
            right = element[1]
            mentions = text[left:right]
            mentionClustersList.append(Cluster(mentions, element))
            mentionToClusterMap[element] = mentionClustersList[-1]
            clusterCount = clusterCount + 1
        
        n = NER()
        print"\nThis may take some time. Please wait...\n"
        n.process(self.__path + textFile)
        NERMap = n.getNERRelations()
        NESet = set(NERMap.keys())
        feature = Features(dependenciesList)
        self.__document.setText(text)
        self.__document.setTextPath(self.__path + textFile)
        self.__document.setMentionClustersList(mentionClustersList)
        self.__document.setFeatures(feature.getAppositiveRelations(), feature.getCopulativeRelations())
        self.__document.setMentionToClusterMap(mentionToClusterMap)
        self.__document.setParserObject(self.__parser)
        self.__document.setNERMap(NERMap)
        self.__document.setNESet(NESet)
        return self.__document
Example #14

from config import FoundationTrilogy
from preprocess import preprocess
from ner import NER
from entity_connections import LINK_ENTITIES
from normalization import normalize_list

if __name__ == '__main__':

    text = FoundationTrilogy
    parsed_list = preprocess(text)  # vector of preprocessed sentences
    predicted = NER(parsed_list)

    people_links, location_links, events = LINK_ENTITIES(parsed_list, predicted)
    people_links = normalize_list(people_links)
Example #15
class Classification:
    nlp = None
    ner = None
    topicModel = None
    abreviacoesList = [
        ['próx', 'próximo'], ['próx.', 'próximo'], ['prox', 'próximo'],
        ['prox.', 'próximo'], ['px', 'próximo'], ['px.', 'próximo'],
        ['av', 'Avenida'], ['av.', 'Avenida'], ['pça', 'Praça'],
        ['sent', 'sentido'], ['sent.', 'sentido'], ['dª', 'Dona'],
        ['dª.', 'Dona'], ['d.ª', 'Dona'], ['sta', 'Santa'],
        ['vdt', 'Viaduto'], ['vdt.', 'Viaduto'], ['c\\', 'com'],
        ['p\\', 'para'], ['nº', 'número'], ['ref', 'referência'],
        ['ref.', 'referência'], ['elv', 'Elevado'], ['sra', 'Senhora'],
        ['gde', 'Grande'], ['prof', 'Professor'], ['prof.', 'Professor'],
        ['vtr', 'viatura'], ['vtr.', 'viatura'], ['r.', 'Rua']
    ]

    def __init__(self, gazetteer, datasetFile, annotatedEntities,
                 vocabularyFile):
        self.nlp = NLP()
        self.ner = NER(gazetteer, annotatedEntities)
        self.topicModel = TopicClassification(datasetFile, vocabularyFile)

    def preprocessing(self, sentence):

        newSentence = sentence
        newSentence = re.sub(r'R\.', 'Rua', newSentence)

        # Remove terms such as 'RT', 'km/h', 'km', 'mm', '13h40', '30 min'
        newSentence = re.sub(r'\d+\s*km/h', ' ', newSentence, flags=re.I)
        newSentence = re.sub(r'\d+\s*km', ' ', newSentence, flags=re.I)
        newSentence = re.sub(r'\d+h\d+', ' ', newSentence, flags=re.I)
        newSentence = re.sub(r'\d+h ', ' ', newSentence, flags=re.I)
        newSentence = re.sub(r'\d+hrs ', ' ', newSentence, flags=re.I)
        newSentence = re.sub(r'\d+\s*mm', ' ', newSentence)
        newSentence = re.sub(r'\s*RT ', ' ', newSentence)
        newSentence = re.sub(r'\d+\s*min\s', ' ', newSentence)
        newSentence = re.sub(r'\s(\w+)…', ' ', newSentence)

        # Normalize road names: 'BR 040' -> 'BR040'
        p = re.compile(r'BR\s*\d+')
        lista = p.findall(newSentence)
        for item in lista:
            newSentence = newSentence.replace(item, item.replace(' ', ''))

        # Remove URLs
        newSentence = re.sub(
            r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ',
            newSentence)

        # Remove hashtags and @user mentions
        newSentence = re.sub(r"(?:\@|https?\://)\S+", " ", newSentence)
        newSentence = re.sub(r"(#.*?)\s", " ", newSentence)

        # Remove special characters
        p = re.compile('(ª|º)')
        newSentence = p.sub(' ', newSentence)
        newSentence = re.sub(r'\W', ' ', newSentence)

        # Remove numbers
        newSentence = re.sub(r' \d+', ' ', newSentence).lstrip()

        # Remove punctuation
        for pontuacao in string.punctuation:
            newSentence = newSentence.replace(pontuacao, ' ')

        # Expand abbreviations (patterns are escaped, since entries contain '.' and '\')
        wordsList = newSentence.lower().split(" ")
        for word in self.abreviacoesList:
            if word[0] in wordsList:
                newSentence = re.sub(re.escape(word[0]), word[1], newSentence, flags=re.I)

        # Collapse extra whitespace
        newSentence = re.sub(' +', ' ', newSentence)
        return newSentence

    def classify(self, sentence):
        newSentence = self.preprocessing(sentence)
        sentenceTokens = self.nlp.tokenization(newSentence)
        results = self.ner.dictionaryNER(sentenceTokens)
        labelIOB = results[0]
        coordinate = results[1]
        topic = self.topicModel.predictTopic(newSentence)
        labelIOBnew = self.mergeResults(sentence, newSentence, labelIOB)
        return ([labelIOB, labelIOBnew, coordinate, topic])

    def mergeResults(self, sentence, preprocessedSentence, labelIOB):
        #print("\n" + sentence)
        #print(preprocessedSentence)

        #sentenceTokens = self.nlp.tokenization(sentence)
        newSentenceTokens = sentence.split(" ")
        preprocessedSentenceTokens = self.nlp.tokenization(
            preprocessedSentence)
        abreviacoes = list(map(operator.itemgetter(0), self.abreviacoesList))

        #newSentenceTokens = sentenceTokens
        #~ for idx, token in enumerate(sentenceTokens):
        #~ if (token == '.' and idx-1 > 0 and sentenceTokens[idx-1].lower() == 'av'):
        #~ newSentenceTokens.pop(idx)
        #~ newSentenceTokens[idx-1] = newSentenceTokens[idx-1]+'.'

        index = 0
        newLabelIOB = ""
        skipLoop = False
        for idx, item in enumerate(newSentenceTokens):
            if (skipLoop):
                skipLoop = False
                continue

            newItem = item
            for pontuacao in string.punctuation:
                newItem = newItem.replace(pontuacao, '')
            #print (newItem)

            if (index < len(preprocessedSentenceTokens)
                    and item not in string.punctuation):
                #print (newItem + " vs " + preprocessedSentenceTokens[index])
                if (newItem == preprocessedSentenceTokens[index]
                        or item.lower() in abreviacoes):
                    #print("<>> " + labelIOB[index])
                    newLabelIOB = newLabelIOB + labelIOB[index]
                    index += 1

                # compound words: one original token matched two preprocessed tokens
                elif (index + 1 < len(preprocessedSentenceTokens)
                      and item.find(preprocessedSentenceTokens[index]) != -1
                      and
                      item.find(preprocessedSentenceTokens[index + 1]) != -1):
                    #print (">>>>>" + labelIOB[index] + " vs "+ labelIOB[index+1] + " = " + self.ner.andOperationIOB(labelIOB[index], labelIOB[index+1]))
                    newLabelIOB = newLabelIOB + self.ner.andOperationIOB(
                        labelIOB[index], labelIOB[index + 1])
                    index += 2

                # handle 'BR 262' vs 'BR262': one preprocessed token spans two original tokens
                elif (idx + 1 < len(newSentenceTokens)
                      and preprocessedSentenceTokens[index].find(item) != -1
                      and preprocessedSentenceTokens[index].find(
                          newSentenceTokens[idx + 1]) != -1):
                    #print (">>>>>" + labelIOB[index] + " = " + labelIOB[index])
                    if (labelIOB[index] == 'B'):
                        newLabelIOB = newLabelIOB + labelIOB[index] + 'I'
                    else:
                        newLabelIOB = newLabelIOB + labelIOB[index] + labelIOB[
                            index]
                    index += 1
                    skipLoop = True

                else:
                    #print('O')
                    newLabelIOB = newLabelIOB + 'O'
            else:
                #print('O')
                newLabelIOB = newLabelIOB + 'O'

        if len(newSentenceTokens) != len(newLabelIOB):
            print("::::ERROR::: sentence size: " + str(len(newSentenceTokens)) +
                  " / label size: " + str(len(newLabelIOB)))

        newLabelIOB = self.excessoes(newLabelIOB, newSentenceTokens)
        #print (self.extractAnnotatedEntities(labelIOB, preprocessedSentenceTokens))
        #print (self.extractAnnotatedEntities(newLabelIOB, newSentenceTokens))
        #print (preprocessedSentenceTokens)
        #print (newSentenceTokens)
        return newLabelIOB

    def excessoes(self, newLabelIOB, sentenceTokens):
        lastB = 0
        for idx, token in enumerate(sentenceTokens):

            if (newLabelIOB[idx] == 'B'): lastB = idx

            # 'BOI' sequence caused by punctuation
            if (idx + 1 < len(newLabelIOB) and token in string.punctuation
                    and (idx - lastB) <= 2 and newLabelIOB[idx + 1] == "I"):
                newLabelIOB = newLabelIOB[0:idx] + "I" + newLabelIOB[
                    idx + 1:len(newLabelIOB)]

            # 'BOOI' sequence caused by terms removed during preprocessing
            elif (newLabelIOB[idx] == "I" and (idx - lastB) > 2
                  and newLabelIOB[idx - 1] == 'O'):
                newLabelIOB = newLabelIOB[0:lastB] + "O" + newLabelIOB[
                    lastB + 1:len(newLabelIOB)]
                newLabelIOB = newLabelIOB[0:idx] + "O" + newLabelIOB[
                    idx + 1:len(newLabelIOB)]
        return newLabelIOB

    def extractAnnotatedEntities(self, patternLabel, sentenceTokens):
        occurrences = re.findall('(BI*)', patternLabel)
        entities = []
        newPattern = patternLabel
        indices = []
        for indx, occurrence in enumerate(occurrences):
            indexStart = newPattern.find(occurrences[indx])
            indices.append([indexStart, indexStart + len(occurrences[indx])])
            subs = ""
            for i in range(len(occurrences[indx])):
                subs = subs + 'X'
            newPattern = newPattern.replace(occurrences[indx], subs, 1)

        termo = []
        for i, idx in enumerate(indices):
            for position in range(idx[0], idx[1]):
                termo.append(sentenceTokens[position])
            entities.append(" ".join(termo).upper())
            termo = []
        return entities

    def teste(self):

        print(
            self.classify(
                '20h37 (+) / R. Pitangui / R. Alabastro / Av. Silviano Brandão / Av. Flávio dos Santos.'
            ))
        #return
        print(self.classify('@g1 era porcelanato pelo menos?'))

        print(
            self.classify(
                'RT @g1: Luta contra leucemia vai exigir que aluna faça #Enem2016 no hospital https://t.co/aZV9zIvp1l #G1 https://t.co/WDwwzbk5h4'
            ))

        print(
            self.classify(
                'Operação especial na rodoviária (TERGIP), de 5/2 a 13/2, para o feriado do Carnaval 2016. https://t.co/vVIl36tG6A'
            ))

        print(
            self.classify(
                'RT @defesacivilbh: 15h46 - Risco de chuva (20 a 40 mm), raios e ventos (até 50 km/h). Até 23h59 de terça (16). @Bombeiros_MG #BH https://t.…'
            ))

        print(
            self.classify(
                'RT @Bombeiros_MG: 21h - Árvore de grande porte caída na BR 262 (Anel Rod), px à PUC São Gabriel, pista obstruída. Risco de colisões. 1 vtr …'
            ))

        print(
            self.classify(
                '@diih__campos Boa Tarde! O Quadro de Horários de segunda-feira corresponde ao de dia atípico e terça ao de feriado.'
            ))

        print(
            self.classify(
                'RT @defesacivilbh: Acréscimo de 20 a 30mm do alerta emitido totalizando 70mm até 7h de terça (24) raios e rajadas vento de até 50 km/h. htt…'
            ))

        print(
            self.classify(
                'Criação da Linha 825 (Estação São Gabriel / Vitória II via UPA Nordeste) a partir de domingo, dia 21/2. Confira: https://t.co/PV2OQkx10H'
            ))

        print(
            self.classify(
                'RT @PRF191MG: 10h30 - RETIFICAÇÃO: BR040 negociação ficou decidido pistas principais  ficarão liberadas por 30 min e depois serão fechadas …'
            ))

        print(
            self.classify(
                'Participe da 5ª Reunião do Observatório da Mobilidade Urbana de BH! Inscrições pelo link https://t.co/bMsvjwaLZZ'
            ))

        print(
            self.classify(
                '@dannymendes10 Boa Noite! Nossa equipe esteve no local e constatou a presença da CEMIG. Local sem energia elétrica.'
            ))