Example #1
0
def segment(txt):  # word segmentation combined with NER
    """Segment *txt* while keeping recognized named entities as single tokens.

    Entities from fool.analysis are wrapped with a '|||' delimiter, the text
    is split on that delimiter, and each piece is either kept whole (if it
    is exactly an entity string) or re-segmented with fool.

    NOTE(review): the replace() calls below substitute the entity's first
    and last *characters* globally in the text, so '|||' can be inserted at
    unrelated positions — verify this is acceptable for the target corpus.
    """
    ner = [i[3] for i in fool.analysis(txt)[1][0]]
    words = []
    for i in ner:
        # i[0] / i[-1] are single characters of the entity string
        txt = txt.replace(i[0], '|||' + i[0]).replace(i[-1], i[-1] + '|||')
    txt = txt.split('|||')
    for i in txt:
        if i in ner:
            # exact entity match: keep as one token
            words.append(i)
        else:
            # otherwise segment the fragment normally
            for j in fool.analysis(i)[0][0]:
                words.append(j[0])
    return words
Example #2
0
def pos_tag():
    """HTTP handler: POS-tag the 'query' field of the posted JSON body.

    Returns a response dict with a status code, message, and — on success —
    the token list ('answer') and their POS tags ('tag').
    """
    try:
        payload = json.loads(request.get_data(as_text=True))
        question = payload['query']
    except TypeError as te:
        log.error(te)
        return get_result_response(EasyDict({
            'code': ResponseStatus.TYPE_ERROR,
            'msg': 'The server received a parameter error'
        }))
    except Exception as e:
        log.error(e)
        return get_result_response(EasyDict({
            'code': ResponseStatus.OTHER,
            'msg': 'The server receives the parameter and sends an unknown error'
        }))
    log.info("ip:{}".format(request.remote_addr))
    out = fool.analysis(question)
    # out[0][0] is the (token, pos) pair list for the first sentence
    tokens = out[0][0]
    answer = [pair[0] for pair in tokens]
    tag = [pair[1] for pair in tokens]
    log.info("question:{}".format(question))
    log.info("answer:{}".format(out))
    return get_result_response(EasyDict({
        'code': ResponseStatus.SUCCESS,
        'msg': 'Success',
        'answer': answer,
        'tag': tag
    }))
Example #3
0
 def analysis(self, article, create_time, crawling_time, mongo_id):
     '''
     Run NER over each paragraph of an article and print normalized entities.

     :param article: list of paragraph strings, e.g. ["本报讯.. ', ""]
     :param create_time: article creation timestamp (printed as-is)
     :param crawling_time: crawl timestamp (printed as-is)
     :param mongo_id: source document id (printed as-is)
     :return: None (results are printed)
     '''
     # File modes reminder (from original comment): a=append, w=write, r=read
     # fw = open("./data/ner_out.txt", "a", encoding="utf-8")
     for a in article:
         each_article = a.replace("\n", "")
         # Fix: the original tested len(article) (the whole list), which is
         # always non-zero inside this loop; the intent was to skip blank
         # paragraphs.
         if len(each_article) != 0:
             words, ner = fool.analysis(each_article)
             for i in ner:
                 if i[2] in self.permit_name:
                     i = list(i)
                     # Normalize labels: company -> org, location -> place
                     if i[2] == 'company':
                         i[2] = 'org'
                     elif i[2] == 'location':
                         i[2] = 'place'
                     ner_text = str(i[3])
                     # Strip ASCII punctuation from the entity text
                     r = '[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+'
                     ner_text = re.sub(r, '', ner_text)
                     print(
                         str(i[2]) + 'NER' + ner_text + '\t' +
                         'create_time' + create_time + '&' + crawling_time +
                         'ID:' + mongo_id + '\n')
Example #4
0
def word2vec(line):
    """Encode *line* as a bag-of-words 0/1 vector over vocab_dict.

    Known person/movie/genre names are replaced with the placeholder tokens
    nnt / nm / ng before vectorization, and the matched surface strings are
    returned alongside the vector.

    :param line: input sentence (str)
    :return: (word2id_list, entities) — the 0/1 vector and a dict mapping
        0 -> person name, 1 -> movie name, 2 -> genre name (when found)
    """
    word2id_list = [0] * len(vocab_dict)
    entities = {}
    for name in person_names:
        if name in line:
            line = line.replace(name, " nnt ")
            entities[0] = name
    for name in movie_names:
        if name in line:
            line = line.replace(name, " nm ")
            entities[1] = name
    for name in genre_names:
        if name in line:
            line = line.replace(name, " ng ")
            entities[2] = name
    # NER pass: person/company entities also become the nnt placeholder
    words, ner = fool.analysis(line)
    for entity in ner[0]:
        if (entity[2] == "person" or entity[2] == "company"):
            line = line.replace(entity[3], " nnt ")

    for word in fool.cut(line)[0]:
        # Fix: the original bare `except: pass` hid every error (including
        # KeyboardInterrupt); only the expected lookup failures are ignored.
        try:
            word2id_list[int(vocab_dict[word])] = 1
        except (KeyError, ValueError, IndexError):
            pass  # OOV word or malformed vocab entry
    return word2id_list, entities
Example #5
0
    def analysis(article):
        '''
        Run NER over each paragraph and append normalized (label, text)
        pairs to ./data/ner_out.txt.

        :param article: list of paragraph strings, e.g. ["本报讯.. ', ""]
        :return: None (results are written to the output file)
        '''
        # File modes reminder (from original comment): a=append, w=write, r=read
        # Fix: use a context manager so the handle is closed even if
        # fool.analysis raises (the original only closed on success).
        with open("./data/ner_out.txt", "a", encoding="utf-8") as fw:
            for a in article:
                each_article = a.replace("\n", "")
                # Fix: the original tested len(article) (the whole list),
                # which is always non-zero inside this loop; the intent was
                # to skip blank paragraphs.
                if len(each_article) != 0:
                    words, ner = fool.analysis(each_article)
                    for i in ner:
                        if i[2] == 'org' or i[2] == 'company' or i[2] == 'person' or i[2] == 'location':
                            i = list(i)
                            # Normalize labels: company -> org, location -> place
                            if i[2] == 'company':
                                i[2] = 'org'
                            elif i[2] == 'location':
                                i[2] = 'place'

                            ner_end = str(i[2:4])
                            fw.write(ner_end + '\n')
Example #6
0
def predict(date, content): 
    """Classify *content* with the loaded rumor model and extract entities.

    :param date: request date (only echoed to stdout here)
    :param content: raw text to classify
    :return: dict with the predicted 'rumor' label and the last recognized
        location/org/company/person/job entity texts (empty string if none)
    """
    print("GET data:-----------")
    print(date, content)
    data = [get_data(content)]
    base_shape = [[len(c) for c in data]] # number of tokens per sentence (LoD shape)

    # Build the LoD tensor used as prediction input
    tensor_words = fluid.create_lod_tensor(data, base_shape, place)

    # Run inference
    result = exe.run(program=infer_program,
                     feed={feeded_var_names[0]: tensor_words},
                     fetch_list=target_var)

    # Assemble the output record
    Dict = {'content':content, "rumor":1, "location":"", "org":"", "company":"", "person":"", "job":""}
    lab = np.argsort(result)[0][0][-1] # label with the highest probability
    Dict['rumor'] = lab
    words, ners = fool.analysis(content)
    # NOTE(review): `'location' in entity` tests membership in the entity
    # tuple (any element equal to the label string) — later entities of the
    # same type overwrite earlier ones.
    for entity in ners[0]:
        if 'location' in entity:
            Dict['location'] = entity[3]
        # elif 'time' in entity:
        #     Dict['time'] = entity[3]
        elif 'org' in entity:
            Dict['org'] = entity[3]
        elif 'company' in entity:
            Dict['company'] = entity[3]
        elif 'person' in entity:
            Dict['person'] = entity[3]
        elif 'job' in entity:
            Dict['job'] = entity[3]
    return Dict
Example #7
0
def draw_wordcloud(file):
    """Read a text file, extract org/company entities, and render a word cloud.

    :param file: path of the input txt file
    :return: None — the cloud is saved to ./ads_cloud4.jpg and displayed
    """
    print(file)
    # Read the whole txt file. Fix: the original `open(file).readlines()`
    # leaked the file handle; `with` guarantees it is closed.
    with open(file) as fh:
        text = ''.join([x.strip() for x in fh.readlines()])
    ners = fool.analysis(text)[1][0]
    comment_text = [x[3] for x in ners if x[2] in ['org', 'company']]
    # Join entities with spaces so WordCloud can tokenize them
    cut_text = ' '.join(comment_text)
    d = path.dirname(__file__)  # directory containing this file
    color_mask = imread("/Users/ajmd/Desktop/timg.jpeg")  # background/mask image
    cloud = WordCloud(
        # A CJK-capable font is required, otherwise glyphs render as boxes
        font_path="/Users/ajmd/Desktop/simsunttc/simsun.ttc",
        # background color
        background_color='white',
        # cloud shape mask
        mask=color_mask,
        # maximum number of words
        max_words=2000,
        # largest font size
        max_font_size=40)
    word_cloud = cloud.generate(cut_text)  # build the cloud
    print("Done")
    word_cloud.to_file("./ads_cloud4.jpg")  # save the rendered image
    # Display the word cloud
    plt.imshow(word_cloud)
    plt.axis('off')
    plt.show()
Example #8
0
    def get_entity_recognit_sentence(self, sentence, entity, dialogic_flag=False,input_filter=True,sentence_context=False):
        '''
        Run entity recognition on a single sentence and update the store.

        :param sentence: input sentence
        :param entity: entity storage object (updated in place and returned)
        :param dialogic_flag: dialogue flag; True marks a new dialogue,
            False marks a continuation of the current one
        :param input_filter: apply the input text filter before extraction
        :param sentence_context: merge sentence history in before NER
        :return: the updated entity storage object
        :raises TypeError: wraps any unexpected failure (original error logged)
        '''
        try:
            if input_filter:
                sentence = optimize_func.input_filter(sentence, 'sentence')
            # Extract IDs / money, then recognize addresses via the address library
            NE_dic = Entity_recognition.NE.sent_extract(sentence, dialogic_flag)
            entity.ID = NE_dic["ID"]
            entity.money = NE_dic["money"]
            if config.address_lib:
                entity = optimize_func.address_search(entity, sentence)
            if sentence_context:
                sentence = optimize_func.sentence_history(sentence, dialogic_flag)
            # print('***********************')
            # print("sentence ", sentence)

            # Named-entity recognition
            words, ners = fool.analysis(sentence)

            # print('ners:',ners)

            Logger.log_DEBUG.debug('实体识别结果')
            Logger.log_DEBUG.debug(ners)
            if len(ners[0]) >= 1:
                for ne in ners[0]:
                    # ne[2] is the entity type, ne[3] its text; strip spaces
                    ne3_temp = ne[3].replace(' ', '')
                    # skip anything already stored as a location
                    if ne3_temp in entity.location:
                        continue
                    if ne[2] == 'time' and ne3_temp not in entity.time:
                        entity.time.append(ne3_temp)
                    elif ne[2] == 'location' and ne3_temp not in entity.location:
                        # drop spoken filler words before storing
                        ne3_temp_fil = ne3_temp.replace('呃','').replace('那个','').replace('哪个','').replace('啊','')
                        if len(ne3_temp_fil)>1:
                            entity.location.append(ne3_temp_fil)
                    elif ne[2] == 'person' and ne3_temp not in entity.person:
                        ne3_temp_fil = ne3_temp.replace('呃', '').replace('那个', '').replace('哪个', '').replace('啊', '')
                        if len(ne3_temp_fil)>1:
                            entity.person.append(ne3_temp_fil)
                    elif ne[2] == 'company' and ne3_temp not in entity.company:
                        ne3_temp_fil = ne3_temp.replace('呃', '').replace('那个', '').replace('哪个', '').replace('啊', '')
                        if len(ne3_temp_fil)>1:
                            entity.company.append(ne3_temp_fil)

            # print('entity.location:',entity.location)
            # print('***********************')

            if config.is_filter:
                entity = optimize_func.filter_result(sentence,entity)
            return entity
        except Exception as e:
            s = "得到单句实体识别结果时发生异常get_entity_recognit_sentence" + str(e)
            Logger.log_ERROR.error(s)
            Logger.log_ERROR.exception(sys.exc_info())
            raise TypeError(s)
Example #9
0
def get_all_relevant_things(doc_dict: dict):
    """
    Count persons and organizations mentioned in every document.
    args:
        doc_dict: dict, {'docid': 'docline', ...}
    return:
        relevant_person: dict, {'docid': {'person': num shown, ...}, ...}
        relevant_org: dict, {'docid': {'org': num shown, ...}, ...}
    """
    relevant_person = {}
    relevant_org = {}
    for count, docid in enumerate(doc_dict, start=1):
        # progress marker every 500 documents
        if count % 500 == 0:
            print(count, "docs done at", time.asctime(time.localtime(time.time())))
        line = doc_dict[docid]
        _docid, title, _date, _url, body = line.split('|||')
        # analyze title and body together; merge both entity groups
        _words, ners = fool.analysis([title, body])
        merged = ners[0] + ners[1]
        relevant_person[docid] = Counter(e[3] for e in merged if e[2] == 'person')
        relevant_org[docid] = Counter(e[3] for e in merged if e[2] == 'org')
    return relevant_person, relevant_org
Example #10
0
def slot_fill(sentence, key=None):
    """Extract travel slots (date/time/from_city/to_city) from *sentence*.

    :param sentence: user utterance
    :param key: optional slot name; when "from_city"/"to_city", every
        location found is assigned to that slot
    :return: dict mapping slot names to extracted values
    """
    slot = {}
    words, ners = fool.analysis(sentence)
    to_city_flag = 0  # set once a destination city has been found
    for ner in ners[0]:

        if ner[2]=='time':

            date_content = re.findall(r'后天|明天|今天|大后天|周末|周一|周二|周三|周四|周五|周六|周日|本周一|本周二|本周三|本周四|本周五|本周六|本周日|下周一|下周二|下周三|下周四|下周五|下周六|下周日|这周一|这周二|这周三|这周四|这周五|这周六|这周日|\d{,2}月\d{,2}号|\d{,2}月\d{,2}日', ner[-1])
            slot["date"] = date_content[0] if date_content else ""

            time_content = re.findall(r'\d{,2}点\d{,2}分|\d{,2}点钟|\d{,2}点', ner[-1])

            pmam_content = re.findall(r'上午|下午|早上|晚上|中午|早晨', ner[-1])
            # Fix: the original chained conditionals parsed as
            # `pmam if pmam else (("" + time) if time else "")`, silently
            # dropping the clock time whenever an am/pm word was present.
            slot["time"] = ((pmam_content[0] if pmam_content else "") +
                            (time_content[0] if time_content else ""))

        if ner[2]=='location':

            if key is None:
                # "to/go to/return to <city>" marks the destination
                if re.findall(r'(到|去|回|回去)%s'%(ner[-1]), sentence):
                    to_city_flag = 1
                    slot["to_city"] = ner[-1]
                    continue
                # "from <city>" / "<city> departs" marks the origin
                if re.findall(r'从%s|%s出发'%(ner[-1], ner[-1]), sentence):
                    slot["from_city"] = ner[-1]
                elif to_city_flag==1:
                    # destination already known: the other city is the origin
                    slot["from_city"] = ner[-1]

            elif key in ["from_city", "to_city"]:
                slot[key] = ner[-1]

    return slot
Example #11
0
def get_keywords(line):
    '''
    Extract keywords (NER entity texts) from *line*.

    :param line: input sentence
    :return: list of recognized entity strings
    '''
    # Fix: removed a stray `fool.analysis` expression statement — it only
    # referenced the function without calling it and had no effect.
    load_dict('F:\\114代码\\i\\wordSegment\\kw.txt')
    word, keywords = fool.analysis(line)
    return [keyword[3] for keyword in keywords[0]]
Example #12
0
def tcut():
    """Small demo of the foolnltk API: NER, POS tagging and segmentation."""
    sample = "我在北京天安门"
    # named-entity recognition
    _, entities = fool.analysis(sample)
    print(entities)
    # POS tagging
    tagged = fool.pos_cut(sample)
    print(tagged)
    # drop any user dictionary, then plain segmentation
    fool.delete_userdict()
    print(fool.cut(sample))
Example #13
0
def fool_recognize(text):
    """Return the entity tuples fool recognizes in the first sentence of *text*."""
    _, ner_groups = fool.analysis(text)
    return ner_groups[0]
Example #14
0
def placeSingle(text):
    """Pick the most frequently mentioned location in *text*.

    Recognized locations are first resolved with cpca (Chinese
    province/city/area); anything cpca cannot parse is geocoded with
    arcgis and treated as a foreign location. The more frequent of the
    two groups wins.

    :param text: input text
    :return: (place, longitude, latitude, province, country) — all strings,
        all empty when nothing could be resolved
    """
    placeList = []        # cpca-resolved (domestic) place strings
    placeLists = []       # arcgis-resolved (foreign) addresses
    provinceList = []     # province per domestic place
    longitudeList = []    # lon/lat per foreign place
    latitudeList = []
    words, ners = fool.analysis(text)
    for itemSun in ners[0]:
        if itemSun[2] == 'location':
            title_str = [itemSun[3]]
            df = cpca.transform(title_str, cut=False)
            placenmpy = df.values
            places = placenmpy[0][0] + placenmpy[0][1] + placenmpy[0][2]  # place of occurrence
            if places != '':
                # province == city (e.g. municipalities): drop the duplicate
                if placenmpy[0][0] == placenmpy[0][1]:
                    places = placenmpy[0][1] + placenmpy[0][2]
                placeList.append(places)
                provinceList.append(placenmpy[0][0])
            else:
                # not a domestic address — try a generic geocoder
                places = geocoder.arcgis(itemSun[3])
                if places.latlng == None:
                    continue
                placeLists.append(places.address)
                longitudeList.append(str(round(places.lng, 2)))
                latitudeList.append(str(round(places.lat, 2)))
    # frequency of the most common entry in each group (0 when empty)
    if placeList != []:
        placedomestic = placeList.count(max(placeList, key=placeList.count))
    else:
        placedomestic = 0
    if placeLists != []:
        placeforeign = placeLists.count(max(placeLists, key=placeLists.count))
    else:
        placeforeign = 0
    if placedomestic == 0 and placeforeign == 0:
        place = ''
        longitude = ''
        latitude = ''
        province = ''
        country = ''
        return place, longitude, latitude, province, country
    if placedomestic >= placeforeign:
        # domestic location wins (ties go domestic)
        place = max(placeList, key=placeList.count)
        llat = geocode(place)
        longitude = llat[0]  # longitude of the place
        latitude = llat[1]   # latitude of the place
        indexdata = placeList.index(place)
        country = '中国'
        province = provinceList[indexdata]
        place = '中国' + place
        return place, longitude, latitude, province, country
    else:
        place = max(placeLists, key=placeLists.count)
        indexdata = placeLists.index(place)
        country = ''
        province = ''
        longitude = longitudeList[indexdata]
        latitude = latitudeList[indexdata]
        return place, longitude, latitude, province, country
Example #15
0
    def fun_clean(self, sentence):
        """Replace every recognized entity in *sentence* with its category label.

        Entities are processed longest-first so that longer entity strings
        are replaced before their substrings.
        """
        words, ners = fool.analysis(sentence)
        # Fix: list.sort() sorts in place and returns None — the original
        # assigned its result to `ners`, so the replacement loop never ran.
        # (Also normalized the broken indentation of the `if`.)
        ners = ners[0]
        ners.sort(key=lambda x: len(x[-1]), reverse=True)
        if ners:
            for ner in ners:
                sentence = sentence.replace(ner[-1], ' ' + ner[2] + ' ')
Example #16
0
def slot_fill(sentence, key=None):
    """
    Slot-filling function: search *sentence* for the required content.
    :param sentence: user utterance
    :param key: optional slot name; when "from_city"/"to_city", every
        location found is assigned to that slot
    :return slot: dict mapping slot names to extracted values
    """
    slot = {}
    # Named-entity recognition
    words, ners = fool.analysis(sentence)
    print(ners)
    to_city_flag = 0  # 1 once a destination city is found (the remaining
    # city in the sentence is then assumed to be the origin)

    for ner in ners[0]:
        # ---- time entities ----
        if ner[2] == 'time':
            # date keywords
            date_content = re.findall(
                r'后天|明天|今天|大后天|周末|周一|周二|周三|周四|周五|周六|周日|本周一|本周二|本周三|本周四|本周五|本周六|本周日|下周一|下周二|下周三|下周四|下周五|下周六|下周日|这周一|这周二|这周三|这周四|这周五|这周六|这周日|\d{,2}月\d{,2}号|\d{,2}月\d{,2}日',
                ner[-1])
            slot["date"] = date_content[0] if date_content else ""

            # clock-time keywords
            time_content = re.findall(r'\d{,2}点\d{,2}分|\d{,2}点钟|\d{,2}点',
                                      ner[-1])
            # am/pm keywords
            pmam_content = re.findall(r'上午|下午|早上|晚上|中午|早晨', ner[-1])
            # Fix: the original chained conditionals parsed as
            # `pmam if pmam else (("" + time) if time else "")`, silently
            # dropping the clock time whenever an am/pm word was present.
            slot["time"] = ((pmam_content[0] if pmam_content else "") +
                            (time_content[0] if time_content else ""))

        # ---- location entities ----
        if ner[2] == 'location':
            # no specific slot requested
            if key is None:
                if re.findall(r'(到|去|回|回去)%s' % (ner[-1]), sentence):
                    to_city_flag = 1
                    slot["to_city"] = ner[-1]
                    continue
                if re.findall(r'从%s|%s出发' % (ner[-1], ner[-1]), sentence):
                    slot["from_city"] = ner[-1]
                elif to_city_flag == 1:
                    slot["from_city"] = ner[-1]
            # a specific slot was requested
            elif key in ["from_city", "to_city"]:
                slot[key] = ner[-1]

    return slot
def fool_ner(text):  # recognize organizations with foolnltk; return company names
    """Return the de-duplicated company names fool recognizes in *text*."""
    _, ner_groups = fool.analysis(text)
    found = set()
    for group in ner_groups:
        for ent in group:
            if ent[2] == 'company':
                # drop embedded spaces from the entity text
                found.add(re.sub(' ', '', ent[3]))
    return list(found)
Example #18
0
def processSentence(sentence):
    """Print segmentation, POS tags and NER results for *sentence*.

    Analyzer failures are ignored (best-effort demo helper).
    """
    try:
        print(fool.cut(sentence))
        print(fool.pos_cut(sentence))
        words, ners = fool.analysis(sentence)
        print(words,ners)
    except Exception:
        # Fix: narrowed from a bare `except:` so KeyboardInterrupt and
        # SystemExit are no longer swallowed; analyzer errors still are.
        pass
Example #19
0
def location_parse(question):
    """Classify the locations recognized in *question* as domestic or foreign.

    :return: list of [tag, name] pairs, tag in {'domestic', 'foreign'}
    """
    _, ner_groups = fool.analysis(question)
    location_list = []
    for group in ner_groups:
        for ent in group:
            if ent[2] != 'location':
                continue
            name = ent[3]
            if name in province:
                location_list.append(['domestic', name])
            if name in foreign_country:
                location_list.append(['foreign', name])
    return location_list
Example #20
0
def who_answer(s, flag):
    """Answer "who" questions over the entities recognized in *s*.

    flag == 1: return the first person entity found.
    otherwise: concatenate every person/org/company entity text.
    """
    _, ner_groups = fool.analysis(s)
    collected = ''
    for ent in ner_groups[0]:
        if flag == 1:
            if 'person' in ent[2]:
                return ent[3]
        else:
            for label in ('person', 'org', 'company'):
                if label in ent[2]:
                    collected = collected + ent[3]
    return collected
Example #21
0
def slot_fill(sentence, key=None):
    # Goal: slot-filling function (search *sentence* for the needed content)
    # input: sentence (user utterance), key (restrict extraction to that slot only)
    # output: slot (dict returned as JSON-like mapping, slot name -> value)

    slot = {}
    # Run named-entity recognition
    words, ners = fool.analysis(sentence)

    # TODO: find the needed content in sentence and fill the slots
    # (stub — the extraction logic below is not implemented yet)
    """
    TODO:从sentence中寻找需要的内容,完成填槽工作
    """

    
    return slot
Example #22
0
def find_full_name(soup):
    # Find the company that issued the announcement.
    # Only the first 110 characters are analyzed — presumably the header
    # region naming the issuer; TODO confirm against the input pages.
    soupcontent = re.sub('\n|\s', '', soup.get_text())
    _, nt = fool.analysis(soupcontent[0:110])
    nt = nt[0]
    # candidate issuer names: company/org entities
    fullname_ls = [x[3] for x in nt if x[2] == 'company' or x[2] == 'org']
    # lengths of (at most) the first two candidates
    fullname_ls_len = [len(x) for x in fullname_ls[:min(2, len(fullname_ls))]]
    if len(fullname_ls_len):
        # NOTE(review): the [:max(2, ...)] slice always covers the whole
        # list, so this effectively picks the longer of the first two
        # candidates — verify the min/max pair is intentional.
        fullname = fullname_ls[:max(2, len(fullname_ls))][
            fullname_ls_len.index(max(fullname_ls_len))]
        if fullname:
            return (fullname)
        else:
            return ('公告里没有公司')
    return ('公告里没有公司')
Example #23
0
def analyzeInfo(item):
    """Parse one news list item (a BeautifulSoup node), extract its metadata
    and mentioned locations, and insert the record into MySQL.

    NOTE(review): relies on module-level helpers mysqlCommand, get_original
    and toYc — confirm against the surrounding module.
    """
    address = ''
    result = {}
    a_list = item.find_all('a', limit=1)
    h3_list = item.find_all('h3', attrs={'class': 'tit'}, limit=1)
    span_link_list = h3_list[0].find_all('span', limit=1)
    div_list = item.find_all('div', attrs={'class': 'src-tim'}, limit=1)
    span1_list = div_list[0].find_all('span', attrs={'class': 'src'}, limit=1)
    span2_list = div_list[0].find_all('span', attrs={'class': 'tim'}, limit=1)
    dataCount = int(mysqlCommand.getLastId()) + 1
    result['id'] = str(dataCount)
    result['disasterid'] = '0008'  # category: debris flow
    result['link'] = span_link_list[0]['lanmu1']
    result['title'] = a_list[0].get_text().strip()  # news title
    result['releaseTime'] = re.sub("\D", "",
                                   span2_list[0].get_text().strip())  # release time (digits only)
    result['source'] = span1_list[0].get_text().strip().replace('来源:',
                                                                '')  # news source
    result['originalText'] = get_original(result['link'])  # full article text
    title_str = [result['originalText']]
    words, ners = fool.analysis(title_str)
    # collect location entities until one repeats
    for itemSun in ners[0]:
        if itemSun[2] == 'location':
            if itemSun[3] in address:
                break
            else:
                address = address + itemSun[3] + ','
    result['place'] = address  # place of occurrence
    result['longitude'] = '0'  # longitude
    result['latitude'] = '0'  # latitude
    result['strength'] = ''  # disaster strength
    result['occurTime'] = ''  # occurrence time
    originalText = result['originalText'] + ',' + result['title']
    death = toYc.death(originalText)
    injured = toYc.Injured(originalText)
    result['injured'] = str(injured)  # number injured
    result['death'] = str(death)  # number dead
    result['loss'] = '0'  # economic loss
    result['pictures'] = ''  # multiple paths separated by semicolons
    result['more'] = ''  # extra field
    try:
        # Insert the record; existing records are not re-inserted
        title = 'debrisFlow'
        res = mysqlCommand.insertData(result, title)
        if res:
            dataCount = res
    except Exception as e:
        print("插入数据失败", str(e))  # report the failed insert
Example #24
0
def query(text):
    """Answer *text* against the RDF graph, using Datalog kinship inference.

    NOTE(review): relies on module globals (g, datas, X, R, Y, search,
    add_data, sqlsent02, SentAna, vbose, abbr, assert_fact, load,
    relation) — confirm against the surrounding module.
    """
    global datas
    # Add every recognized entity's search results to the graph.
    # (The inner loop reuses the name `i`; iteration itself is unaffected,
    # but the outer value is clobbered within each pass.)
    for i in fool.analysis(text)[1]:
        for j in i:
            for i in search(j[3]):
                add_data(j[3], i[0], i[1], g)
    ss = sqlsent02.sqlsent(text)
    #print(ss,'???')
    # Resolve nested possessive sub-questions recursively
    for i in ss:
        if '的' in i:
            ss[ss.index(i)] = query(i + '是什么')
    #print(ss,'--')
    ss[ss.index('L')] = '?x'
    q = "select ?x where { " + vbose(ss[0]) + " ?x ?y}"
    x = g.query(q)
    t = 0
    st = ''
    # Pick the predicate most similar to the asked relation
    for i in list(x):
        simi = SentAna.vector_similarity(abbr(i[0]), ss[1])
        if simi >= t:
            st = abbr(i[0])
            t = simi
    ss[1] = st
    # Logical inference
    for subj, pred, obj in g:  # pull triples out of the RDF graph
        assert_fact("relation", abbr(subj), abbr(pred),
                    abbr(obj))  # load them into the Datalog database
    load("relation(X,'爷爷',Z) <= relation(X,'父亲',Y) & relation(Y,'父亲',Z)"
         )  # define the inference rules (kinship)
    load("relation(Y,'孙子',X) <= relation(X,'爷爷',Y)")
    load("relation(Y,'丈夫',X) <= relation(X,'妻子',Y)")
    load("relation(Y,'儿子',X) <= relation(X,'父亲',Y)")
    load("relation(Y,'奶奶',Z) <= relation(Y,'爷爷',X)& relation(X,'配偶',Z)")
    load("relation(X,'儿媳',Z) <= relation(X,'孙子',Y)& relation(Y,'母亲',Z)")
    load("relation(X,'亲属',Y) <= relation(X,'孙子',Y)")
    load("relation(X,'亲属',Y) <= relation(X,'母亲',Y)")
    load("relation(X,'亲属',Y) <= relation(X,'奶奶',Y)")
    load("relation(X,'亲属',Y) <= relation(X,'儿子',Y)")
    load("relation(X,'亲属',Y) <= relation(X,'爷爷',Y)")
    load("relation(X,'亲属',Y) <= relation(X,'父亲',Y)")
    load("relation(X,'亲属',Y) <= relation(Y,'亲属',X)")
    # Materialize the inferred relations back into the RDF graph
    for i in relation(X, R, Y):
        add_data(i[0], i[1], i[2], g)
    q = "select ?x where { " + vbose(ss[0]) + ' ' + vbose(ss[1]) + ' ' + vbose(
        ss[2]) + "}"
    x = g.query(q)
    datas = [abbr(i[0]) for i in x]
    return abbr(list(x)[0][0])
Example #25
0
 def handle_article(self, article, create_time, crawling_time, mongo_id):
     """Join the article paragraphs, run NER and print normalized entities.

     :param article: list of paragraph strings (joined before analysis)
     :param create_time: creation timestamp (printed as-is)
     :param crawling_time: crawl timestamp (printed as-is)
     :param mongo_id: source document id (printed as-is)
     """
     article = ''.join(article)
     words, ner = fool.analysis(article)
     # item_msg is one entity tuple: (start, end, type, text)
     for item_msg in ner[0]:
         if item_msg[2] in self.type_list:
             type_word = list(item_msg)
             # Normalize labels: company -> org, location -> place
             if type_word[2] == 'company':
                 type_word[2] = 'org'
             elif type_word[2] == 'location':
                 type_word[2] = 'place'
             ner_data = str(type_word[3])
             # Strip ASCII punctuation from the entity text
             r = '[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+'
             ner_text = re.sub(r, '', ner_data)
             print(type_word[2] + 'NER' + ner_text + '\t' + 'create_time' +
                   create_time + '&' + crawling_time + 'ID:' + mongo_id)
Example #26
0
def deal(sets):
    """Build (subject, predicate, object) triples from dependency slots.

    *sets* maps dependency labels (SBV/ATT/VOB/POB) to (word, head) pairs;
    every triple produced is printed and collected.
    """
    rdfs = []
    labels = sets.keys()
    if 'SBV' in labels and 'ATT' in labels:
        triple = (sets['ATT'][0], sets['ATT'][1], sets['SBV'][0])
        rdfs.append(triple)
        print(*triple)
    if 'SBV' in labels and 'VOB' in labels:
        triple = (sets['SBV'][0], sets['VOB'][1], sets['VOB'][0])
        rdfs.append(triple)
        print(*triple)
    if 'ATT' in labels and 'VOB' in labels:
        # avoid self-referential attribute triples
        if sets['ATT'][1] != sets['VOB'][0]:
            triple = (sets['ATT'][0], sets['ATT'][1], sets['VOB'][0])
            rdfs.append(triple)
            print(*triple)
    if 'SBV' in labels and 'POB' in labels:
        # skip prepositional objects recognized as time expressions
        if fool.analysis(sets['POB'][0])[1][0][0][2] != 'time':
            triple = (sets['SBV'][0], sets['SBV'][1], sets['POB'][0])
            rdfs.append(triple)
            print(*triple)
    return rdfs
Example #27
0
def slot_fill(sentence, key=None):
    # Goal: slot-filling function (search *sentence* for the needed content)
    # input: sentence (user utterance), key (restrict extraction to that slot only)
    # output: slot (dict, slot name -> extracted value)

    slot = {}
    # Named-entity recognition
    words, ners = fool.analysis(sentence)
    to_city_flag = 0  # 1 once a destination city is found
    for ner in ners[0]:
        if ner[2] == 'time':
            # date keywords
            date_content = re.findall(
                r'后天|明天|今天|大后天|周末|周一|周二|周三|周四|周五|周六|周日|本周一|本周二|本周三|本周四|本周五|本周六|本周日|下周一|下周二|下周三|下周四|下周五|下周六|下周日|这周一|这周二|这周三|这周四|这周五|这周六|这周日|\d{,2}月\d{,2}号|\d{,2}月\d{,2}日',
                ner[-1])
            slot["date"] = date_content[0] if date_content else ""

            # clock-time keywords
            time_content = re.findall(r'\d{,2}点\d{,2}分|\d{,2}点钟|\d{,2}点',
                                      ner[-1])

            # am/pm keywords
            pmam_content = re.findall(r'上午|下午|早上|晚上|中午|早晨', ner[-1])

            # Fix: the original chained conditionals parsed as
            # `pmam if pmam else (("" + time) if time else "")`, silently
            # dropping the clock time whenever an am/pm word was present.
            slot["time"] = ((pmam_content[0] if pmam_content else "") +
                            (time_content[0] if time_content else ""))

        if ner[2] == 'location':
            if key is None:
                if re.findall(r'(到|去|回|回去)%s' % (ner[-1]), sentence):
                    to_city_flag = 1
                    slot["to_city"] = ner[-1]
                    continue
                if re.findall(r'从%s|%s出发' % (ner[-1], ner[-1]), sentence):
                    slot["from_city"] = ner[-1]
                elif to_city_flag == 1:
                    slot["from_city"] = ner[-1]
            # a specific slot was requested
            elif key in ["from_city", "to_city"]:
                slot[key] = ner[-1]
            # origin/destination slot filling complete

    return slot
Example #28
0
def tages(x):
    """Extract company names containing 公司 from *x*.

    Returns a set of distinct company-like names as soon as between 3 and 7
    have been collected; implicitly returns None when the threshold is
    never reached.
    """
    q = fool.analysis(x)
    # company entity texts from the first sentence
    s = [i[3] for i in q[1][0] if i[2] == 'company']

    # (earlier filtering heuristics, kept for reference)
    #if re.findall('央|行|证券|德普|吉恩|期货|酒|美联社|路透社|伊朗|普京|律师|基金|白马股|代写投标书|[a-zA-Z]|\*|[\xa0\u3000]',str(s)):
    #   return None
    #elif re.findall('[[\u4e00-\u9fa5]{0,25}公司]',str(s)):
    #return set(s)

    # drop ordinary and full-width spaces
    for i in range(len(s)):
        s[i] = s[i].replace(' ', '').replace('\u3000', '')

    com = []
    for i in s:
        # keep CJK names ending in 公司 with no ASCII alphanumerics
        if re.findall('[\u4e00-\u9fa5][^0-9a-zA-Z_]{0,25}公司', str(i)):
            com.append(i)
        # NOTE(review): checked inside the loop, so the function returns as
        # soon as the distinct count enters the 3..7 window
        if len(set(com)) > 2 and len(set(com)) < 8:
            return set(com)
Example #29
0
 def fun_clean(self, sentence):
     """
     Preprocess a user utterance.
     :param sentence: raw user input
     :return: cleaned, space-separated token string
     """
     # Named-entity recognition with foolnltk
     words, ners = fool.analysis(sentence)
     # Sort entities longest-first so longer entity strings are replaced
     # before their substrings. Fix: list.sort() sorts in place and returns
     # None — the original assigned its result to `ners`, so the
     # replacement loop below never ran.
     ners = ners[0]
     ners.sort(key=lambda x: len(x[-1]), reverse=True)
     # Replace each recognized entity with its category label, so entities
     # of a kind are treated as one shared feature.
     if ners:
         for ner in ners:
             sentence = sentence.replace(ner[-1], ' ' + ner[2] + ' ')
     # Segment and drop stopwords
     word_lst = [w for w in fool.cut(sentence)[0] if w not in stopwords]
     output_str = ' '.join(word_lst)
     output_str = re.sub(r'\s+', ' ', output_str)
     return output_str.strip()
Example #30
0
def location_parse(question):
    """Classify the locations mentioned in *question*.

    :return: list of [tag, name] pairs with tag in
        {'domestic', 'foreign', 'continent', 'china'}
    """
    _, ner_groups = fool.analysis(question)
    location_list = []
    for group in ner_groups:
        for ent in group:
            if ent[2] != 'location':
                continue
            name = ent[3]
            # check the containers in the same order as the original
            for container, tag in ((province, 'domestic'),
                                   (foreign_country, 'foreign'),
                                   (continent, 'continent')):
                if name in container:
                    location_list.append([tag, name])
            if name == '中国':
                location_list.append(['china', name])
    # second pass with jieba to catch foreign countries NER missed
    for token in jieba.cut(question.encode('utf-8')):
        if token in foreign_country:
            location_list.append(['foreign', token])
    return location_list
Example #31
0
#!/usr/bin/env python
# -*-coding:utf-8-*-

# Demo of the foolnltk API: segmentation, user dictionaries, POS tagging
# and named-entity recognition.

import fool

text = ["我在北京天安门看你难受香菇,一一千四百二十九", "我在北京晒太阳你在非洲看雪", "千年不变的是什么", "我在北京天安门。"]

# segmentation without any user dictionary
print("no dict:", fool.cut(text, ignore=True))
# load a user dictionary and segment again
fool.load_userdict("./test_dict.txt")
print("use dict: ", fool.cut(text))
# remove the user dictionary; back to default segmentation
fool.delete_userdict()
print("delete dict:", fool.cut(text))

# POS tagging
pos_words =fool.pos_cut(text)
print("pos result", pos_words)

# analysis() returns (words, named entities)
words, ners = fool.analysis(text)
print("ners: ", ners)

# ner() runs entity recognition alone
ners = fool.ner(text)
print("ners:", ners)