def segment(txt):  # word segmentation combined with NER
    # entity texts recognized in txt; each entity tuple is (start, end, type, text)
    ner = [i[3] for i in fool.analysis(txt)[1][0]]
    words = []
    # mark entity boundaries with '|||' so entities survive as whole chunks
    for i in ner:
        txt = txt.replace(i[0], '|||' + i[0]).replace(i[-1], i[-1] + '|||')
    txt = txt.split('|||')
    for i in txt:
        if i in ner:
            words.append(i)
        else:
            for j in fool.analysis(i)[0][0]:
                words.append(j[0])
    return words

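# A minimal usage sketch for segment() above. The sample sentence and the
# expected result are assumptions, not taken from the original code: the
# snippets in this collection index fool.analysis() results as
# (pos_lists, ner_lists), with each entity tuple shaped like
# (start, end, type, text), which is why segment() reads index 3.
if __name__ == '__main__':
    import fool
    print(segment('我在北京天安门'))
    # expected shape: a flat word list in which a recognized entity such as
    # '北京天安门' survives as a single token instead of being split
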
def pos_tag():
    try:
        data = json.loads(request.get_data(as_text=True))
        question = data['query']
    except TypeError as te:
        log.error(te)
        return get_result_response(
            EasyDict({
                'code': ResponseStatus.TYPE_ERROR,
                'msg': 'The server received a parameter error'
            }))
    except Exception as e:
        log.error(e)
        return get_result_response(
            EasyDict({
                'code': ResponseStatus.OTHER,
                'msg': 'The server hit an unknown error while receiving the parameters'
            }))
    log.info("ip:{}".format(request.remote_addr))
    out = fool.analysis(question)
    answer = [p[0] for p in out[0][0]]
    tag = [p[1] for p in out[0][0]]
    log.info("question:{}".format(question))
    log.info("answer:{}".format(out))
    return get_result_response(
        EasyDict({
            'code': ResponseStatus.SUCCESS,
            'msg': 'Success',
            'answer': answer,
            'tag': tag
        }))

def analysis(self, article, create_time, crawling_time, mongo_id):
    '''
    :param article: list of article strings, e.g. ["本报讯...", ""]
    :return:
    '''
    # file modes: "a" append, "w" write, "r" read
    # fw = open("./data/ner_out.txt", "a", encoding="utf-8")
    for a in article:
        each_article = a.replace("\n", "")
        if len(each_article) != 0:
            words, ner = fool.analysis(each_article)
            # ner holds one entity list per input text; take the first
            for i in ner[0]:
                if i[2] in self.permit_name:
                    i = list(i)
                    if i[2] == 'company':
                        i[2] = 'org'
                    elif i[2] == 'location':
                        i[2] = 'place'
                    ner_text = str(i[3])
                    r = '[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+'
                    ner_text = re.sub(r, '', ner_text)
                    print(str(i[2]) + 'NER' + ner_text + '\t' +
                          'create_time' + create_time + '&' + crawling_time +
                          'ID:' + mongo_id + '\n')

def word2vec(line):
    word2id_list = [0] * len(vocab_dict)
    entities = {}
    for x in person_names:
        if x in line:
            line = line.replace(x, " nnt ")
            entities[0] = x
    for x in movie_names:
        if x in line:
            line = line.replace(x, " nm ")
            entities[1] = x
    for x in genre_names:
        if x in line:
            line = line.replace(x, " ng ")
            entities[2] = x
    words, ner = fool.analysis(line)
    for entity in ner[0]:
        if entity[2] == "person" or entity[2] == "company":
            line = line.replace(entity[3], " nnt ")
    for word in fool.cut(line)[0]:
        # for word in list(jieba.cut(line)):
        try:
            word2id_list[int(vocab_dict[word])] = 1
        except KeyError:
            pass
    return word2id_list, entities

def analysis(article):
    '''
    :param article: list of article strings, e.g. ["本报讯...", ""]
    :return:
    '''
    # file modes: "a" append, "w" write, "r" read
    fw = open("./data/ner_out.txt", "a", encoding="utf-8")
    for a in article:
        each_article = a.replace("\n", "")
        if len(each_article) != 0:
            words, ner = fool.analysis(each_article)
            # ner holds one entity list per input text; take the first
            for i in ner[0]:
                if i[2] in ('org', 'company', 'person', 'location'):
                    i = list(i)
                    if i[2] == 'company':
                        i[2] = 'org'
                    elif i[2] == 'location':
                        i[2] = 'place'
                    ner_end = str(i[2:4])
                    fw.write(ner_end + '\n')
    fw.close()

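# Hedged driver for analysis() above; the article text is invented, and the
# ./data directory is assumed to exist so ner_out.txt can be appended to:
if __name__ == '__main__':
    analysis(["本报讯,北京某公司今日发布了新产品。"])
    # each written line has the shape str(i[2:4]), e.g. "['org', '北京某公司']"
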
def predict(date, content):
    print("GET data:-----------")
    print(date, content)
    data = [get_data(content)]
    base_shape = [[len(c) for c in data]]  # word count of each sentence
    # build the prediction input
    tensor_words = fluid.create_lod_tensor(data, base_shape, place)
    # run inference
    result = exe.run(program=infer_program,
                     feed={feeded_var_names[0]: tensor_words},
                     fetch_list=target_var)
    # assemble the output
    Dict = {'content': content, "rumor": 1, "location": "", "org": "",
            "company": "", "person": "", "job": ""}
    lab = np.argsort(result)[0][0][-1]  # label with the highest probability
    Dict['rumor'] = lab
    words, ners = fool.analysis(content)
    for entity in ners[0]:
        if 'location' in entity:
            Dict['location'] = entity[3]
        # elif 'time' in entity:
        #     Dict['time'] = entity[3]
        elif 'org' in entity:
            Dict['org'] = entity[3]
        elif 'company' in entity:
            Dict['company'] = entity[3]
        elif 'person' in entity:
            Dict['person'] = entity[3]
        elif 'job' in entity:
            Dict['job'] = entity[3]
    return Dict

def draw_wordcloud(file):
    print(file)
    # read a txt file into one string
    text = ''.join([x.strip() for x in open(file).readlines()])
    ners = fool.analysis(text)[1][0]
    comment_text = [x[3] for x in ners if x[2] in ['org', 'company']]
    # join the entity strings with spaces; without segmentation a correct
    # Chinese word cloud cannot be generated
    cut_text = ' '.join(comment_text)
    d = path.dirname(__file__)  # directory containing this file
    color_mask = imread("/Users/ajmd/Desktop/timg.jpeg")  # background image
    cloud = WordCloud(
        # set a font, otherwise Chinese renders as garbled boxes
        font_path="/Users/ajmd/Desktop/simsunttc/simsun.ttc",
        # background color
        background_color='white',
        # word cloud shape
        mask=color_mask,
        # maximum number of words
        max_words=2000,
        # maximum font size
        max_font_size=40)
    word_cloud = cloud.generate(cut_text)  # build the word cloud
    print("Done")
    word_cloud.to_file("./ads_cloud4.jpg")  # save the image
    # display the word cloud
    plt.imshow(word_cloud)
    plt.axis('off')
    plt.show()

def get_entity_recognit_sentence(self, sentence, entity, dialogic_flag=False,
                                 input_filter=True, sentence_context=False):
    '''
    Get the entity recognition result for a single sentence.
    :param sentence: the sentence
    :param entity: entity storage object
    :param dialogic_flag: dialogue flag; True marks a new dialogue, False the original one
    :return: the updated entity storage object
    '''
    try:
        if input_filter:
            sentence = optimize_func.input_filter(sentence, 'sentence')
        # recognize addresses using the address library
        NE_dic = Entity_recognition.NE.sent_extract(sentence, dialogic_flag)
        entity.ID = NE_dic["ID"]
        entity.money = NE_dic["money"]
        if config.address_lib:
            entity = optimize_func.address_search(entity, sentence)
        if sentence_context:
            sentence = optimize_func.sentence_history(sentence, dialogic_flag)
        # named entity recognition
        words, ners = fool.analysis(sentence)
        Logger.log_DEBUG.debug('entity recognition result')
        Logger.log_DEBUG.debug(ners)
        if len(ners[0]) >= 1:
            for ne in ners[0]:
                ne3_temp = ne[3].replace(' ', '')
                if ne3_temp in entity.location:
                    continue
                if ne[2] == 'time' and ne3_temp not in entity.time:
                    entity.time.append(ne3_temp)
                elif ne[2] == 'location' and ne3_temp not in entity.location:
                    # strip spoken filler words before storing
                    ne3_temp_fil = ne3_temp.replace('呃', '').replace('那个', '').replace('哪个', '').replace('啊', '')
                    if len(ne3_temp_fil) > 1:
                        entity.location.append(ne3_temp_fil)
                elif ne[2] == 'person' and ne3_temp not in entity.person:
                    ne3_temp_fil = ne3_temp.replace('呃', '').replace('那个', '').replace('哪个', '').replace('啊', '')
                    if len(ne3_temp_fil) > 1:
                        entity.person.append(ne3_temp_fil)
                elif ne[2] == 'company' and ne3_temp not in entity.company:
                    ne3_temp_fil = ne3_temp.replace('呃', '').replace('那个', '').replace('哪个', '').replace('啊', '')
                    if len(ne3_temp_fil) > 1:
                        entity.company.append(ne3_temp_fil)
        if config.is_filter:
            entity = optimize_func.filter_result(sentence, entity)
        return entity
    except Exception as e:
        s = "exception in get_entity_recognit_sentence: " + str(e)
        Logger.log_ERROR.error(s)
        Logger.log_ERROR.exception(sys.exc_info())
        raise TypeError(s)

def get_all_relevant_things(doc_dict: dict):
    """
    Find the relevant persons and relevant organizations for all docs.
    args:
        doc_dict: dict, {'docid': 'docline', ...}
    return:
        relevant_person: dict, {'docid': {'person': num shown, ...}, ...}
        relevant_org: dict, {'docid': {'org': num shown, ...}, ...}
    """
    relevant_person = dict()
    relevant_org = dict()
    c = 0
    for docid in doc_dict:
        c += 1
        if c % 500 == 0:
            print(c, "docs done at", time.asctime(time.localtime(time.time())))
        doc_line = doc_dict[docid]
        _docid, doc_title, _doc_date, _doc_url, doc_text = doc_line.split('|||')
        _words, ners = fool.analysis([doc_title, doc_text])
        # merge the entity lists of title and body
        ners = ners[0] + ners[1]
        person_list = [t4[3] for t4 in ners if t4[2] == 'person']
        org_list = [t4[3] for t4 in ners if t4[2] == 'org']
        relevant_person[docid] = Counter(person_list)
        relevant_org[docid] = Counter(org_list)
    return relevant_person, relevant_org

def slot_fill(sentence, key=None):
    slot = {}
    words, ners = fool.analysis(sentence)
    to_city_flag = 0
    for ner in ners[0]:
        if ner[2] == 'time':
            date_content = re.findall(
                r'后天|明天|今天|大后天|周末|周一|周二|周三|周四|周五|周六|周日|本周一|本周二|本周三|本周四|本周五|本周六|本周日|下周一|下周二|下周三|下周四|下周五|下周六|下周日|这周一|这周二|这周三|这周四|这周五|这周六|这周日|\d{,2}月\d{,2}号|\d{,2}月\d{,2}日',
                ner[-1])
            slot["date"] = date_content[0] if date_content else ""
            time_content = re.findall(r'\d{,2}点\d{,2}分|\d{,2}点钟|\d{,2}点', ner[-1])
            pmam_content = re.findall(r'上午|下午|早上|晚上|中午|早晨', ner[-1])
            # parenthesize both conditionals so the two parts concatenate
            slot["time"] = (pmam_content[0] if pmam_content else "") + \
                           (time_content[0] if time_content else "")
        if ner[2] == 'location':
            if key is None:
                if re.findall(r'(到|去|回|回去)%s' % (ner[-1]), sentence):
                    to_city_flag = 1
                    slot["to_city"] = ner[-1]
                    continue
                if re.findall(r'从%s|%s出发' % (ner[-1], ner[-1]), sentence):
                    slot["from_city"] = ner[-1]
                elif to_city_flag == 1:
                    slot["from_city"] = ner[-1]
            elif key in ["from_city", "to_city"]:
                slot[key] = ner[-1]
    return slot

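# Hedged usage sketch for slot_fill() above; the sentence is made up, and the
# exact slot values depend on how foolnltk segments the time and location
# entities:
if __name__ == '__main__':
    print(slot_fill('明天上午10点从北京到上海'))
    # plausible shape: {'date': '明天', 'time': '上午10点',
    #                   'from_city': '北京', 'to_city': '上海'}
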
def get_keywords(line):
    '''
    Extract keywords.
    '''
    load_dict('F:\\114代码\\i\\wordSegment\\kw.txt')
    word, keywords = fool.analysis(line)
    return [keyword[3] for keyword in keywords[0]]

def tcut():
    text = "我在北京天安门"
    words, ners = fool.analysis(text)
    print(ners)
    words = fool.pos_cut(text)
    print(words)
    fool.delete_userdict()
    print(fool.cut(text))

def fool_recognize(text):
    # ners = fool.ner(text)
    words, ners = fool.analysis(text)
    ners = ners[0]
    return ners

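# The snippets here consistently treat each entity as a 4-tuple
# (start, end, type, text); a small sketch that unpacks fool_recognize()
# output under that assumption:
def print_entities(text):
    for start, end, etype, etext in fool_recognize(text):
        print('%s\t%s (chars %d-%d)' % (etype, etext.strip(), start, end))
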
def placeSingle(text):
    placeList = []
    placeLists = []
    provinceList = []
    longitudeList = []
    latitudeList = []
    words, ners = fool.analysis(text)
    for itemSun in ners[0]:
        if itemSun[2] == 'location':
            title_str = [itemSun[3]]
            df = cpca.transform(title_str, cut=False)
            placenmpy = df.values
            places = placenmpy[0][0] + placenmpy[0][1] + placenmpy[0][2]  # place of occurrence
            if places != '':
                if placenmpy[0][0] == placenmpy[0][1]:
                    places = placenmpy[0][1] + placenmpy[0][2]
                placeList.append(places)
                provinceList.append(placenmpy[0][0])
            else:
                places = geocoder.arcgis(itemSun[3])
                if places.latlng is None:
                    continue
                placeLists.append(places.address)
                longitudeList.append(str(round(places.lng, 2)))
                latitudeList.append(str(round(places.lat, 2)))
    if placeList != []:
        placedomestic = placeList.count(max(placeList, key=placeList.count))
    else:
        placedomestic = 0
    if placeLists != []:
        placeforeign = placeLists.count(max(placeLists, key=placeLists.count))
    else:
        placeforeign = 0
    if placedomestic == 0 and placeforeign == 0:
        place = ''
        longitude = ''
        latitude = ''
        province = ''
        country = ''
        return place, longitude, latitude, province, country
    if placedomestic >= placeforeign:
        place = max(placeList, key=placeList.count)
        llat = geocode(place)
        longitude = llat[0]  # longitude of the place
        latitude = llat[1]
        indexdata = placeList.index(place)
        country = '中国'
        province = provinceList[indexdata]
        place = '中国' + place
        return place, longitude, latitude, province, country
    else:
        place = max(placeLists, key=placeLists.count)
        indexdata = placeLists.index(place)
        country = ''
        province = ''
        longitude = longitudeList[indexdata]
        latitude = latitudeList[indexdata]
        return place, longitude, latitude, province, country

def fun_clean(self, sentence):
    words, ners = fool.analysis(sentence)
    # sort entities by text length, longest first, so longer entities are
    # replaced before any of their substrings (sorted() keeps the list,
    # whereas list.sort() would return None)
    ners = sorted(ners[0], key=lambda x: len(x[-1]), reverse=True)
    if ners:
        for ner in ners:
            sentence = sentence.replace(ner[-1], ' ' + ner[2] + ' ')
    return sentence

def slot_fill(sentence, key=None): """ 填槽函数(该函数从sentence中寻找需要的内容,完成填槽工作) :param sentence: :return slot: (填槽的结果) """ slot = {} # 进行实体识别 words, ners = fool.analysis(sentence) print(ners) to_city_flag = 0 # flag为1代表找到到达城市(作用:当找到到达城市时,默认句子中另一个城市信息是出发城市) for ner in ners[0]: # 首先对time类别的实体进行信息抽取填槽工作 if ner[2] == 'time': # -------------------- # 寻找日期的关键词 date_content = re.findall( r'后天|明天|今天|大后天|周末|周一|周二|周三|周四|周五|周六|周日|本周一|本周二|本周三|本周四|本周五|本周六|本周日|下周一|下周二|下周三|下周四|下周五|下周六|下周日|这周一|这周二|这周三|这周四|这周五|这周六|这周日|\d{,2}月\d{,2}号|\d{,2}月\d{,2}日', ner[-1]) slot["date"] = date_content[0] if date_content else "" # 完成日期的填槽 # -------------------- # -------------------- # 寻找具体时间的关键词 time_content = re.findall(r'\d{,2}点\d{,2}分|\d{,2}点钟|\d{,2}点', ner[-1]) # 寻找上午下午的关键词 pmam_content = re.findall(r'上午|下午|早上|晚上|中午|早晨', ner[-1]) slot["time"] = pmam_content[ 0] if pmam_content else "" + time_content[ 0] if time_content else "" # 完成时间的填槽 # -------------------- # 对location类别对实体进行信息抽取填槽工作 if ner[2] == 'location': # -------------------- # 开始对城市填槽 # 如果没有指定槽位 if key is None: if re.findall(r'(到|去|回|回去)%s' % (ner[-1]), sentence): to_city_flag = 1 slot["to_city"] = ner[-1] continue if re.findall(r'从%s|%s出发' % (ner[-1], ner[-1]), sentence): slot["from_city"] = ner[-1] elif to_city_flag == 1: slot["from_city"] = ner[-1] # 如果指定了槽位 elif key in ["from_city", "to_city"]: slot[key] = ner[-1] # 完成出发城市、到达城市的填槽工作 # -------------------- return slot
def fool_ner(text):
    # recognize organization names with foolnltk; return a list of company names
    words, ners = fool.analysis(text)
    company = []
    # iterate over every per-text entity list, so this works whether one
    # text or several were passed in
    for ner_list in ners:
        for tups in ner_list:
            if tups[2] == 'company':
                temp = re.sub(' ', '', tups[3])
                company.append(temp)
    company = list(set(company))
    return company

def processSentence(sentence):
    try:
        print(fool.cut(sentence))
        print(fool.pos_cut(sentence))
        words, ners = fool.analysis(sentence)
        print(words, ners)
    except Exception:
        pass

def location_parse(question):
    # entity recognition (regions)
    words, ners = fool.analysis(question)
    location_list = []
    for i in ners:
        for j in i:
            if j[2] == 'location' and j[3] in province:
                location_list.append(['domestic', j[3]])
            if j[2] == 'location' and j[3] in foreign_country:
                location_list.append(['foreign', j[3]])
    return location_list

def who_answer(s, flag):
    words, ners = fool.analysis(s)
    ansml = ''
    for ner in ners[0]:
        if flag == 1:
            if 'person' in ner[2]:
                return ner[3]
        else:
            if 'person' in ner[2]:
                ansml = ansml + ner[3]
            if 'org' in ner[2]:
                ansml = ansml + ner[3]
            if 'company' in ner[2]:
                ansml = ansml + ner[3]
    return ansml

def slot_fill(sentence, key=None): # 函数目标:填槽函数(该函数从sentence中寻找需要的内容,完成填槽工作) # input:sentence(用户输入), key(指定槽位,只对该句话提取指定槽位的信息) # output:slot(返回填槽的结果,以json格式返回,key为槽位名,value为值) slot = {} # 进行实体识别 words, ners = fool.analysis(sentence) """ TODO:从sentence中寻找需要的内容,完成填槽工作 """ return slot
def find_full_name(soup):
    # find the company that issued the announcement
    soupcontent = re.sub(r'\n|\s', '', soup.get_text())
    _, nt = fool.analysis(soupcontent[0:110])
    nt = nt[0]
    fullname_ls = [x[3] for x in nt if x[2] == 'company' or x[2] == 'org']
    fullname_ls_len = [len(x) for x in fullname_ls[:min(2, len(fullname_ls))]]
    if len(fullname_ls_len):
        # among the first candidates, pick the longest name
        fullname = fullname_ls[:max(2, len(fullname_ls))][
            fullname_ls_len.index(max(fullname_ls_len))]
        if fullname:
            return fullname
    return '公告里没有公司'

def analyzeInfo(item):
    address = ''
    result = {}
    a_list = item.find_all('a', limit=1)
    h3_list = item.find_all('h3', attrs={'class': 'tit'}, limit=1)
    span_link_list = h3_list[0].find_all('span', limit=1)
    div_list = item.find_all('div', attrs={'class': 'src-tim'}, limit=1)
    span1_list = div_list[0].find_all('span', attrs={'class': 'src'}, limit=1)
    span2_list = div_list[0].find_all('span', attrs={'class': 'tim'}, limit=1)
    dataCount = int(mysqlCommand.getLastId()) + 1
    result['id'] = str(dataCount)
    result['disasterid'] = '0008'  # category: debris flow
    result['link'] = span_link_list[0]['lanmu1']
    result['title'] = a_list[0].get_text().strip()  # news title
    result['releaseTime'] = re.sub(r"\D", "", span2_list[0].get_text().strip())  # publication time
    result['source'] = span1_list[0].get_text().strip().replace('来源:', '')  # news source
    result['originalText'] = get_original(result['link'])  # original article text
    title_str = [result['originalText']]
    words, ners = fool.analysis(title_str)
    for itemSun in ners[0]:
        if itemSun[2] == 'location':
            if itemSun[3] in address:
                break
            else:
                address = address + itemSun[3] + ','
    result['place'] = address  # place of occurrence
    result['longitude'] = '0'  # longitude
    result['latitude'] = '0'  # latitude
    result['strength'] = ''  # disaster intensity
    result['occurTime'] = ''  # time of occurrence
    originalText = result['originalText'] + ',' + result['title']
    death = toYc.death(originalText)
    injured = toYc.Injured(originalText)
    result['injured'] = str(injured)  # number injured
    result['death'] = str(death)  # number dead
    result['loss'] = '0'  # economic loss
    result['pictures'] = ''  # multiple paths separated by semicolons
    result['more'] = ''  # special field
    try:
        # insert the record; skip it if it already exists
        title = 'debrisFlow'
        res = mysqlCommand.insertData(result, title)
        if res:
            dataCount = res
    except Exception as e:
        print("failed to insert the record", str(e))  # report the failed insert

def query(text):
    global datas
    for i in fool.analysis(text)[1]:
        for j in i:
            for res in search(j[3]):
                add_data(j[3], res[0], res[1], g)
    ss = sqlsent02.sqlsent(text)
    for i in ss:
        if '的' in i:
            ss[ss.index(i)] = query(i + '是什么')
    ss[ss.index('L')] = '?x'
    q = "select ?x where { " + vbose(ss[0]) + " ?x ?y}"
    x = g.query(q)
    t = 0
    st = ''
    # keep the candidate most similar to the predicate
    for i in list(x):
        simi = SentAna.vector_similarity(abbr(i[0]), ss[1])
        if simi >= t:
            st = abbr(i[0])
            t = simi
    ss[1] = st
    # logical inference
    for subj, pred, obj in g:  # take the triples out of the RDF graph
        assert_fact("relation", abbr(subj), abbr(pred), abbr(obj))  # add them to the Datalog database
    # define the inference rules
    load("relation(X,'爷爷',Z) <= relation(X,'父亲',Y) & relation(Y,'父亲',Z)")
    load("relation(Y,'孙子',X) <= relation(X,'爷爷',Y)")
    load("relation(Y,'丈夫',X) <= relation(X,'妻子',Y)")
    load("relation(Y,'儿子',X) <= relation(X,'父亲',Y)")
    load("relation(Y,'奶奶',Z) <= relation(Y,'爷爷',X)& relation(X,'配偶',Z)")
    load("relation(X,'儿媳',Z) <= relation(X,'孙子',Y)& relation(Y,'母亲',Z)")
    load("relation(X,'亲属',Y) <= relation(X,'孙子',Y)")
    load("relation(X,'亲属',Y) <= relation(X,'母亲',Y)")
    load("relation(X,'亲属',Y) <= relation(X,'奶奶',Y)")
    load("relation(X,'亲属',Y) <= relation(X,'儿子',Y)")
    load("relation(X,'亲属',Y) <= relation(X,'爷爷',Y)")
    load("relation(X,'亲属',Y) <= relation(X,'父亲',Y)")
    load("relation(X,'亲属',Y) <= relation(Y,'亲属',X)")
    for i in relation(X, R, Y):
        add_data(i[0], i[1], i[2], g)
    q = "select ?x where { " + vbose(ss[0]) + ' ' + vbose(ss[1]) + ' ' + vbose(ss[2]) + "}"
    x = g.query(q)
    datas = [abbr(i[0]) for i in x]
    return abbr(list(x)[0][0])

def handle_article(self, article, create_time, crawling_time, mongo_id):
    article = ''.join(article)
    words, ner = fool.analysis(article)
    # item_msg is one entity tuple: (start position, end position, type, text)
    for item_msg in ner[0]:
        if item_msg[2] in self.type_list:
            type_word = list(item_msg)
            if type_word[2] == 'company':
                type_word[2] = 'org'
            elif type_word[2] == 'location':
                type_word[2] = 'place'
            ner_data = str(type_word[3])
            r = '[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+'
            ner_text = re.sub(r, '', ner_data)
            print(type_word[2] + 'NER' + ner_text + '\t' +
                  'create_time' + create_time + '&' + crawling_time +
                  'ID:' + mongo_id)

def deal(sets):
    rdfs = []
    if 'SBV' in sets.keys() and 'ATT' in sets.keys():
        rdfs.append((sets['ATT'][0], sets['ATT'][1], sets['SBV'][0]))
        print(sets['ATT'][0], sets['ATT'][1], sets['SBV'][0])
    if 'SBV' in sets.keys() and 'VOB' in sets.keys():
        rdfs.append((sets['SBV'][0], sets['VOB'][1], sets['VOB'][0]))
        print(sets['SBV'][0], sets['VOB'][1], sets['VOB'][0])
    if 'ATT' in sets.keys() and 'VOB' in sets.keys():
        if sets['ATT'][1] != sets['VOB'][0]:
            rdfs.append((sets['ATT'][0], sets['ATT'][1], sets['VOB'][0]))
            print(sets['ATT'][0], sets['ATT'][1], sets['VOB'][0])
    if 'SBV' in sets.keys() and 'POB' in sets.keys():
        if fool.analysis(sets['POB'][0])[1][0][0][2] != 'time':
            rdfs.append((sets['SBV'][0], sets['SBV'][1], sets['POB'][0]))
            print(sets['SBV'][0], sets['SBV'][1], sets['POB'][0])
    return rdfs

def slot_fill(sentence, key=None):
    # Goal: slot-filling function (scan sentence for the needed content and fill the slots)
    # input: sentence (user input), key (restrict extraction to this one slot)
    # output: slot (the filled slots as JSON; keys are slot names, values are slot values)
    slot = {}
    # run entity recognition
    words, ners = fool.analysis(sentence)
    to_city_flag = 0
    for ner in ners[0]:
        if ner[2] == 'time':
            date_content = re.findall(
                r'后天|明天|今天|大后天|周末|周一|周二|周三|周四|周五|周六|周日|本周一|本周二|本周三|本周四|本周五|本周六|本周日|下周一|下周二|下周三|下周四|下周五|下周六|下周日|这周一|这周二|这周三|这周四|这周五|这周六|这周日|\d{,2}月\d{,2}号|\d{,2}月\d{,2}日',
                ner[-1])
            slot["date"] = date_content[0] if date_content else ""
            # look for clock-time keywords
            time_content = re.findall(r'\d{,2}点\d{,2}分|\d{,2}点钟|\d{,2}点', ner[-1])
            pmam_content = re.findall(r'上午|下午|早上|晚上|中午|早晨', ner[-1])
            # parenthesize both conditionals so the two parts concatenate
            slot["time"] = (pmam_content[0] if pmam_content else "") + \
                           (time_content[0] if time_content else "")
        if ner[2] == 'location':
            if key is None:
                if re.findall(r'(到|去|回|回去)%s' % (ner[-1]), sentence):
                    to_city_flag = 1
                    slot["to_city"] = ner[-1]
                    continue
                if re.findall(r'从%s|%s出发' % (ner[-1], ner[-1]), sentence):
                    slot["from_city"] = ner[-1]
                elif to_city_flag == 1:
                    slot["from_city"] = ner[-1]
            # if a slot was specified
            elif key in ["from_city", "to_city"]:
                slot[key] = ner[-1]
            # departure and destination city slots are now filled
    return slot

def tages(x):
    q = fool.analysis(x)
    s = [i[3] for i in q[1][0] if i[2] == 'company']
    # strip ordinary and full-width spaces from the entity texts
    for i in range(len(s)):
        s[i] = s[i].replace(' ', '').replace('\u3000', '')
    com = []
    for i in s:
        if re.findall('[\u4e00-\u9fa5][^0-9a-zA-Z_]{0,25}公司', str(i)):
            com.append(i)
    if 2 < len(set(com)) < 8:
        return set(com)

def fun_clean(self, sentence): """ 预处理函数 :输入 用户输入语句: :输出 预处理结果: """ # 使用foolnltk进行实体识别 words, ners = fool.analysis(sentence) # 对识别结果按长度倒序排序 ners = ners[0].sort(key=lambda x: len(x[-1]), reverse=True) # 如果有实体被识别出来,就将实体的字符串替换成实体类别的字符串(目的是看成一类单词,看成一种共同的特征) if ners: for ner in ners: sentence = sentence.replace(ner[-1], ' ' + ner[2] + ' ') # 分词,并去除停用词 word_lst = [w for w in fool.cut(sentence)[0] if w not in stopwords] output_str = ' '.join(word_lst) output_str = re.sub(r'\s+', ' ', output_str) return output_str.strip()
def location_parse(question):
    # entity recognition (regions)
    words, ners = fool.analysis(question)
    location_list = []
    for i in ners:
        for j in i:
            if j[2] == 'location' and j[3] in province:
                location_list.append(['domestic', j[3]])
            if j[2] == 'location' and j[3] in foreign_country:
                location_list.append(['foreign', j[3]])
            if j[2] == 'location' and j[3] in continent:
                location_list.append(['continent', j[3]])
            if j[2] == 'location' and j[3] == '中国':
                location_list.append(['china', j[3]])
    words = jieba.cut(question.encode('utf-8'))
    for word in words:
        if word in foreign_country:
            location_list.append(['foreign', word])
    return location_list

#!/usr/bin/env python
# -*-coding:utf-8-*-
import fool

text = ["我在北京天安门看你难受香菇,一一千四百二十九",
        "我在北京晒太阳你在非洲看雪",
        "千年不变的是什么",
        "我在北京天安门。"]
print("no dict:", fool.cut(text, ignore=True))
fool.load_userdict("./test_dict.txt")
print("use dict: ", fool.cut(text))
fool.delete_userdict()
print("delete dict:", fool.cut(text))

pos_words = fool.pos_cut(text)
print("pos result", pos_words)

words, ners = fool.analysis(text)
print("ners: ", ners)

ners = fool.ner(text)
print("ners:", ners)

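# For reference, the return shapes these snippets rely on (inferred from how
# they index the results, not from library documentation):
#   fool.cut(texts)      -> one word list per input text
#   fool.pos_cut(texts)  -> one (word, pos) list per input text
#   fool.analysis(texts) -> (pos_lists, ner_lists); each entity is a tuple
#                           (start, end, type, text) with types such as
#                           'person', 'location', 'org', 'company', 'time', 'job'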