def bind_sentence_from_csv(self):
    """Bind every sentence from the CSV file into the graph."""
    self.di.clear()
    self.tree_di.clear()
    df = self.gen_csv(self.filename)
    cnt = 0
    for line in df:
        self.add_tree_route(utils.clr(line))
        self.bind_word_with_sentence(utils.clr(line))
        cnt += 1
        if cnt % 1000 == 1:
            print("bound %d sentences so far" % cnt)
def init_num_hash(self):
    """Insert the number sequence of every standard address into num_tree."""
    stand_lines = pd.read_csv(os.path.join(STD_PATH, STD_FILE)).iloc[:, 1]
    for line in stand_lines:
        line = utils.clr(line)
        nums = list(re.findall(RE_NUMS, line))
        self.num_tree.insert_num_lst(nums, hash(line))
def bind_word_with_sentence(self, sentence):
    """Connect a sentence to every one of its words, as graph nodes and
    edges; e.g. for c = 我爱北京 we add the edges (c,我)(c,爱)(c,北京)."""
    sentence = utils.clr(sentence)
    h = hash(sentence)  # hash the sentence to get its node id
    for word in jieba.cut(sentence):
        if word == " ":  # skip whitespace tokens
            continue
        if str(h) in self.clus_node and word in self.clus_node:
            res = self.di.get_edge_data(word, str(h))
            if res is not None:
                # The edge already exists, so increment its weight: a word
                # occurring twice in one sentence ends up with weight 2.
                self.di[word][str(h)]['weight'] = res['weight'] + 1
                continue
        # weight marks edge strength and later helps surface
        # high-information words
        self.di.add_edge(word, str(h), weight=1)
        self.sent[str(h)] = sentence  # keep the raw text for later recovery
        self.clus_node.add(word)
        self.clus_node.add(str(h))
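# A minimal sketch of the binding above with plain networkx and no class
# state (the sentence and node ids are illustrative only):
import networkx as nx
import jieba

def demo_bind(sentence="我爱北京"):
    di = nx.Graph()
    h = str(hash(sentence))
    for word in jieba.cut(sentence):
        if di.has_edge(word, h):
            di[word][h]['weight'] += 1  # repeated word in one sentence
        else:
            di.add_edge(word, h, weight=1)
    return di  # edges: (我, h), (爱, h), (北京, h), each with weight 1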
def query(self):
    df = pd.DataFrame(columns=['map', 'kw', 'target'])
    cnt = 0
    for _, _, docs in os.walk(TEST_PATH):
        for doc in docs:
            lines = open(os.path.join(TEST_PATH, doc)).readlines()
            # sample test_batch random lines from the file
            lines = [lines[np.random.randint(len(lines))] for _ in range(self.test_batch)]
            for line in lines:
                line = utils.clr(line)
                print(line)
                result, res = self._query_one(line)
                if len(result) == 0:
                    df.loc[str(cnt), 'map'] = line
                    df.loc[str(cnt), 'target'] = ""
                    df.loc[str(cnt), 'kw'] = ",".join(res)
                    cnt += 1
                    continue
                for parent_res in result:
                    print(line, parent_res)
                    df.loc[str(cnt), 'map'] = line
                    df.loc[str(cnt), 'target'] = "ROOT" + parent_res
                    df.loc[str(cnt), 'kw'] = ",".join(res)
                    cnt += 1
    df.to_csv("./record.csv")
    print(cnt, 'rows saved')
def read_txt(filename, shuffle):
    lines = codecs.open(filename, "r", "utf-8").readlines()
    for line in lines:
        if shuffle:
            # draw a random line instead of the sequential one
            line = lines[np.random.randint(len(lines))]
        line = line.split("&")[0]
        line = utils.clr(line)
        yield line
def init_num_hash(self):
    """Insert the number sequence of every standard address into num_tree."""
    stand_lines = open(os.path.join(STD_PATH, STD_FILE)).readlines()
    for line in stand_lines:
        line = utils.clr(line)
        nums = list(re.findall(r"\d+", line))
        self.num_tree.insert_num_lst(nums, hash(line))
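# num_tree's implementation is not shown in this section; assuming
# insert_num_lst(nums, h) behaves like a multimap from a number sequence
# to line hashes, a minimal stand-in could look like this:
import re
from collections import defaultdict

class NumIndex:
    def __init__(self):
        self.table = defaultdict(set)

    def insert_num_lst(self, nums, h):
        self.table[tuple(nums)].add(h)

idx = NumIndex()
line = "中山路12号3单元"
idx.insert_num_lst(re.findall(r"\d+", line), hash(line))
# idx.table[("12", "3")] now holds the hash of that line.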
def init_ner_train_data(filename):
    gen = read_txt(filename, shuffle=True)
    f = open(filename, "a+")  # NER lines are appended to the same file
    for sent in gen:
        sent = utils.clr(sent)
        for char in sent:
            f.write("%s O\n" % char)  # one character per line, tagged O
        f.write("\n")  # blank line separates sentences
    f.close()
def init_model(self):
    stand_lines = open(os.path.join(STD_PATH, STD_FILE)).readlines()
    # sample `batch` random standard lines
    stand_lines = [stand_lines[np.random.randint(len(stand_lines))] for _ in range(self.batch)]
    for line in stand_lines:
        # insert each address into addr_tree and each of its words into dict_tree
        line = utils.clr(line)
        words = list(jieba.cut(line))
        self.addr_tree.insert_wd_lst(words)
        for word in words:
            self.dict_tree.insert(word)
def _query_one(self, line):
    """Entry point. `line` is one line of text to match.
    An address can be seen as three parts: a text part, a number part,
    and a unit/dimension part."""
    line = utils.clr(line)
    my_txt = utils.without_num(line)    # text before the first number
    my_num = re.findall(RE_NUMS, line)  # the number part
    # index by the text part and the number part
    result = self.route_text(my_txt, my_num)
    return result
def prehand_one(self, line):
    """Entry point. `line` is one line of text to match.
    An address can be seen as three parts: a text part, a number part,
    and a unit/dimension part."""
    line = utils.clr(line)
    my_txt = utils.without_num(line)    # text before the first number
    my_num = re.findall(RE_NUMS, line)  # the number part
    result = self.pre_route_text(my_txt)
    one_piece = "%s&%s&%s&%s" % (line, my_txt, ",".join(my_num), result)
    return one_piece
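# utils.without_num and RE_NUMS are project helpers not shown in this
# section; a hedged equivalent of the text/number split the docstrings
# describe:
import re

def split_address(line):
    nums = re.findall(r"\d+", line)              # the number part
    txt = re.split(r"\d", line, maxsplit=1)[0]   # text before the first digit
    return txt, nums

# split_address("中山路12号3单元") -> ("中山路", ["12", "3"])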
def route_text(self, line, lst):
    """Key search algorithm: filter the text words through the dict-tree,
    sort them by ascending graph degree, and dump the keyword list."""
    line = utils.clr(str(line))
    res = self.word_filter(line)
    logger.debug("filtered words: " + ",".join(res))
    key_word_dict = {}
    for word in res:
        key_word_dict[word] = self.graph.di.degree()[word]
    # ascending degree puts the rarest (most informative) words first
    sorted_key_word_dict = sorted(key_word_dict.items(), key=lambda d: d[1])
    key_word_lst = [word[0] for word in sorted_key_word_dict]
    with open("key_word_lst.txt", "a+") as g:
        g.write(",".join(key_word_lst) + "\n")
    return set()
def pre_route_text(self, line):
    """Key search algorithm: filter the text words through the dict-tree
    and return them sorted by ascending graph degree."""
    line = utils.clr(str(line))
    res = self.word_filter(line)
    key_word_dict = {}
    logger.debug("filtered words: " + ",".join(res))
    for word in res:
        if word not in self.nodes:
            continue
        key_word_dict[word] = self.degree[word]
    # ascending degree puts the rarest (most informative) words first
    sorted_key_word_dict = sorted(key_word_dict.items(), key=lambda d: d[1])
    key_word_lst = [word[0] for word in sorted_key_word_dict]
    return ",".join(key_word_lst)
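# Why ascending degree: a low-degree word touches few sentences, so it is
# the most discriminative keyword. A toy illustration with networkx:
import networkx as nx

g = nx.Graph()
g.add_edges_from([("路", "s1"), ("路", "s2"), ("路", "s3"), ("中山", "s1")])
degree = dict(g.degree())
print(sorted(["路", "中山"], key=lambda w: degree[w]))  # ['中山', '路']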
def _route_text(self, line, lst):
    """Key search algorithm: filter the text words, sort them by ascending
    graph degree, then narrow the candidate sentences by intersecting the
    neighbor sets of the keywords one by one."""
    line = utils.clr(str(line))
    res = self.word_filter(line)
    if " " in res:
        res.remove(" ")
    key_word_dict = {}
    for word in res:
        key_word_dict[word] = self.graph.di.degree()[word]
    sorted_key_word_dict = sorted(key_word_dict.items(), key=lambda d: d[1])
    key_word_lst = [word[0] for word in sorted_key_word_dict]
    neighbor = []
    for p_wd in key_word_lst:
        # Fetch the sentences containing this word and intersect them with
        # the running candidate set; a word with no neighbors is skipped.
        print(p_wd, time.time())
        tmp_neighbor = utils.get_sent_from_word(self.redis, p_wd)
        if len(tmp_neighbor) == 0:
            continue
        if len(neighbor) == 0:
            neighbor.append(tmp_neighbor)
            continue
        tmp = neighbor[-1] & tmp_neighbor
        if len(neighbor[-1]) == len(tmp):
            print("recall of higher-level keywords unchanged", len(tmp))
            break
        if len(tmp) == 0:
            continue
        neighbor[-1] = tmp
    if len(neighbor) == 0:
        return []  # no neighbors at all
    return list(neighbor[-1])
def handle_text(self, line):
    line = utils.clr(str(line))
    line_pre = utils.before_first_num(line)
    res = self.word_filter(line_pre)
    comm_nbs = []
    for i in range(len(res) - 1):
        print(res)
        try:
            # collect the common neighbors of each adjacent word pair
            comm_nbs.append(
                list(nx.common_neighbors(self.graph.di, res[i], res[i + 1])))
        except Exception:
            print("networkx error")
            continue
    result = self.common_nbs(comm_nbs)
    return result, res
def add_tree_route(self, sentence):
    """Chain consecutive words of a sentence into tree_di as a bigram
    graph, incrementing the edge weight for repeated pairs."""
    sentence = utils.clr(sentence)
    wdlst = []
    for word in jieba.cut(sentence):
        wdlst.append(word)
        if len(wdlst) > 1:
            if wdlst[-2] in self.tree_clus_node and wdlst[-1] in self.tree_clus_node:
                res = self.tree_di.get_edge_data(wdlst[-2], wdlst[-1])
                if res is not None:
                    # the edge already exists: increment its weight
                    self.tree_di[wdlst[-2]][wdlst[-1]]['weight'] = res['weight'] + 1
                    continue
            self.tree_di.add_edge(wdlst[-2], wdlst[-1], weight=1)
            self.tree_clus_node.add(wdlst[-2])
            self.tree_clus_node.add(wdlst[-1])
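# A minimal sketch of the bigram chaining above without class state (the
# word list stands in for jieba's output, which may segment differently):
import networkx as nx

def demo_bigram_chain(words=["中山路", "国际", "大厦"]):
    tree_di = nx.DiGraph()
    for a, b in zip(words, words[1:]):
        if tree_di.has_edge(a, b):
            tree_di[a][b]['weight'] += 1
        else:
            tree_di.add_edge(a, b, weight=1)
    return tree_di  # edges: 中山路->国际, 国际->大厦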
def route_text(self, line, lst):
    """Key search algorithm: filter the text words through the dict-tree,
    sort them by ascending graph degree, then return the first non-empty
    common-neighbor set of consecutive keywords."""
    line = utils.clr(str(line))
    res = self.word_filter(line)
    logger.debug("filtered words: " + ",".join(res))
    key_word_dict = {}
    for word in res:
        key_word_dict[word] = self.graph.di.degree()[word]
    sorted_key_word_dict = sorted(key_word_dict.items(), key=lambda d: d[1])
    key_word_lst = [word[0] for word in sorted_key_word_dict]
    with open("key_word_lst.txt", "a+") as g:
        g.write(",".join(key_word_lst) + "\n")
    words_route = []
    neighbor = []
    logger.debug("sorted words: " + ",".join(key_word_lst))
    for p_wd in key_word_lst:
        neighbor.append(p_wd)
        if len(neighbor) > 1:
            tmp_neighbor = utils.get_common_neighbor(
                self.redis, neighbor[-2], neighbor[-1])
            if len(tmp_neighbor) == 0:
                continue
            # return the first non-empty common-neighbor set
            words_route.append(tmp_neighbor)
            return words_route[-1]
    return words_route[-1] if len(words_route) > 0 else set()
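# utils.get_common_neighbor is redis-backed and not shown in this section;
# over the in-memory graph the same lookup could be sketched as:
import networkx as nx

def common_neighbor(g, a, b):
    if a not in g or b not in g:
        return set()
    # sentence hashes adjacent to both words
    return set(g[a]) & set(g[b])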
def seperate_zhengz_address(filename):
    rt = open("/home/dell/data/zhengz_train.txt", "w+")
    wx = open("/home/dell/data/zhengz_dev.txt", "w+")
    tmp = []
    with open(filename) as f:
        lines = f.readlines()
    for line in lines:
        line = re.sub("[\r\n]", "", line)
        line = re.sub("NONE", "", line)
        line = re.sub(" ", "", line)
        line = utils.clr(line)
        if 'ROOT' in line:
            # query/answer pair on one line, split on ROOT, labeled 0
            qua, ans = line.split('ROOT')
            rt.write("%s %s 0\n" % (qua, ans))
        else:
            # pair up two consecutive plain lines, labeled 1
            tmp.append(line)
            if len(tmp) == 2:
                rt.write("%s %s 1\n" % (tmp[0], tmp[1]))
                tmp = []
    rt.close()
    wx.close()
def handle_text(line):
    """celery -A tasks worker -Q handle_text --concurrency=4 -l info -E -n worker1@%h"""
    line = utils.clr(str(line))
    line_pre = utils.before_first_num(line)
    res = address_activity.word_filter(line_pre)
    comm_nbs = []
    for i in range(len(res) - 1):
        print(res)
        try:
            # collect the common neighbors of each adjacent word pair
            comm_nbs.append(
                list(
                    nx.common_neighbors(address_activity.graph.di, res[i],
                                        res[i + 1])))
        except Exception:
            print("networkx error")
            continue
    result = address_activity.common_nbs(comm_nbs)
    return result, res
def query_one(self, line):
    line = utils.clr(str(line))
    line_pre = utils.before_first_num(line)
    res = self.word_filter(line_pre)
    res.extend(utils.numbers(line))
    comm_nbs = []
    for i in range(len(res) - 1):
        print(res)
        try:
            # collect the common neighbors of each adjacent word pair
            comm_nbs.append(
                list(nx.common_neighbors(self.graph.di, res[i], res[i + 1])))
        except Exception:
            print("networkx error")
            continue
    result = self.common_nbs(comm_nbs)
    _result = []
    for i in result:
        # keep only candidates whose numbers match the query's numbers
        if not self.check_num(self.graph.sent[i], line):
            continue
        _result.append(self.graph.sent[i])
    return _result, res
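# check_num is a project helper not shown here; one plausible reading of
# the number-part matching the docstrings describe is that every number
# in the query must also occur in the candidate:
import re

def check_num(candidate, query):
    cand_nums = set(re.findall(r"\d+", candidate))
    return all(n in cand_nums for n in re.findall(r"\d+", query))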
def route_text(self, line, lst):
    print("filtering useless text ", line, lst)
    line = utils.clr(str(line))
    res = self.word_filter(line)
    print("filtered terms", res)
    words_route = []
    comm_nbs = []
    if len(res) == 1:
        res.extend(res)  # duplicate a lone word so the pair loop runs once
    for i in range(len(res) - 1):
        print(res)
        try:
            p_node = res[i]
            a_node = res[i + 1]
            if len(words_route) == 0:
                words_route.append(p_node)
            try:
                # keep a_node only if it is reachable from the last kept word
                route = nx.shortest_path(self.graph.tree_di, words_route[-1], a_node)
                print('shortest path exists ', route)
                words_route.append(a_node)
                print("add node", i, a_node)
            except Exception:
                print("not directly connected; try the next word, up to the head of the word list")
                continue
        except Exception:
            print("networkx error")
            continue
    print("complex text", res)
    print("filtered output", words_route)
    if " " in words_route:
        words_route.remove(" ")
    if len(words_route) > 0:
        # duplicate the head word so its neighbors are counted twice
        words_route.insert(0, words_route[0])
    for i in range(len(words_route)):
        try:
            comm_nbs.extend(list(nx.all_neighbors(self.graph.di, words_route[i])))
        except Exception:
            print("error while adding neighbors")
    print("all neighbors collected, ready to count")
    print("items in the list:", len(comm_nbs))
    cnt_lst = collections.Counter(comm_nbs)
    sorted_lst = sorted(cnt_lst.items(), key=lambda d: d[1], reverse=True)
    if len(sorted_lst) == 0:
        return [], words_route
    max_value = sorted_lst[0][1]
    # keep the sentences with the highest neighbor count
    result = filter(lambda x: utils.is_max(x, max_value), sorted_lst)
    result = [i[0] for i in result]
    print("number of candidate sentences", len(result))
    print("sentence with the most common neighbors", self.graph.sent[result[0]])
    print("sentence with the fewest common neighbors", self.graph.sent[result[-1]])
    print("final keywords", words_route)
    return result, words_route
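# The neighbor vote above boils down to: count how often each sentence
# hash appears as a neighbor of the kept keywords, then keep the hashes
# with the top count. A tiny illustration:
import collections

comm_nbs = ["h1", "h2", "h1", "h3", "h1", "h2"]
cnt = collections.Counter(comm_nbs)
top = cnt.most_common(1)[0][1]                   # highest vote count
print([h for h, c in cnt.items() if c == top])   # ['h1']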
def gen_csv(self, filename):
    df = pd.read_csv(filename)
    for i in df.iloc[:, 1]:
        yield utils.clr(str(i).strip())
def gen_txt(self, filename):
    with open(filename, 'r') as f:
        lines = f.readlines()
    for i in lines:
        yield utils.clr(str(i).strip())