コード例 #1
0
ファイル: my_graph.py プロジェクト: scmsqhn/code
 def bind_sentence_from_csv(self):
     """Rebuild both graphs from the CSV named by ``self.filename``.

     Clears the word/sentence graph and the word-adjacency graph, then
     routes and binds every cleaned line, printing progress every 1000
     sentences (first print at sentence 1).
     """
     self.di.clear()
     self.tree_di.clear()
     for count, raw in enumerate(self.gen_csv(self.filename), start=1):
         self.add_tree_route(utils.clr(raw))
         self.bind_word_with_sentence(utils.clr(raw))
         if count % 1000 == 1:
             print("we have bind ", count, " sentence")
コード例 #2
0
 def init_num_hash(self):
     """Scan the standard address list and extract digit runs per line.

     Reads column 1 of the standard CSV, normalizes each line with
     ``utils.clr`` and collects all ``RE_NUMS`` matches.

     NOTE(review): ``nums`` is computed but never used here — a sibling
     variant inserts it into a number tree, so this snippet looks
     truncated; confirm against the full source before relying on it.
     """
     stand_lines = pd.read_csv(os.path.join(STD_PATH, STD_FILE)).iloc[:, 1]
     for line in stand_lines:
         """ insert address into addr_tree """
         """ insert all addr_tree """
         # Normalize the raw line, then pull out every digit substring.
         line = utils.clr(line)
         nums = list(re.findall(RE_NUMS, line))
コード例 #3
0
 def bind_word_with_sentence(self, sentence):
     """Connect a sentence node to every one of its words in the graph.

     The cleaned sentence is hashed to form its node id; each jieba token
     gets an edge to that node whose 'weight' counts repeated
     co-occurrences (the same word appearing twice in a sentence gives
     weight 2). Also records the sentence text in ``self.sent`` for later
     recovery and tracks seen nodes in ``self.clus_node``.
     """
     sentence = utils.clr(sentence)
     sent_key = str(hash(sentence))  # hash of the sentence is the node id
     for word in jieba.cut(sentence):
         # BUG FIX: the original collected tokens into a dead list and
         # removed " " from it, but still added the space token itself to
         # the graph; skip whitespace tokens outright.
         if word == " ":
             continue
         if sent_key in self.clus_node and word in self.clus_node:
             res = self.di.get_edge_data(word, sent_key)
             if res is not None:
                 # Edge already exists: bump its weight instead of re-adding.
                 self.di[word][sent_key]['weight'] = res['weight'] + 1
                 continue
         self.di.add_edge(word, sent_key)
         self.di[word][sent_key]['weight'] = 1
         self.sent[sent_key] = sentence  # keep original text for lookup
         self.clus_node.add(word)
         self.clus_node.add(sent_key)
コード例 #4
0
 def query(self):
     """Run a random batch of test queries from TEST_PATH and record results.

     For each document under TEST_PATH, samples ``self.test_batch`` lines
     (with replacement), queries each with ``_query_one``, and accumulates
     one row per match (or one empty-target row on no match) into a
     DataFrame that is saved to ./record.csv.

     NOTE(review): the CSV is rewritten after every input line —
     presumably for crash resilience; confirm before hoisting it out of
     the loop. ``input_file`` is assigned but never used.
     """
     df = pd.DataFrame()
     df['map'] = ""
     df['kw'] = ""
     df['target'] = ""
     input_file = []
     cnt=0
     # os.walk yields (dirpath, dirnames, filenames); only filenames are used.
     for _,_,docs in os.walk(TEST_PATH):
         for doc in docs:
             lines = open(os.path.join(TEST_PATH, doc)).readlines()
             #lines = pd.read_csv(os.path.join(TEST_PATH, doc)).iloc[:,1]
             # Random sample with replacement, size self.test_batch.
             lines = [lines[np.random.randint(len(lines))] for i in range(self.test_batch)]
             for line in lines:
                 line = utils.clr(line)
                 print(line)
                 result,res = self._query_one(line)
                 #result = self.addr_tree.words_route(res)
                 if len(result) == 0:
                     # No match: record the query with an empty target.
                     df.loc[str(cnt),'map'] = line
                     df.loc[str(cnt),'target'] = "".join([])
                     df.loc[str(cnt),'kw'] = ",".join(res)
                     cnt+=1
                     continue
                 else:
                     # One output row per matched parent result.
                     for parent_res in result:
                         print(line, parent_res)
                         df.loc[str(cnt),'map'] = line
                         df.loc[str(cnt),'target'] = "ROOT"+parent_res
                         df.loc[str(cnt),'kw'] = ",".join(res)
                         cnt+=1
                 df.to_csv("./record.csv")
                 print(cnt, 'save')
コード例 #5
0
def read_txt(filename,shuffle):
    """Yield cleaned lines from *filename* (UTF-8).

    Only the text before the first '&' of each line is kept, then passed
    through ``utils.clr``. If *shuffle* is truthy, each yielded line is
    drawn uniformly at random (with replacement) from the file; the
    number of lines yielded is unchanged.
    """
    # BUG FIX: the original never closed the handle; 'with' guarantees it.
    with codecs.open(filename, "r", "utf-8") as f:
        lines = f.readlines()
    for line in lines:
        if shuffle:
            line = lines[np.random.randint(len(lines))]
        line = line.split("&")[0]
        yield utils.clr(line)
コード例 #6
0
ファイル: address_activity.py プロジェクト: scmsqhn/code
 def init_num_hash(self):
     """Index every standard address line by its digit runs.

     Reads the standard file, cleans each line, extracts all digit
     substrings and inserts them into ``self.num_tree`` keyed by the
     line's hash.
     """
     # BUG FIX: use 'with' so the handle is closed (the original leaked
     # it), and a raw string for the regex ("\d" is an invalid escape in
     # a plain string literal on modern Python).
     with open(os.path.join(STD_PATH, STD_FILE)) as f:
         stand_lines = f.readlines()
     for line in stand_lines:
         line = utils.clr(line)
         nums = list(re.findall(r"\d+", line))
         self.num_tree.insert_num_lst(nums, hash(line))
コード例 #7
0
def init_ner_train_data(filename):
    """Append NER training rows ("<char> O") for each sentence in *filename*.

    Sentences come from ``read_txt(filename, shuffle=True)``; every
    character is written on its own line tagged 'O', with a blank line
    separating sentences.
    """
    gen = read_txt(filename, shuffle=True)
    # BUG FIX: 'with' closes the handle even if iteration raises (the
    # original's f.close() could be skipped on error).
    # NOTE(review): this appends to the very file it reads; read_txt
    # materializes all lines up front, so the read is not corrupted.
    with open(filename, "a+") as f:
        for sent in gen:
            sent = utils.clr(sent)
            for char in sent:
                f.write("%s O\n" % char)
            f.write("\n")
コード例 #8
0
 def init_model(self):
     """Build the address trie and dictionary trie from a random sample.

     Draws ``self.batch`` lines (with replacement) from the standard
     file, cleans and segments each with jieba, inserts the word list
     into ``self.addr_tree`` and every individual word into
     ``self.dict_tree``.
     """
     # BUG FIX: 'with' closes the handle (the original leaked it).
     with open(os.path.join(STD_PATH, STD_FILE)) as f:
         stand_lines = f.readlines()
     # Random sample with replacement, size self.batch.
     stand_lines = [stand_lines[np.random.randint(len(stand_lines))]
                    for _ in range(self.batch)]
     for line in stand_lines:
         line = utils.clr(line)
         words = list(jieba.cut(line))
         self.addr_tree.insert_wd_lst(words)
         for word in words:
             self.dict_tree.insert(word)
コード例 #9
0
 def _query_one(self, line):
     """Entry point: match one line of address text against the index.

     An address is treated as three parts — the text part, the number
     part and the unit part. The cleaned line is split into the text
     before the first number and all number substrings, then routed.
     Returns whatever ``route_text`` yields.
     """
     # Unused locals (output, res, score, min_edit_value) removed.
     line = utils.clr(line)
     my_txt = utils.without_num(line)  # text before the first number
     my_num = re.findall(RE_NUMS, line)  # all number substrings
     # Index on the text and number parts together.
     return self.route_text(my_txt, my_num)
コード例 #10
0
ファイル: address_network_init.py プロジェクト: scmsqhn/code
    def _query_one(self, line):
        """Entry point: match one line of address text against the index.

        An address is treated as three parts — the text part, the number
        part and the unit part. The cleaned line is split into the text
        before the first number and all number substrings, then routed.
        Returns whatever ``route_text`` yields.
        """
        # Unused locals (output, res, score) removed.
        line = utils.clr(line)
        my_txt = utils.without_num(line)  # text before the first number
        my_num = re.findall(RE_NUMS, line)  # all number substrings
        return self.route_text(my_txt, my_num)
コード例 #11
0
 def prehand_one(self, line):
     """Entry point: pre-process one address line into a '&'-joined record.

     Splits the cleaned line into its text part (before the first number)
     and its number substrings, routes the text part, and returns
     ``"line&text&nums&result"``.
     """
     # Unused locals (output, res, score, min_edit_value) removed.
     line = utils.clr(line)
     my_txt = utils.without_num(line)  # text before the first number
     my_num = re.findall(RE_NUMS, line)  # all number substrings
     result = self.pre_route_text(my_txt)  # route on the text part only
     return "%s&%s&%s&%s" % (line, my_txt, ",".join(my_num), result)
コード例 #12
0
ファイル: address_network_init.py プロジェクト: scmsqhn/code
 def route_text(self,line,lst):
     """Key search step: filter the line into dictionary words, rank them
     by ascending degree in the word/sentence graph, append the ranking to
     key_word_lst.txt, and return an empty set (the route itself is not
     computed in this variant).
     """
     line = utils.clr(str(line))
     # Keep only words known to the dict-tree based filter.
     res = self.word_filter(line)
     logger.debug("过滤后词组" + ",".join(res))
     # Rank keywords by how connected they are in the graph.
     degree_by_word = {word: self.graph.di.degree()[word] for word in res}
     ranked = sorted(degree_by_word.items(), key=lambda pair: pair[1])
     key_word_lst = [pair[0] for pair in ranked]
     with open("key_word_lst.txt","a+") as g:
         g.write(",".join(key_word_lst)+"\n")
     return set()
コード例 #13
0
ファイル: address_network_init.py プロジェクト: scmsqhn/code
 def pre_route_text(self,line):
     """Filter *line* into known keywords and return them comma-joined,
     sorted by ascending node degree.

     Words absent from ``self.nodes`` are skipped.
     """
     line = utils.clr(str(line))
     # Keep only words known to the dict-tree based filter.
     res = self.word_filter(line)
     key_word_dict = {}
     logger.debug("过滤后词组" + ",".join(res))
     for word in res:
         if word not in self.nodes:
             continue
         # BUG FIX: this assignment was mis-indented (an inconsistent
         # dedent that is a Python IndentationError); it belongs in the
         # loop body, after the membership guard.
         key_word_dict[word] = self.degree[word]
     sorted_key_word_dict = sorted(key_word_dict.items(),key=lambda d:d[1],reverse=False)
     key_word_lst = [word[0] for word in sorted_key_word_dict]
     key_word_lst_sorted = ",".join(key_word_lst)
     return key_word_lst_sorted
コード例 #14
0
 def _route_text(self, line, lst):
     """Intersect the sentence-neighbor sets of the filtered keywords.

     Keywords from ``word_filter`` are ranked by ascending graph degree;
     each word's sentence set is fetched from redis and intersected with
     the running set. Returns the surviving candidates as a list, or []
     when nothing was found.

     NOTE(review): both the ``len(neighbor[-1]) == len(tmp)`` and the
     ``len(tmp) > 0`` branches break with the same "no change" message,
     which makes the narrowing assignment ``neighbor[-1] = tmp``
     unreachable — this looks unintentional; confirm before reuse.
     ``words_route`` and the ``lst`` parameter are never used.
     """
     line = utils.clr(str(line))
     """filter left the text word"""
     """how to filter use the dict-tree"""
     res = self.word_filter(line)
     words_route = []
     if " " in res:
         res.remove(" ")
     key_word_dict = {}
     for word in res:
         #pdb.set_trace()
         # Degree in the word/sentence graph measures how common a word is.
         key_word_dict[word] = self.graph.di.degree()[word]
     # Ascending degree: rarest (most informative) words come first.
     sorted_key_word_dict = sorted(key_word_dict.items(),
                                   key=lambda d: d[1],
                                   reverse=False)
     key_word_lst = [word[0] for word in sorted_key_word_dict]
     neighbor = []
     for cursor in range(len(key_word_lst)):
         p_wd = key_word_lst[cursor]
         """get the common neighbors one by one when there is a word has no neighbors, continue"""
         """if there is a set of common_neighbor, & the set with last one"""
         print(p_wd, time.time())
         # Sentences containing this word, looked up in redis.
         tmp_neighbor = utils.get_sent_from_word(self.redis, p_wd)
         if len(neighbor) == 0:
             neighbor.append(tmp_neighbor)
         if len(tmp_neighbor) > 0:
             if len(neighbor) > 0:
                 # Intersect with the running candidate set.
                 tmp = neighbor[-1] & tmp_neighbor
                 if len(neighbor[-1]) == len(tmp):
                     print("查询到高级词召回数量没有变化", len(tmp))
                     break
                 if len(tmp) > 0:
                     print("查询到高级词召回数量没有变化", len(tmp))
                     break
                 if len(tmp) == 0:
                     continue
                 else:
                     neighbor[-1] = tmp
         else:
             continue
     if len(neighbor) == 0:
         """there is no neighor here"""
         return []
     else:
         return list(neighbor[-1])
コード例 #15
0
 def handle_text(self,line):
     """Collect common-neighbor candidates for consecutive filtered words.

     Cleans *line*, keeps the text before the first number, filters it to
     known words, then gathers ``nx.common_neighbors`` for each adjacent
     word pair and folds the lists with ``self.common_nbs``.
     Returns ``(result, filtered_words)``.
     """
     line = utils.clr(str(line))
     line_pre = utils.before_first_num(line)
     res = self.word_filter(line_pre)
     comm_nbs = []
     for i in range(len(res)-2):
         print(res)
         try:
             # BUG FIX: the original appended the same neighbor list a
             # second time after the try block, double-counting each pair.
             comm_nbs.append(list(nx.common_neighbors(self.graph.di,res[i],res[i+1])))
         except Exception:
             # networkx raises when either node is absent from the graph.
             print("networkx error")
             continue
     result = self.common_nbs(comm_nbs)
     return result,res
コード例 #16
0
 def add_tree_route(self, sentence):
     """Add the word-adjacency chain of *sentence* to the tree graph.

     Each consecutive pair of jieba tokens gets an edge in
     ``self.tree_di`` whose 'weight' counts how many times the pair has
     been seen; both endpoints are tracked in ``self.tree_clus_node``.
     """
     sentence = utils.clr(sentence)
     prev = None  # previous token; only a 2-token window is needed
     for word in jieba.cut(sentence):
         if prev is not None:
             if prev in self.tree_clus_node and word in self.tree_clus_node:
                 res = self.tree_di.get_edge_data(prev, word)
                 if res is not None:
                     # Existing edge: bump the co-occurrence weight.
                     self.tree_di[prev][word]['weight'] = res['weight'] + 1
                     prev = word
                     continue
             self.tree_di.add_edge(prev, word)
             self.tree_di[prev][word]["weight"] = 1
             self.tree_clus_node.add(prev)
             self.tree_clus_node.add(word)
         prev = word
コード例 #17
0
 def route_text(self, line, lst):
     """Key search step: rank filtered keywords by ascending graph degree,
     append the ranking to key_word_lst.txt, and return an empty set.

     NOTE(review): everything after the original ``return set()`` (a
     neighbor-walk draft) was unreachable dead code and has been removed;
     confirm the early return is intentional before relying on this.
     """
     line = utils.clr(str(line))
     # Keep only words known to the dict-tree based filter.
     res = self.word_filter(line)
     key_word_dict = {}
     logger.debug("过滤后词组" + ",".join(res))
     for word in res:
         key_word_dict[word] = self.graph.di.degree()[word]
     # Ascending degree: rarest (most informative) words first.
     sorted_key_word_dict = sorted(key_word_dict.items(),
                                   key=lambda d: d[1],
                                   reverse=False)
     key_word_lst = [word[0] for word in sorted_key_word_dict]
     with open("key_word_lst.txt", "a+") as g:
         g.write(",".join(key_word_lst) + "\n")
     return set()
コード例 #18
0
def seperate_zhengz_address(filename):
    """Split labeled address lines into train/dev text files.

    Lines containing 'ROOT' are written to the train file as
    "<query> <answer> 0"; other lines are buffered in pairs and written
    as "<a> <b> 1".

    NOTE(review): when the buffer already holds two lines, the current
    line is discarded rather than starting a new pair — confirm this is
    intended. The dev file is opened (and truncated) but never written.
    """
    tmp = []
    # BUG FIX: context managers close all three handles even on error
    # (the original leaked them if an exception was raised mid-loop).
    with open("/home/dell/data/zhengz_train.txt", "w+") as rt, \
         open("/home/dell/data/zhengz_dev.txt", "w+") as wx, \
         open(filename) as f:
        for line in f.readlines():
            # Strip line endings, the NONE placeholder and spaces.
            line = re.sub(r"[\r\n]", "", line)
            line = re.sub("NONE", "", line)
            line = re.sub(" ", "", line)
            line = utils.clr(line)
            if 'ROOT' in line:
                qua, ans = line.split('ROOT')
                rt.write("%s %s 0\n" % (qua, ans))
            else:
                if len(tmp) == 2:
                    rt.write("%s %s 1\n" % (tmp[0], tmp[1]))
                    tmp = []
                else:
                    tmp.append(line)
コード例 #19
0
def handle_text(line):
    """Celery task: collect common-neighbor candidates for one line.

    celery -A tasks worker -Q handle_text --concurrency=4 -l info -E -n worker1@%h

    Cleans *line*, keeps the text before the first number, filters it to
    known words, gathers ``nx.common_neighbors`` for each adjacent word
    pair and folds them with ``address_activity.common_nbs``.
    Returns ``(result, filtered_words)``.
    """
    line = utils.clr(str(line))
    line_pre = utils.before_first_num(line)
    res = address_activity.word_filter(line_pre)
    comm_nbs = []
    for i in range(len(res) - 2):
        print(res)
        try:
            # BUG FIX: the original appended the same neighbor list a
            # second time after the try block, double-counting each pair.
            comm_nbs.append(
                list(
                    nx.common_neighbors(address_activity.graph.di, res[i],
                                        res[i + 1])))
        except Exception:
            # networkx raises when either node is absent from the graph.
            print("networkx error")
            continue
    result = address_activity.common_nbs(comm_nbs)
    return result, res
コード例 #20
0
 def query_one(self, line):
     """Query the graph for standard sentences matching *line*.

     Filters the pre-number text into known words, appends the line's
     numbers, gathers common neighbors for each adjacent item pair, then
     keeps only candidate sentences whose number parts check out against
     the query. Returns ``(matched_sentences, keyword_list)``.
     """
     line = utils.clr(str(line))
     line_pre = utils.before_first_num(line)
     res = self.word_filter(line_pre)
     res.extend(utils.numbers(line))
     comm_nbs = []
     for i in range(len(res)-1):
         print(res)
         try:
             # BUG FIX: the original appended the same neighbor list a
             # second time after the try block, double-counting each pair.
             comm_nbs.append(list(nx.common_neighbors(self.graph.di,res[i],res[i+1])))
         except Exception:
             # networkx raises when either node is absent from the graph.
             print("networkx error")
             continue
     result = self.common_nbs(comm_nbs)
     _result = []
     for i in result:
         # Drop candidates whose number parts don't match the query.
         if not self.check_num(self.graph.sent[i],line):
             continue
         _result.append(self.graph.sent[i])
     return _result,res
コード例 #21
0
ファイル: address_activity.py プロジェクト: scmsqhn/code
 def route_text(self, line, lst):
     """Route *line* through the word-adjacency tree and rank candidate
     sentences by shared neighbors.

     Filters the cleaned line to known words, greedily keeps words that
     are reachable (shortest path) from the previous kept word in
     ``self.graph.tree_di``, then collects all graph neighbors of the
     kept words and returns the sentences with the maximal neighbor
     count: ``(result_sentences, words_route)``.

     NOTE(review): the bare ``except`` clauses swallow all errors, and
     ``lst`` is printed but otherwise unused — confirm before reuse.
     """
     print("过滤掉无用文本 ", line, lst)
     line = utils.clr(str(line))
     #line_pre = utils.before_first_num(line)
     res = self.word_filter(line)
     print("经过过滤的词条", res)
     #res.extend(lst)
     words_route = []
     comm_nbs = []
     # A single word cannot form a pair; duplicate it so the loop runs.
     if len(res) == 1:
         res.extend(res)
     for i in range(len(res) - 1):
         print(res)
         try:
             #conn = nx.all_shortest_paths(self.graph.tree_di,res[i],res[i+1])
             p_node = res[i]
             a_node = res[i + 1]
             if len(words_route) == 0:
                 words_route.append(p_node)
             try:
                 # Keep a_node only if it is reachable from the last kept word.
                 route = nx.shortest_path(self.graph.tree_di,
                                          words_route[-1], a_node)
                 print('是否存在最短路径 ', route)
                 words_route.append(a_node)
                 print("add node", i, a_node)
                 #weight = self.graph.tree_di[words_route[-1]][a_node]['weight']
                 #weight = self.graph.tree_di[words_route[-1]][a_node]['weight']
             except:
                 # No path: drop this word and try the next pair.
                 print(
                     "not connect direct, continue, find the next one, utile to the head of words lst"
                 )
                 print("过滤复杂文本的词条")
                 #words_route = words_route[:-1]
                 #words_route.append(a_node)
                 continue
         except:
             print("networkx error")
             continue
     #words_route = words_route[::-1]
     print("复杂文本", res)
     print("过滤输出", words_route)
     if " " in words_route:
         words_route.remove(" ")
     # Duplicate the head so it is counted like the interior words below.
     if len(words_route) > 0:
         words_route.insert(0, words_route[0])
     for i in range(len(words_route)):
         try:
             # Gather every neighbor (sentence node) of each kept word.
             comm_nbs.extend(
                 list(nx.all_neighbors(self.graph.di, words_route[i])))
         except:
             print("添加邻居出错")
     print("所有的邻居都添加到列表中,等待计算")
     print("列表中共有多少个item", len(comm_nbs))
     # Rank candidate sentences by how many kept words they neighbor.
     cnt_lst = collections.Counter(comm_nbs)
     sorted_lst = sorted(cnt_lst.items(), key=lambda d: d[1], reverse=True)
     if not len(sorted_lst) > 0:
         return [], words_route
     max_value = sorted_lst[0][1]
     #result = self.common_nbs(comm_nbs)
     #result = self.common_nbs(comm_nbs)
     # Keep only candidates tied for the maximal neighbor count.
     result = filter(lambda x: utils.is_max(x, max_value), sorted_lst)
     result = [i[0] for i in result]
     print("一共有多少个句子", len(result))
     print("公共邻居最多的句子", self.graph.sent[result[0]])
     print("公共邻居最少的句子", self.graph.sent[result[-1]])
     print("最终关键词", words_route)
     return result, words_route
コード例 #22
0
 def gen_csv(self, filename):
     """Yield each value of the CSV's second column, stripped and cleaned."""
     frame = pd.read_csv(filename)
     for value in frame.iloc[:, 1]:
         yield utils.clr(str(value).strip())
コード例 #23
0
 def gen_txt(self, filename):
     """Yield each line of a plain-text file, stripped and cleaned.

     BUG FIX: the original never closed the file handle; 'with' closes
     it before the generator starts yielding.
     """
     with open(filename, 'r') as f:
         lines = f.readlines()
     for raw in lines:
         yield utils.clr(str(raw).strip())