class CorpusGraph:
    """Directed, weighted graph of corpus transitions backed by networkx.

    Edges carry a ``weight`` attribute. The graph can be built from the
    project's ``CorpusIO`` storage, round-tripped through JSON, reversed in
    place, and queried for the heaviest neighbours of a node.
    """

    def __init__(self):
        self.corpus = nx.DiGraph()
        # Holds the "other orientation" of the graph once reverse() has been
        # called; None means no reversed copy has been computed yet.
        self.reversed_corpus_cache = None
        self.corpus_io = CorpusIO()

    def build_corpus(self):
        """Populate the graph from ``CorpusIO.read_from_mongo`` (requires MongoDB).

        Each item produced by the reader is indexed as
        ``(source, target, weight)``.
        """
        edges_gen = self.corpus_io.read_from_mongo(limit=None)
        for edge in edges_gen:
            self.corpus.add_edge(edge[0], edge[1], weight=edge[2])
        # The graph changed, so any previously computed reversed copy is stale.
        self.reversed_corpus_cache = None

    def to_json(self):
        """Return the graph as a nested dict (``nx.to_dict_of_dicts``)."""
        return nx.to_dict_of_dicts(self.corpus)

    def save_as_json(self, path='./data/corpus.json'):
        """Write the graph to ``path`` as JSON via ``CorpusIO.save_as_json``."""
        json_obj = self.to_json()
        self.corpus_io.save_as_json(json_obj, path)

    def load_from_json(self, path='./data/corpus_in_use.json'):
        """Load the graph from the JSON file at ``path``.

        The data is read with ``CorpusIO.load_as_json`` and handed to
        ``nx.from_dict_of_dicts`` with the current graph as ``create_using``.
        """
        print("loading corpus json file: " + str(path))
        json_obj = self.corpus_io.load_as_json(path)
        print("loaded")
        self.corpus = nx.from_dict_of_dicts(json_obj, create_using=self.corpus)
        # The graph content was replaced, so drop any stale reversed copy.
        self.reversed_corpus_cache = None

    def get_edge_weight(self, start, end):
        """Return the weight of edge ``start -> end``, or 0 if it is missing."""
        try:
            return self.corpus[start][end]['weight']
        except KeyError:
            return 0

    def reverse(self):
        """Swap the active graph with its reversed counterpart.

        The reversed graph is computed on the first call and kept in
        ``reversed_corpus_cache``; later calls only swap the two references,
        so calling this twice restores the original orientation.
        """
        if self.reversed_corpus_cache is None:
            self.reversed_corpus_cache = self.corpus.reverse()
        self.corpus, self.reversed_corpus_cache = (
            self.reversed_corpus_cache,
            self.corpus,
        )

    def get_sorted_neighbour(self, key, exclude=None, K=6):
        """Return the heaviest successors of ``key`` plus a remainder summary.

        Args:
            key: Node whose outgoing neighbours are ranked.
            exclude: Neighbour to leave out of both the list and the summary.
            K: Size budget; at most ``K - 1`` neighbours are listed.

        Returns:
            ``[]`` when ``key`` is not in the graph. Otherwise a list of up to
            ``K - 1`` ``(neighbour, weight)`` tuples in descending weight
            order, followed by one ``("+N", total_weight)`` tuple describing
            the ``N`` neighbours that were not listed (``("+0", 0)`` when
            there are none).
        """
        adjacency = self.corpus.adj
        if key not in adjacency:
            return []

        # TODO: only the top K-1 entries are needed; a partial selection
        # (e.g. a heap) would avoid sorting the full neighbour list.
        ranked = sorted(
            adjacency[key].items(),
            key=lambda item: item[1]['weight'],
            reverse=True,
        )
        # Drop the excluded neighbour up front so the listed entries and the
        # remainder are disjoint and nothing is counted twice.
        candidates = [
            (node, attrs['weight']) for node, attrs in ranked if node != exclude
        ]

        top_n = max(K - 1, 0)  # guard against negative slice bounds
        rs = candidates[:top_n]
        rest = candidates[top_n:]
        rs.append(("+" + str(len(rest)), sum(weight for _, weight in rest)))
        return rs
class CorpusGraph:
    """Directed, weighted graph of corpus transitions backed by networkx.

    Edges carry a ``weight`` attribute. The graph can be built from the
    project's ``CorpusIO`` storage, drawn, round-tripped through JSON, and
    queried for the heaviest neighbours of a node.
    """

    def __init__(self):
        self.corpus = nx.DiGraph()
        self.corpus_io = CorpusIO()

    def build_corpus(self):
        """Populate the graph from ``CorpusIO.read_from_mongo``.

        Each item produced by the reader is indexed as
        ``(source, target, weight)``.
        """
        edges_gen = self.corpus_io.read_from_mongo(limit=None)
        for edge in edges_gen:
            self.corpus.add_edge(edge[0], edge[1], weight=edge[2])

    def draw(self):
        """Render the graph with ``nx.draw_networkx`` and show the figure."""
        nx.draw_networkx(self.corpus, font_family='SimHei', node_color='white')
        plt.show()

    def to_json(self):
        """Return the graph as a nested dict (``nx.to_dict_of_dicts``)."""
        return nx.to_dict_of_dicts(self.corpus)

    def save_as_json(self, path='./data/corpus.json'):
        """Write the graph to ``path`` as JSON via ``CorpusIO.save_as_json``."""
        json_obj = self.to_json()
        self.corpus_io.save_as_json(json_obj, path)

    def load_from_json(self, path='./data/corpus.json'):
        """Load the graph from the JSON file at ``path``.

        The data is read with ``CorpusIO.load_as_json`` and handed to
        ``nx.from_dict_of_dicts`` with the current graph as ``create_using``.
        """
        json_obj = self.corpus_io.load_as_json(path)
        self.corpus = nx.from_dict_of_dicts(json_obj, create_using=self.corpus)

    def get_edge_weight(self, start, end):
        """Return the weight of edge ``start -> end``, or 0 if it is missing."""
        try:
            return self.corpus[start][end]['weight']
        except KeyError:
            return 0

    def get_sorted_neighbour(self, key, exclude=None, K=6):
        """Return the heaviest successors of ``key`` plus a remainder summary.

        Args:
            key: Node whose outgoing neighbours are ranked.
            exclude: Neighbour to leave out of both the list and the summary.
            K: Size budget; at most ``K - 1`` neighbours are listed.

        Returns:
            ``[]`` when ``key`` is not in the graph. Otherwise a list of up to
            ``K - 1`` ``(neighbour, weight)`` tuples in descending weight
            order, followed by one ``("+N", total_weight)`` tuple describing
            the ``N`` neighbours that were not listed (``("+0", 0)`` when
            there are none).
        """
        adjacency = self.corpus.adj
        if key not in adjacency:
            return []

        # TODO: only the top K-1 entries are needed; a partial selection
        # (e.g. a heap) would avoid sorting the full neighbour list.
        ranked = sorted(
            adjacency[key].items(),
            key=lambda item: item[1]['weight'],
            reverse=True,
        )
        # Drop the excluded neighbour up front so the listed entries and the
        # remainder are disjoint and nothing is counted twice.
        candidates = [
            (node, attrs['weight']) for node, attrs in ranked if node != exclude
        ]

        # Slicing (rather than indexing K-1 times) is safe when the node has
        # fewer than K-1 neighbours; max() guards against negative bounds.
        top_n = max(K - 1, 0)
        rs = candidates[:top_n]
        rest = candidates[top_n:]
        rs.append(("+" + str(len(rest)), sum(weight for _, weight in rest)))
        return rs