def homogeneous_graph_random_walks_for_large_bipartite_graph(self, datafile, percentage, maxT, minT):
    # Store the loaded graph on self, since biadjacency_matrix reads self.G.
    self.G = graph.load_edgelist(datafile, undirected=True)
    # np.float was removed in NumPy 1.24; use np.float64 explicitly.
    A, row_index, item_index = bi.biadjacency_matrix(self.G, self.node_u, self.node_v,
                                                     dtype=np.float64, weight='weight', format='csr')
    # Invert the index maps so matrix positions map back to node ids.
    index_row = dict(zip(row_index.values(), row_index.keys()))
    index_item = dict(zip(item_index.values(), item_index.keys()))
    AT = A.transpose()
    # Project the bipartite graph onto each side: A*A^T links u-nodes, A^T*A links v-nodes.
    matrix_u = self.get_homogenous_graph(A.dot(AT), self.fw_u, index_row, index_row)
    matrix_v = self.get_homogenous_graph(AT.dot(A), self.fw_v, index_item, index_item)
    self.G_u, self.walks_u = self.get_random_walks_restart_for_large_bipartite_graph(
        matrix_u, self.authority_u, percentage=percentage, maxT=maxT, minT=minT)
    self.G_v, self.walks_v = self.get_random_walks_restart_for_large_bipartite_graph(
        matrix_v, self.authority_v, percentage=percentage, maxT=maxT, minT=minT)
def read_graph():
    '''
    Reads the input network.
    '''
    print(" - Loading graph...")
    G = graph.load_edgelist(FLAGS.input, undirected=True)
    print(" - Graph loaded.")
    return G
def read_sentences_and_homogeneous_graph(self, filesentences=None, datafile=None):
    G = graph.load_edgelist(datafile, undirected=True)
    walks = []
    with open(filesentences, "r") as fin:
        for line in fin:
            walk = line.strip().split(" ")
            walks.append(walk)
    return G, walks
def read_graph():
    '''
    Reads the input network.
    '''
    logging.info(" - Loading graph...")
    G = graph.load_edgelist(args.input, undirected=True)
    logging.info(" - Graph loaded.")
    return G
def get_random_walks_restart(self, datafile, hits_dict, percentage, maxT, minT):
    if datafile is None:
        datafile = os.path.join(self.model_path, "rating_train.dat")
    G = graph.load_edgelist(datafile, undirected=True)
    print("number of nodes: {}".format(len(G.nodes())))
    print("walking...")
    walks = graph.build_deepwalk_corpus_random(G, hits_dict, percentage=percentage,
                                               maxT=maxT, minT=minT, alpha=0)
    print("walking...ok")
    return G, walks
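# Hedged usage sketch for get_random_walks_restart() above: the returned walks
# are plain lists of node-id sequences, so they can feed a skip-gram trainer
# directly. The instance name `bine` and the `hits_dict` value are hypothetical,
# and gensim's pre-4.0 keyword names are assumed to match the other snippets here.
from gensim.models import Word2Vec

G, walks = bine.get_random_walks_restart(None, hits_dict,
                                         percentage=0.15, maxT=32, minT=1)
w2v = Word2Vec(walks, size=128, window=5, min_count=0, sg=1, workers=4)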
def read_graph(args):
    """
    Reads the input network.
    """
    logging.info(" - Loading graph...")
    graph_dict, in_degrees, out_degrees = graph.load_edgelist(
        args.input, args.directed, args.weighted)
    logging.info(" - Graph loaded.")
    return graph_dict, in_degrees, out_degrees
def graph2walks(self, method="", params=None):
    # Avoid a mutable default argument.
    self.params = params if params is not None else {}

    if method == "deepwalk":
        number_of_walks = self.params['number_of_walks']
        walk_length = self.params['walk_length']
        alpha = self.params['alpha']

        # Temporarily generate the edge list
        with open("./temp/graph.edgelist", 'w') as f:
            for line in nx.generate_edgelist(self.graph, data=False):
                f.write("{}\n".format(line))

        dwg = deepwalk.load_edgelist("./temp/graph.edgelist", undirected=True)
        corpus = deepwalk.build_deepwalk_corpus(G=dwg, num_paths=number_of_walks,
                                                path_length=walk_length, alpha=alpha,
                                                rand=random.Random(0))
    elif method == "node2vec":
        number_of_walks = self.params['number_of_walks']
        walk_length = self.params['walk_length']
        p = self.params['p']
        q = self.params['q']

        # node2vec expects edge weights; default every edge to weight 1.
        for edge in self.graph.edges():
            self.graph[edge[0]][edge[1]]['weight'] = 1
        G = node2vec.Graph(nx_G=self.graph, p=p, q=q, is_directed=False)
        G.preprocess_transition_probs()
        corpus = G.simulate_walks(num_walks=number_of_walks, walk_length=walk_length)
    else:
        raise ValueError("Invalid method name!")

    """
    new_corpus = []
    line_counter = 0
    line = []
    for walk in corpus:
        if line_counter < self.params['number_of_walks']:
            line.extend(walk)
            line_counter += 1
        else:
            line_counter = 0
            new_corpus.append(line)
            line = []
    corpus = new_corpus
    """

    self.corpus = corpus
    return self.corpus
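# Hedged usage sketch for graph2walks() above. The dictionary keys are exactly
# the ones the method reads from self.params; the concrete values and the
# instance name `walker` are illustrative assumptions, not from the source.
walks_dw = walker.graph2walks(method="deepwalk",
                              params={'number_of_walks': 10,
                                      'walk_length': 80,
                                      'alpha': 0.0})
walks_n2v = walker.graph2walks(method="node2vec",
                               params={'number_of_walks': 10,
                                       'walk_length': 80,
                                       'p': 1.0,
                                       'q': 1.0})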
def process(args):
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name,
                               undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'"
                        % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))
    num_walks = len(G.nodes()) * args.number_walks
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length, alpha=0,
                                            rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size, window=args.window_size,
                         min_count=0, sg=1, hs=1, workers=args.workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}). "
              "Dumping walks to disk.".format(data_size, args.max_memory_data_size))
        print("Walking...")
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G, walks_filebase, num_paths=args.number_walks, path_length=args.walk_length,
            alpha=0, rand=random.Random(args.seed), num_workers=args.workers)
        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # use degree distribution for frequency in the tree
            vertex_counts = G.degree(nodes=G.keys())  # G.iterkeys() is Python 2 only
        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                         size=args.representation_size, window=args.window_size,
                         min_count=0, trim_rule=None, workers=args.workers)

    model.wv.save_word2vec_format(args.output, binary=False)
    print('saved!')
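# Hedged usage sketch: a minimal argparse driver for process() above. Only the
# flag names are implied by the attributes the function reads off `args`; the
# default values are illustrative assumptions, not taken from the source.
import argparse

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', required=True)
    parser.add_argument('--output', required=True)
    parser.add_argument('--format', default='edgelist',
                        choices=['adjlist', 'edgelist', 'mat'])
    parser.add_argument('--undirected', default=True, type=bool)
    parser.add_argument('--matfile-variable-name', default='network')
    parser.add_argument('--number-walks', default=10, type=int)
    parser.add_argument('--walk-length', default=40, type=int)
    parser.add_argument('--max-memory-data-size', default=1000000000, type=int)
    parser.add_argument('--seed', default=0, type=int)
    parser.add_argument('--representation-size', default=64, type=int)
    parser.add_argument('--window-size', default=5, type=int)
    parser.add_argument('--workers', default=1, type=int)
    parser.add_argument('--vertex-freq-degree', default=False, action='store_true')
    process(parser.parse_args())

if __name__ == "__main__":
    main()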
def process(args):
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name,
                               undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'"
                        % args.format)

    # G = graphConstruction.buildGraphAPA()

    print("Number of nodes: {}".format(len(G.nodes())))
    num_walks = len(G.nodes()) * args.number_walks
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length, alpha=0,
                                            rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size, window=args.window_size,
                         min_count=0, workers=args.workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}). "
              "Dumping walks to disk.".format(data_size, args.max_memory_data_size))
        print("Walking...")
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G, walks_filebase, num_paths=args.number_walks, path_length=args.walk_length,
            alpha=0, rand=random.Random(args.seed), num_workers=args.workers)
        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # use degree distribution for frequency in the tree
            vertex_counts = G.degree(nodes=G.keys())  # G.iterkeys() is Python 2 only
        print("Training...")
        model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files),
                         vocabulary_counts=vertex_counts, size=args.representation_size,
                         window=args.window_size, min_count=0, workers=args.workers)

    model.save_word2vec_format(args.output)
def read_graph(args):
    """
    Reads the input network.
    """
    # restore_variable_from_disk must be imported alongside save_variable_on_disk.
    from utils import save_variable_on_disk, restore_variable_from_disk

    logging.info(" - Loading graph...")
    graph_dict, in_degrees, out_degrees, val_density = graph.load_edgelist(
        args.input, args.directed, args.weighted)
    save_variable_on_disk(val_density, "density")
    logging.info(" - Graph density saved.")
    # Read the value back as a sanity check.
    ss = restore_variable_from_disk("density")
    logging.info(" - Graph loaded.")
    return graph_dict, in_degrees, out_degrees, val_density
def get_random_walks_restart_for_large_bipartite_graph_without_generating(self, datafile, hits_dict,
                                                                          percentage, maxT, minT,
                                                                          node_type='u'):
    if datafile is None:
        datafile = os.path.join(self.model_path, "rating_train.dat")
    G = graph.load_edgelist(datafile, undirected=True)
    # Count only the nodes of the requested type (node ids are prefixed, e.g. 'u...').
    cnt = 0
    for n in G.nodes():
        if n[0] == node_type:
            cnt += 1
    print("number of nodes: {}".format(cnt))
    print("walking...")
    walks = graph.build_deepwalk_corpus_random_for_large_bibartite_graph(
        G, hits_dict, percentage=percentage, maxT=maxT, minT=minT, alpha=0, node_type=node_type)
    # print(walks)
    print("walking...ok")
    return G, walks
def graph_walk_data(data_path=None):
    p_train_path = os.path.join(data_path, "train.txt")
    p_valid_path = os.path.join(data_path, "valid.txt")
    p_test_path = os.path.join(data_path, "test.txt")
    train_path = os.path.join(data_path, "train.edgelist")
    valid_path = os.path.join(data_path, "valid.edgelist")
    test_path = os.path.join(data_path, "test.edgelist")

    word2id = _build_vocab(p_train_path)
    _graph_to_edgelist(p_train_path, train_path, word2id)
    _graph_to_edgelist(p_valid_path, valid_path, word2id)
    _graph_to_edgelist(p_test_path, test_path, word2id)
    # Use a context manager so the file handle is closed after dumping.
    with open(os.path.join(data_path, "word2id.json"), 'w') as fout:
        json.dump(word2id, fout)

    G_train = graph.load_edgelist(train_path)
    G_valid = graph.load_edgelist(valid_path)
    G_test = graph.load_edgelist(test_path)

    train_walks = graph.build_deepwalk_corpus(G_train, list_exclud=[], num_paths=30, path_length=50)
    valid_walks = graph.build_deepwalk_corpus(G_valid, list_exclud=[], num_paths=30, path_length=50)
    test_walks = graph.build_deepwalk_corpus(G_test, list_exclud=[], num_paths=30, path_length=50)

    vocabulary = len(word2id)
    return train_walks, valid_walks, test_walks, vocabulary
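# Hedged usage sketch for graph_walk_data() above; assumes data_path contains
# train.txt, valid.txt and test.txt in the expected format. The path is an
# illustrative placeholder.
train_walks, valid_walks, test_walks, vocab_size = graph_walk_data("./data")
print("vocabulary size: {}".format(vocab_size))
print("training walks: {}".format(len(train_walks)))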
def process(args):
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name,
                               undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'"
                        % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))
    num_walks = len(G.nodes()) * args.number_walks
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    print("Walking...")
    start = time.time()
    walks_filebase = args.output + ".txt"
    walk_files = serialized_walks.write_walks_to_disk(
        G, walks_filebase, num_paths=args.number_walks, path_length=args.walk_length,
        alpha=0, rand=random.Random(args.seed), num_workers=args.workers)

    # print("Counting vertex frequency...")
    # if not args.vertex_freq_degree:
    #     vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
    # else:
    #     # use degree distribution for frequency in the tree
    #     vertex_counts = G.degree(nodes=G.iterkeys())

    end = time.time()
    exe_time = end - start
    print("--------- walking time: {:.5f} -----------".format(exe_time))
def graph_walk_data(data_path=None):
    p_train_path = os.path.join(data_path, "train.txt")
    train_path = os.path.join(data_path, "train.edgelist")

    word2id = _build_vocab(p_train_path)
    _graph_to_edgelist(p_train_path, train_path, word2id)

    print("export word2id file....")
    # Use a context manager so the file handle is closed after dumping.
    with open(os.path.join(data_path, "word2id.json"), 'w') as fout:
        json.dump(word2id, fout)
    print("word2id file exported.")

    G_train = graph.load_edgelist(train_path)
    train_walks = graph.build_deepwalk_corpus(G_train, list_exclud=[],
                                              num_paths=RWConf["num_paths"],
                                              path_length=RWConf["path_length"])
    vocabulary = len(word2id)
    return train_walks, vocabulary
def perform_random_walks(self, output_node_corpus_file):
    # Note: `('number_of_walks' and 'walk_length') in self.params` would only
    # check 'walk_length'; each key must be tested individually.
    if ('number_of_walks' not in self.params or 'walk_length' not in self.params
            or self.graph is None):
        raise ValueError("Missing parameter!")

    self.number_of_nodes = self.graph.number_of_nodes()
    self.N = self.number_of_nodes * self.params['number_of_walks']
    self.L = self.params['walk_length']

    initial_time = time.time()
    # Generate a corpus
    if self.params['random_walk'] == "deepwalk":
        if 'dw_alpha' not in self.params:
            raise ValueError("A parameter is missing!")

        # Temporarily generate the edge list
        with open(os.path.join(self.temp_folder, "graph_deepwalk.edgelist"), 'w') as f:
            for line in nx.generate_edgelist(self.graph, data=False):
                f.write("{}\n".format(line))

        dwg = deepwalk.load_edgelist(os.path.join(self.temp_folder, "graph_deepwalk.edgelist"),
                                     undirected=True)
        self.corpus = deepwalk.build_deepwalk_corpus(G=dwg,
                                                     num_paths=self.params['number_of_walks'],
                                                     path_length=self.params['walk_length'],
                                                     alpha=self.params['dw_alpha'])
    elif self.params['random_walk'] == "node2vec":
        if 'n2v_p' not in self.params or 'n2v_q' not in self.params:
            raise ValueError("A parameter is missing!")

        # node2vec expects edge weights; default every edge to weight 1.
        for edge in self.graph.edges():
            self.graph[edge[0]][edge[1]]['weight'] = 1
        G = node2vec.Graph(nx_G=self.graph, p=self.params['n2v_p'], q=self.params['n2v_q'],
                           is_directed=False)
        G.preprocess_transition_probs()
        self.corpus = G.simulate_walks(num_walks=self.params['number_of_walks'],
                                       walk_length=self.params['walk_length'])
    else:
        raise ValueError("Invalid method name!")

    self.save_corpus(output_node_corpus_file, with_title=False)
    print("The corpus was generated in {:.2f} secs.".format(time.time() - initial_time))
def get_random_walks_restart_for_large_bipartite_graph_without_generating(self, datafile, hits_dict,
                                                                          percentage, maxT, minT,
                                                                          node_type='u'):
    if datafile is None:
        datafile = os.path.join(self.model_path, "rating_train.dat")
    # TODO 8: change datafile so that
    #   > gene ids have 'u' as their first character
    #   > disease ids have 'i' as their first character
    G = graph.load_edgelist(datafile, undirected=True)
    cnt = 0
    for n in G.nodes():
        if n[0] == node_type:
            cnt += 1
    print("number of nodes: {}".format(cnt))
    print("walking...")
    # TODO 5: the input hits_dict is empty because authority_v and authority_u
    # are empty. What are authority_v and authority_u for?
    # Output: walks = {}
    walks = graph.build_deepwalk_corpus_random_for_large_bibartite_graph(
        G, hits_dict, percentage=percentage, maxT=maxT, minT=minT, alpha=0, node_type=node_type)
    # print(walks)
    print("walking...ok")
    return G, walks
def process(args):
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name,
                               undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'"
                        % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))

    # Read the (optional) list of nodes to exclude from the walks.
    if args.excludlist and os.path.isfile(args.excludlist):
        with open(args.excludlist) as fin:
            list_exclud = set(int(x) for x in fin)
        num_exclud = len(list_exclud)
    else:
        num_exclud = 0
        list_exclud = []

    if num_exclud > 0:
        print("Number of nodes excluded from the walk: {}".format(num_exclud))

    num_walks = (len(G.nodes()) - num_exclud) * args.number_walks
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, list_exclud=list_exclud,
                                            num_paths=args.number_walks,
                                            path_length=args.walk_length, alpha=0,
                                            rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size, window=args.window_size,
                         min_count=0, workers=args.workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}). "
              "Dumping walks to disk.".format(data_size, args.max_memory_data_size))
        print("Walking...")
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G, list_exclud, walks_filebase, num_paths=args.number_walks,
            path_length=args.walk_length, alpha=0, rand=random.Random(args.seed),
            num_workers=args.workers)
        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # use degree distribution for frequency in the tree
            vertex_counts = G.degree(nodes=G.keys())  # G.iterkeys() is Python 2 only
        print("Training...")
        model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files),
                         vocabulary_counts=vertex_counts, size=args.representation_size,
                         window=args.window_size, min_count=0, workers=args.workers)

    model.wv.save_word2vec_format(args.output)
def process(args):
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected,
                                attr_file_name=args.sensitive_attr_file,
                                test_links_ratio=args.test_links,
                                test_links_file=args.test_links_file,
                                train_links_file=args.train_links_file)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name,
                               undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'"
                        % args.format)

    if args.heuristic_wrb_for_wbr is not None:
        wrb, err = graph.compute_heuristic_wrb(G, float(args.heuristic_wrb_for_wbr))
        print(wrb, err)
        return

    if (args.weighted is not None) and (args.weighted != 'unweighted'):
        G = graph.set_weights(G, args.weighted)

    if args.just_write_graph:
        with open('wgraph.out', 'w') as fout:
            if args.weighted == 'unweighted':
                for v in G:
                    s = len(G[v])
                    for u in G[v]:
                        fout.write(str(v) + ' ' + str(u) + ' ' + str(1 / s) + '\n')
            elif args.weighted.startswith('random_walk'):
                for v in G:
                    for u, w in zip(G[v], G.edge_weights[v]):
                        fout.write(str(v) + ' ' + str(u) + ' ' + str(w) + '\n')
            else:
                raise Exception('just-write-graph is not supported for this weighting method')
        return None

    num_walks = len(G.nodes()) * args.number_walks
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length,
                                            p_modified=args.pmodified,
                                            alpha=0, rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size, window=args.window_size,
                         min_count=0, sg=1, hs=1, workers=args.workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}). "
              "Dumping walks to disk.".format(data_size, args.max_memory_data_size))
        print("Walking...")
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G, walks_filebase, num_paths=args.number_walks, path_length=args.walk_length,
            p_modified=args.pmodified, alpha=0, rand=random.Random(args.seed),
            num_workers=args.workers)
        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # use degree distribution for frequency in the tree
            vertex_counts = G.degree(nodes=G.keys())  # G.iterkeys() is Python 2 only
        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                         size=args.representation_size, window=args.window_size,
                         min_count=0, trim_rule=None, workers=args.workers)

    model.wv.save_word2vec_format(args.output)
dw_params['d'] = 128
dw_params['workers'] = 3

# Generate a synthetic LFR graph.
rg = randomgraph.RanGraphGen()
rg.set_model(model=lfr_params)
g = rg.lfr_model()
graph_path = "./outputs/lfr_synthetic_n1000.gml"
nx.write_gml(g, graph_path)

# Find the embedding of the generated graph. The temp file holds an edge list
# (written by nx.write_edgelist), so it is named accordingly.
temp_edgelist_file = "./temp/graph.edgelist"
embedding_file = "./outputs/output.embedding"
nx.write_edgelist(g, temp_edgelist_file)
dwg = dw.load_edgelist(temp_edgelist_file, undirected=True)
walks = dw.build_deepwalk_corpus(dwg, num_paths=dw_params['n'],
                                 path_length=dw_params['l'], alpha=0)
model = Word2Vec(walks, size=dw_params['d'], window=dw_params['w'],
                 min_count=0, sg=1, hs=1, workers=dw_params['workers'])
model.wv.save_word2vec_format(embedding_file)
comdetect = CommunityDetection(embedding_file, graph_path,
def main(graph_fname, node_vec_fname, options):
    '''\
    %prog [options] <graph_fname> <node_vec_fname> <path_vec_fname>

    graph_fname: the graph file
        It can be a file containing one edge per line
        (e.g., res/karate_club_edges.txt) or a pickled graph file.
    node_vec_fname: the output file for the nodes' vectors
    '''
    print('Load a road graph...')
    # g = loader.load_a_HIN(graph_fname)
    G = graph.load_edgelist(graph_fname, undirected=True)

    print('Generate random walks...')
    print("Number of nodes: {}".format(len(G.nodes())))
    num_walks = len(G.nodes()) * options.walk_num
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * options.walk_length
    print("Data size (walks*length): {}".format(data_size))
    print("Walking...")
    walks = graph.build_deepwalk_corpus(G, num_paths=options.walk_num,
                                        path_length=options.walk_length,
                                        alpha=0, rand=random.Random(0))

    tmp_walk_fname = "tmp_walk_fname.txt"
    tmp_walk_json = "tmp_walk_fname.json"
    with open(tmp_walk_json, 'w') as tmp_walks:
        tmp_walks.write(json.dumps(walks))
    with open(tmp_walk_fname, 'w') as f:
        for walk in walks:
            f.write('%s\n' % ' '.join(map(str, walk)))
    print("Walking done...")

    model = MP2Vec(size=options.dim,
                   window=options.window,
                   neg=options.neg,
                   num_processes=options.num_processes,
                   alpha=options.alpha,
                   same_w=True,
                   normed=False)

    neighbors = None  # {node_osmid: [<node_osmid>, <node_osmid>, ...]}
    if options.correct_neg:
        for id_ in G:
            G._get_k_hop_neighborhood(id_, options.window)
        neighbors = G.k_hop_neighbors[options.window]

    model.train(G, walks, k_hop_neighbors=neighbors)

    print('Dump vectors...')
    model.dump_to_file(node_vec_fname, type_='node')
    return 0
def read_graph(args):
    '''
    Reads the input network.
    '''
    G = graph.load_edgelist(args.input, undirected=True)
    return G
def deepwalk_get_feature(args, adj_indices, result_path):
    model_path = result_path + '.model'
    if os.path.exists(model_path):
        return Word2Vec.load(model_path)

    G = graph.load_edgelist(adj_indices, undirected=args.undirected)
    print(G)
    if len(G) < 10:
        print('Too few nodes to run random walks.')
        return []

    print("Number of nodes: {}".format(len(G.nodes())))
    num_walks = len(G.nodes()) * args.number_walks
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length, alpha=0,
                                            rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size, window=args.window_size,
                         min_count=0, sg=1, hs=1, workers=args.workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}). "
              "Dumping walks to disk.".format(data_size, args.max_memory_data_size))
        print("Walking...")
        walks_filebase = args.dataset + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G, walks_filebase, num_paths=args.number_walks, path_length=args.walk_length,
            alpha=0, rand=random.Random(args.seed), num_workers=args.workers)
        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # use degree distribution for frequency in the tree
            vertex_counts = G.degree(nodes=G.keys())  # G.iterkeys() is Python 2 only
        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                         size=args.representation_size, window=args.window_size,
                         min_count=0, trim_rule=None, workers=args.workers)

    model.wv.save_word2vec_format(result_path + '.feature')
    model.save(model_path)
    return model