def process(args):
    """Train DeepWalk embeddings on the training graph and evaluate
    rating predictions against the held-out test ratings file.

    args: namespace providing number_walks, walk_length,
          representation_size, window_size and workers.
    """
    # Create a graph from the training set (also yields the node lookup dict).
    nodedict = graph.records_to_graph()

    # Build the model using DeepWalk random walks fed into Word2Vec.
    G = graph.load_adjacencylist("out.adj", undirected=True)
    walk = graph.build_deepwalk_corpus(G, args.number_walks, args.walk_length,
                                       alpha=0, rand=random.Random(0))
    # Bug fix: Python-2-only `print expr` statements and `fin.next()` are
    # replaced by print() / next(), which work on Python 2.6+ and Python 3.
    print(len(walk))
    model = Word2Vec(walk, size=args.representation_size,
                     window=args.window_size, min_count=0,
                     workers=args.workers)
    print(model)

    # Perform some evaluation of the model on the test dataset.
    with open("./data/test_user_ratings.dat") as fin:
        next(fin)  # skip the header line
        groundtruth = [line.strip().split("\t")[:3] for line in fin]  # (user, movie, rating)
    tr = [int(round(float(g[2]))) for g in groundtruth]
    pr = [predict_rating(model, nodedict, "u" + g[0], "m" + g[1])
          for g in groundtruth]
    print("MSE = %f" % mean_squared_error(tr, pr))
    print("accuracy = %f" % accuracy_score(tr, pr))
    cm = confusion_matrix(tr, pr, labels=range(1, 6))
    print(cm)
def process(args):
    """Learn DeepWalk embeddings for a .npy adjacency-list graph and save
    them in word2vec text format at ``args.output``."""
    net = graph.load_adjacencylist_npy(args.input, undirected=args.undirected)

    n_nodes = len(net.nodes())
    print("Number of nodes: {}".format(n_nodes))

    total_walks = n_nodes * args.number_walks
    print("Number of walks: {}".format(total_walks))

    corpus_size = total_walks * args.walk_length
    print("Data size (walks*length): {}".format(corpus_size))

    print("Walking...")
    corpus = graph.build_deepwalk_corpus(net,
                                         num_paths=args.number_walks,
                                         path_length=args.walk_length,
                                         alpha=0,
                                         rand=random.Random(args.seed))

    print("Training...")
    embedder = Word2Vec(corpus,
                        size=args.dimensions,
                        window=args.window_size,
                        min_count=0,
                        sg=1,
                        hs=1,
                        workers=args.workers)
    embedder.wv.save_word2vec_format(args.output)
def process(args):
    """Train DeepWalk + Word2Vec on the training graph and report MSE,
    accuracy and a confusion matrix over the test ratings."""
    # Graph built from the training records; also gives the node lookup table.
    nodedict = graph.records_to_graph()
    net = graph.load_adjacencylist("out.adj", undirected=True)

    corpus = graph.build_deepwalk_corpus(net,
                                         num_paths=args.number_walks,
                                         path_length=args.walk_length,
                                         alpha=0,
                                         rand=random.Random(args.seed))
    model = Word2Vec(corpus,
                     size=args.representation_size,
                     window=args.window_size,
                     min_count=0,
                     workers=args.workers)

    # Score the embeddings on the held-out ratings.
    with open("./data/test_user_ratings.dat") as fin:
        fin.readline()  # drop the header row
        groundtruth = [line.strip().split("\t")[:3] for line in fin]  # (user, movie, rating)

    tr = [int(round(float(row[2]))) for row in groundtruth]
    pr = [
        predict_rating(model, nodedict, "u" + row[0], "m" + row[1])
        for row in groundtruth
    ]

    print("MSE = %f" % mean_squared_error(tr, pr))
    print("accuracy = %f" % accuracy_score(tr, pr))
    cm = confusion_matrix(tr, pr, labels=range(1, 6))
    print(cm)
def graph2walks(self, method="", params=None):
    """Generate a random-walk corpus over ``self.graph``.

    method: "deepwalk" or "node2vec".
    params: dict of walk hyper-parameters, or None for an empty dict.
            deepwalk expects number_of_walks/walk_length/alpha;
            node2vec expects number_of_walks/walk_length/p/q.

    Returns the corpus (also stored on ``self.corpus``).
    Raises ValueError for an unknown method name.
    """
    # Bug fix: the old signature used a mutable default (params={}), which
    # is shared between calls.  A None sentinel is backward compatible.
    self.params = {} if params is None else params

    if method == "deepwalk":
        number_of_walks = self.params['number_of_walks']
        walk_length = self.params['walk_length']
        alpha = self.params['alpha']

        # Temporarily generate the edge list
        with open("./temp/graph.edgelist", 'w') as f:
            for line in nx.generate_edgelist(self.graph, data=False):
                f.write("{}\n".format(line))

        dwg = deepwalk.load_edgelist("./temp/graph.edgelist", undirected=True)
        corpus = deepwalk.build_deepwalk_corpus(G=dwg,
                                                num_paths=number_of_walks,
                                                path_length=walk_length,
                                                alpha=alpha,
                                                rand=random.Random(0))
    elif method == "node2vec":
        number_of_walks = self.params['number_of_walks']
        walk_length = self.params['walk_length']
        p = self.params['p']
        q = self.params['q']

        # node2vec expects weighted edges; give every edge unit weight.
        for edge in self.graph.edges():
            self.graph[edge[0]][edge[1]]['weight'] = 1

        G = node2vec.Graph(nx_G=self.graph, p=p, q=q, is_directed=False)
        G.preprocess_transition_probs()
        corpus = G.simulate_walks(num_walks=number_of_walks,
                                  walk_length=walk_length)
    else:
        raise ValueError("Invalid method name!")

    self.corpus = corpus
    return self.corpus
def process(args):
    """Run DeepWalk end-to-end: load the graph, generate random walks,
    train a skip-gram model, and save embeddings to ``args.output``.

    Corpora that would exceed args.max_memory_data_size are streamed to
    disk and trained from files instead of being held in memory.
    """
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))
    num_walks = len(G.nodes()) * args.number_walks
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length, alpha=0,
                                            rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size,
                         window=args.window_size, min_count=0, sg=1, hs=1,
                         workers=args.workers)
    else:
        # Bug fix: this message string was split by a raw line break in the
        # source (a syntax error); it is now a single literal.
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(data_size, args.max_memory_data_size))
        print("Walking...")
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase,
                                                          num_paths=args.number_walks,
                                                          path_length=args.walk_length,
                                                          alpha=0,
                                                          rand=random.Random(args.seed),
                                                          num_workers=args.workers)
        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())
        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                         size=args.representation_size, window=args.window_size,
                         min_count=0, trim_rule=None, workers=args.workers)

    model.wv.save_word2vec_format(args.output, binary=False)
    print('saved!')
def construct_rpr_matrix(G, INDUCTIVE=False):
    '''Construct a Rooted PageRank matrix from DeepWalk visit counts.

    For each node, counts how often every vertex appears in that node's
    random walks, keeps the FLAGS.k_RPR most visited vertices, and
    normalizes the counts into a probability row.  Also writes "normal"
    random walks to disk via graph.write_normal_randomwalks.

    Returns (rpr_matrix, pairs, rpr_arg):
      rpr_matrix -- (num_nodes, k_RPR) normalized visit frequencies
      pairs      -- whatever graph.write_normal_randomwalks returns
      rpr_arg    -- (num_nodes, k_RPR) vertex ids matching rpr_matrix
    NOTE(review): INDUCTIVE is accepted but never used in this body.
    '''
    print("Number of nodes: {}".format(len(G.nodes())))
    num_walks = len(G.nodes()) * FLAGS.walk_times
    num_nodes = len(G.nodes())
    print("Number of walks: {}".format(num_walks))
    print("Walking...")
    # One corpus of FLAGS.walk_times walks per node, seeded for reproducibility.
    walks = graph.build_deepwalk_corpus(G, num_paths=FLAGS.walk_times,
                                        path_length=FLAGS.walk_length,
                                        alpha=FLAGS.alpha,
                                        rand=random.Random(FLAGS.seed))
    # Per-root visit counter: all_counts[root][v] = times v appeared in
    # root's walks.  (walks maps each root node to its list of walks.)
    all_counts = {}
    for node in walks.keys():
        walks_n = walks[node]
        all_counts[node] = Counter()
        for walk in walks_n:
            all_counts[node].update(walk)
    print("Normal random walks started...")
    pairs = graph.write_normal_randomwalks(
        G, file_='./var/' + FLAGS.train_prefix + '_normal_walks.txt',
        rand=random.Random(FLAGS.seed))
    print("Normal random walks dumped.")
    rpr_matrix = []
    rpr_arg = []
    # NOTE(review): xrange is Python-2-only; this module appears to target
    # Python 2 despite the print() calls.
    for node in tqdm(xrange(num_nodes)):
        if node not in all_counts.keys():
            # Every node id 0..num_nodes-1 is expected to have walks.
            raise NotImplementedError
        temp = all_counts[node].most_common(FLAGS.k_RPR)
        temp_arg = [i[0] for i in temp]    # visited vertex ids
        temp_value = [i[1] for i in temp]  # visit counts
        if len(temp) < FLAGS.k_RPR:
            # Pad short rows with zero counts pointing back at the root.
            for _ in xrange(FLAGS.k_RPR - len(temp)):
                temp_value.append(0.0)
                temp_arg.append(node)
        temp_value = np.asarray(temp_value, dtype='double')
        # Normalize counts to probabilities.
        # NOTE(review): divides by zero if a node has no recorded visits —
        # presumably impossible since each walk starts at its root; confirm.
        temp_value = temp_value / sum(temp_value)
        rpr_matrix.append(temp_value)
        rpr_arg.append(temp_arg)
    rpr_matrix = np.asarray(rpr_matrix, dtype='double')
    rpr_arg = np.asarray(rpr_arg, dtype='double')
    rpr_file = './var/' + FLAGS.train_prefix + '_rpr.mat'
    sio.savemat(rpr_file, {'rpr_matrix': rpr_matrix})
    return rpr_matrix, pairs, rpr_arg
def process(args):
    """Run DeepWalk on ``args.input`` and write vertex embeddings to
    ``args.output``.

    Small corpora are walked in memory and trained with Word2Vec; corpora
    exceeding args.max_memory_data_size are streamed to disk and trained
    with Skipgram.
    """
    fmt = args.format
    if fmt == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif fmt == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif fmt == "mat":
        G = graph.load_matfile(args.input,
                               variable_name=args.matfile_variable_name,
                               undirected=args.undirected)
    else:
        raise Exception(
            "Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'"
            % fmt)

    node_total = len(G.nodes())
    print("Number of nodes: {}".format(node_total))

    walk_total = node_total * args.number_walks
    print("Number of walks: {}".format(walk_total))

    corpus_size = walk_total * args.walk_length
    print("Data size (walks*length): {}".format(corpus_size))

    if corpus_size < args.max_memory_data_size:
        # Everything fits in memory: walk once, train directly.
        print("Walking...")
        corpus = graph.build_deepwalk_corpus(G,
                                             num_paths=args.number_walks,
                                             path_length=args.walk_length,
                                             alpha=0,
                                             rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(corpus,
                         size=args.representation_size,
                         window=args.window_size,
                         min_count=0,
                         workers=args.workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(corpus_size, args.max_memory_data_size))
        print("Walking...")
        walk_basename = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G, walk_basename,
            num_paths=args.number_walks,
            path_length=args.walk_length,
            alpha=0,
            rand=random.Random(args.seed),
            num_workers=args.workers)

        print("Counting vertex frequency...")
        if args.vertex_freq_degree:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())
        else:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)

        print("Training...")
        model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files),
                         vocabulary_counts=vertex_counts,
                         size=args.representation_size,
                         window=args.window_size,
                         min_count=0,
                         workers=args.workers)

    model.save_word2vec_format(args.output)
def graph_walk_data(data_path=None):
    """Convert the train/valid/test text graphs under ``data_path`` into
    edge lists, dump the vocabulary mapping, and return DeepWalk corpora
    for all three splits plus the vocabulary size."""
    splits = ("train", "valid", "test")
    raw_paths = {s: os.path.join(data_path, s + ".txt") for s in splits}
    edge_paths = {s: os.path.join(data_path, s + ".edgelist") for s in splits}

    # Vocabulary comes from the training split only.
    word2id = _build_vocab(raw_paths["train"])
    for s in splits:
        _graph_to_edgelist(raw_paths[s], edge_paths[s], word2id)

    json.dump(word2id, open(os.path.join(data_path, "word2id.json"), 'w'))

    corpora = []
    for s in splits:
        split_graph = graph.load_edgelist(edge_paths[s])
        corpora.append(graph.build_deepwalk_corpus(split_graph,
                                                   list_exclud=[],
                                                   num_paths=30,
                                                   path_length=50))
    train_walks, valid_walks, test_walks = corpora
    return train_walks, valid_walks, test_walks, len(word2id)
def graph_walk_data(data_path=None):
    """Build the training DeepWalk corpus for the graph at ``data_path``
    and return it together with the vocabulary size."""
    txt_file = os.path.join(data_path, "train.txt")
    edge_file = os.path.join(data_path, "train.edgelist")

    word2id = _build_vocab(txt_file)
    _graph_to_edgelist(txt_file, edge_file, word2id)

    print("export word2id file....")
    json.dump(word2id, open(os.path.join(data_path, "word2id.json"), 'w'))
    print("word2id file exported.")

    train_graph = graph.load_edgelist(edge_file)
    walks = graph.build_deepwalk_corpus(train_graph,
                                        list_exclud=[],
                                        num_paths=RWConf["num_paths"],
                                        path_length=RWConf["path_length"])
    return walks, len(word2id)
def perform_random_walks(self, output_node_corpus_file):
    """Generate a random-walk corpus over ``self.graph`` and save it to
    ``output_node_corpus_file``.

    Uses self.params['random_walk'] to pick the walker ("deepwalk" or
    "node2vec") plus the method-specific hyper-parameters.
    Raises ValueError when the graph or a required parameter is missing.
    """
    # Bug fix: the original guard tested ('number_of_walks' and
    # 'walk_length') in self.params.keys(), which evaluates to just
    # 'walk_length' — a missing 'number_of_walks' went undetected.
    if self.graph is None or not all(k in self.params
                                     for k in ('number_of_walks', 'walk_length')):
        raise ValueError("Missing parameter !")

    self.number_of_nodes = self.graph.number_of_nodes()
    self.N = self.number_of_nodes * self.params['number_of_walks']
    self.L = self.params['walk_length']

    initial_time = time.time()
    # Generate a corpus

    if self.params['random_walk'] == "deepwalk":
        if 'dw_alpha' not in self.params:
            raise ValueError("A parameter is missing!")

        # Temporarily generate the edge list
        with open(os.path.join(self.temp_folder, "graph_deepwalk.edgelist"), 'w') as f:
            for line in nx.generate_edgelist(self.graph, data=False):
                f.write("{}\n".format(line))

        dwg = deepwalk.load_edgelist(os.path.join(self.temp_folder, "graph_deepwalk.edgelist"),
                                     undirected=True)
        self.corpus = deepwalk.build_deepwalk_corpus(
            G=dwg,
            num_paths=self.params['number_of_walks'],
            path_length=self.params['walk_length'],
            alpha=self.params['dw_alpha'])

    elif self.params['random_walk'] == "node2vec":
        # Bug fix: ('n2v_p' and 'n2v_q') likewise collapsed to 'n2v_q';
        # require both node2vec parameters explicitly.
        if not all(k in self.params for k in ('n2v_p', 'n2v_q')):
            raise ValueError("A missing parameter exists!")

        # node2vec needs weighted edges; assign unit weights.
        for edge in self.graph.edges():
            self.graph[edge[0]][edge[1]]['weight'] = 1
        G = node2vec.Graph(nx_G=self.graph, p=self.params['n2v_p'],
                           q=self.params['n2v_q'], is_directed=False)
        G.preprocess_transition_probs()
        self.corpus = G.simulate_walks(num_walks=self.params['number_of_walks'],
                                       walk_length=self.params['walk_length'])
    else:
        raise ValueError("Invalid method name!")

    self.save_corpus(output_node_corpus_file, with_title=False)
    print("The corpus was generated in {:.2f} secs.".format(time.time() - initial_time))
def process(args):
    """Train DeepWalk embeddings on the training graph and evaluate
    rating predictions against the held-out test ratings file.

    args: namespace providing number_walks, walk_length,
          representation_size, window_size and workers.
    """
    # Create a graph from the training set (also yields the node lookup dict).
    nodedict = graph.records_to_graph()

    # Build the model using DeepWalk random walks fed into Word2Vec.
    G = graph.load_adjacencylist("out.adj", undirected=True)
    walk = graph.build_deepwalk_corpus(G, args.number_walks, args.walk_length,
                                       alpha=0, rand=random.Random(0))
    # Bug fix: Python-2-only `print expr` statements and `fin.next()` are
    # replaced by print() / next(), which work on Python 2.6+ and Python 3.
    print(len(walk))
    model = Word2Vec(walk, size=args.representation_size,
                     window=args.window_size, min_count=0,
                     workers=args.workers)
    print(model)

    # Perform some evaluation of the model on the test dataset.
    with open("./data/test_user_ratings.dat") as fin:
        next(fin)  # skip the header line
        groundtruth = [line.strip().split("\t")[:3] for line in fin]  # (user, movie, rating)
    tr = [int(round(float(g[2]))) for g in groundtruth]
    pr = [
        predict_rating(model, nodedict, "u" + g[0], "m" + g[1])
        for g in groundtruth
    ]
    print("MSE = %f" % mean_squared_error(tr, pr))
    print("accuracy = %f" % accuracy_score(tr, pr))
    cm = confusion_matrix(tr, pr, labels=range(1, 6))
    print(cm)
def process(args):
    """DeepWalk variant for weighted edge lists: repeatedly walks the
    graph and incrementally trains a Skipgram model, with progress
    reported on stderr.  Only the 'w_edgelist' format is supported.
    """
    if args.format == "w_edgelist":
        G = graph.load_weighted_edgelist(args.input, undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'. This version supports only 'w_edgelist'" % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))
    num_walks = len(G.nodes()) * args.number_walks
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    # The in-memory branch is hard-wired on; the else branch is kept as the
    # original disk-backed fallback.
    if True:
        print("Initailizing...")
        vertex_counts = G.degree(nodes=G.iterkeys())
        model = Skipgram(sentences=None, vocabulary_counts=vertex_counts,
                         size=args.representation_size, window=args.window_size,
                         min_count=0, workers=args.workers, sg=args.sg)

        print("Walking & Training...")
        sys.stderr.write("\rprogress: 0.00 [0/%d] %%" % (args.number_walks + 1))
        for i in xrange(args.number_walks):
            sys.stderr.write("\rprogress: %.2f %% [%d/%d] (walk step) " % ((i) * 100. / (args.number_walks + 1), i + 1, args.number_walks + 1))
            sys.stderr.flush()
            # NOTE(review): each pass regenerates a full corpus with
            # num_paths=args.number_walks — confirm this is intended rather
            # than one path per iteration.
            walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                                path_length=args.walk_length,
                                                alpha=0.,
                                                rand=random.Random(args.seed),
                                                workers=args.workers)
            sys.stderr.write("\rprogress: %.2f %% [%d/%d] (train step) " % ((i + .5) * 100. / (args.number_walks + 1), i + 1, args.number_walks + 1))
            sys.stderr.flush()
            model.train(walks)
        sys.stderr.write("\rprogress: 100.00 %%\n")
        sys.stderr.flush()
    else:
        # Bug fix: this message string was split by a raw line break in the
        # source (a syntax error); it is now a single literal.
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(data_size, args.max_memory_data_size))
        print("Walking...")
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase,
                                                          num_paths=args.number_walks,
                                                          path_length=args.walk_length,
                                                          alpha=0.1,
                                                          rand=random.Random(args.seed),
                                                          num_workers=args.workers)
        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())
        print("Training...")
        model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files),
                         vocabulary_counts=vertex_counts,
                         size=args.representation_size, window=args.window_size,
                         min_count=0, workers=args.workers)
        model.save_word2vec_format(args.output)
def process(args):
    """Fairness-aware DeepWalk driver: loads the graph (optionally with
    sensitive attributes and link-prediction splits), optionally
    reweights edges, then walks and trains a skip-gram model.

    Early exits: heuristic wrb computation, or just dumping the weighted
    graph to 'wgraph.out'.
    """
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected,
                                attr_file_name=args.sensitive_attr_file,
                                test_links_ratio=args.test_links,
                                test_links_file=args.test_links_file,
                                train_links_file=args.train_links_file)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

    if args.heuristic_wrb_for_wbr is not None:
        wrb, err = graph.compute_heuristic_wrb(G, float(args.heuristic_wrb_for_wbr))
        print(wrb, err)
        return

    if (args.weighted is not None) and (args.weighted != 'unweighted'):
        G = graph.set_weights(G, args.weighted)

    if args.just_write_graph:
        with open('wgraph.out', 'w') as fout:
            if args.weighted == 'unweighted':
                for v in G:
                    s = len(G[v])
                    for u in G[v]:
                        # NOTE(review): 1/s is integer division under
                        # Python 2 — confirm fractional weights are intended.
                        fout.write(str(v) + ' ' + str(u) + ' ' + str(1/s) + '\n')
            elif args.weighted.startswith('random_walk'):
                for v in G:
                    for u, w in zip(G[v], G.edge_weights[v]):
                        fout.write(str(v) + ' ' + str(u) + ' ' + str(w) + '\n')
            else:
                raise Exception('just-write-graph is not supported for this weighting method')
        return None

    num_walks = len(G.nodes()) * args.number_walks
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length,
                                            p_modified=args.pmodified,
                                            alpha=0, rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size,
                         window=args.window_size, min_count=0, sg=1, hs=1,
                         workers=args.workers)
    else:
        # Bug fix: this message string was split by a raw line break in the
        # source (a syntax error); it is now a single literal.
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(data_size, args.max_memory_data_size))
        print("Walking...")
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase,
                                                          num_paths=args.number_walks,
                                                          path_length=args.walk_length,
                                                          p_modified=args.pmodified,
                                                          alpha=0,
                                                          rand=random.Random(args.seed),
                                                          num_workers=args.workers)
        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())
        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                         size=args.representation_size, window=args.window_size,
                         min_count=0, trim_rule=None, workers=args.workers)

    model.wv.save_word2vec_format(args.output)
# Generate a synthetic LFR benchmark graph and persist it as GML.
rg = randomgraph.RanGraphGen()
rg.set_model(model=lfr_params)
g = rg.lfr_model()
graph_path = "./outputs/lfr_synthetic_n1000.gml"
nx.write_gml(g, graph_path)

# Find the embedding of the generated graph via DeepWalk + Word2Vec.
temp_adjlist_file = "./temp/graph.adjlist"
embedding_file = "./outputs/output.embedding"
# NOTE(review): write_edgelist emits an edge list even though the temp file
# is named ".adjlist"; dw.load_edgelist reads it back, so naming is cosmetic.
nx.write_edgelist(g, temp_adjlist_file)
dwg = dw.load_edgelist(temp_adjlist_file, undirected=True)
walks = dw.build_deepwalk_corpus(dwg, num_paths=dw_params['n'], path_length=dw_params['l'], alpha=0)
model = Word2Vec(walks, size=dw_params['d'], window=dw_params['w'], min_count=0, sg=1, hs=1, workers=dw_params['workers'])
model.wv.save_word2vec_format(embedding_file)

# Score the embedding with k-means community detection against the graph.
comdetect = CommunityDetection(embedding_file, graph_path, params={'directed': False})
score = comdetect.evaluate(num_of_communities=kmeans_num_of_communities)
print("Score: {}".format(score))
# Load the SBM node labels; fall back to latin-1 decoding for pickles
# written under Python 2.
try:
    f = open(path + 'sbm_node_labels.pickle', 'rb')
    node_colors = pickle.load(f)
except UnicodeDecodeError:
    f.seek(0)
    node_colors = pickle.load(f, encoding='latin1')

# One-hot label matrix -> flat label ids (column index of the single 1).
# node_colors appears to be a scipy sparse matrix (it has .toarray()) —
# TODO confirm against the pickling code.
node_colors_arr = [None] * node_colors.shape[0]
for idx in range(node_colors.shape[0]):
    node_colors_arr[idx] = np.where(node_colors[idx, :].toarray() == 1)[1][0]

models = ['manela']
for model in models:
    # NOTE(review): the loop variable `model` is rebound to the Word2Vec
    # instance inside the 'deepwalk' branch — harmless with this
    # single-entry list, but fragile if more model names are added.
    if model == 'deepwalk':
        gr = graph.from_networkx(G, undirected=True)
        walks = graph.build_deepwalk_corpus(gr, 10, 80, 0)
        model = Word2Vec(walks, size=128, window=10, min_count=0, sg=1, hs=0, negative=5, workers=4, iter=1)
        emb_matrix = np.zeros((len(gr), 128))
        for key in range(len(gr)):
            emb_matrix[key] = model.wv.get_vector(str(key))
    elif model == 'manela':
        # NOTE(review): the 'manela' branch appears to continue beyond this
        # chunk; only the graph conversion is visible here.
        gr = graph.from_networkx(G, undirected=True)
def evaluatePrediction(ori_graph, emb_name=['dnela'], train_ratio=0.8, sample_nodes=None, v1=[None], v2=[None]):
    """Link-prediction evaluation: split ``ori_graph`` into train/test,
    embed the train graph with each method in ``emb_name``, reconstruct
    candidate edges, and score them against the (sampled) test graph.

    Returns (MAP, precision_curve, auc) — one entry per method.

    NOTE(review): emb_name/v1/v2 are mutable default arguments (shared
    across calls); they are only read here, but None sentinels are safer.
    NOTE(review): trimed_test_graph and node_l are only assigned when
    sample_nodes is truthy and < node count; the uses further down raise
    NameError otherwise — confirm callers always pass a small sample_nodes.
    """
    #1. split the original graph to train and test. Remove edges from original graph
    #to create train graph, the complimentary part left is test graph
    #if the split train graph is not connected, return the max connected component
    print(ori_graph.order())
    print(str(ori_graph.is_connected()))
    train_graph, test_graph = graph.graph_splitter(ori_graph, train_ratio)
    if not train_graph.is_connected():
        train_graph = max(graph.weak_connected_components(train_graph), key=len)
    # Relabel nodes to the contiguous range 0..n-1 in both splits.
    train_nodes = list(train_graph.keys())
    train_nodes_dict = dict(zip(train_nodes, range(len(train_nodes))))
    train_graph = graph.re_label_nodes(train_graph, train_nodes_dict)
    test_graph = test_graph.subgraph(train_nodes)
    test_graph = graph.re_label_nodes(test_graph, train_nodes_dict)
    node_num = train_graph.order()
    print(node_num)
    # Per-method result slots.
    MAP = [None] * len(emb_name)
    precision_curve = [None] * len(emb_name)
    auc = [None] * len(emb_name)
    if sample_nodes:
        if sample_nodes < node_num:
            trimed_test_graph, node_l = graph.sample_graph(test_graph, sample_nodes)
    for k, name in enumerate(emb_name):
        #2. train embeddings using methods specified
        if name == 'manela':
            emb = ds.Distributed(train_graph)
            emb.setArgs(numUpdates=v1[k], outputPath='temp_emb.embeddings', ratio=v2[k])
            emb.process()
            emb_matrix = emb.getEmbeddings()
        elif name == 'deepwalk':
            # 10 walks of length 80 per node, 128-d skip-gram embeddings.
            walks = graph.build_deepwalk_corpus(train_graph, 10, 80, 0)
            model = Word2Vec(walks, size=128, window=10, min_count=0, sg=1, hs=0, negative=5, workers=4, iter=1)
            emb_matrix = zeros((node_num, 128))
            for key in range(node_num):
                emb_matrix[key] = model.wv.get_vector(str(key))
        elif name == 'node2vec':
            #1. transform graph format from graph to nx.Graph()
            ngraph = nx.Graph()
            for key, value in train_graph.items():
                for adj in value:
                    ngraph.add_edge(key, adj, weight=1)
            ngraph.to_undirected()
            # v1/v2 carry node2vec's p and q for this method slot.
            G = n2v.Graph(ngraph, False, v1[k], v2[k])
            G.preprocess_transition_probs()
            walks = G.simulate_walks(10, 80)
            walks = [list(map(str, walk)) for walk in walks]
            model = Word2Vec(walks, size=128, window=10, min_count=0, sg=1, hs=0, negative=5, workers=4, iter=1)
            emb_matrix = zeros((node_num, 128))
            for key in range(node_num):
                emb_matrix[key] = model.wv.get_vector(str(key))
        else:
            pass
        #3. sample some nodes for validation
        if name == 'common_neighbors':
            ori_test_graph = copy.deepcopy(test_graph)
        if name == 'manela' or name == 'deepwalk' or name == 'node2vec':
            # Keep only the rows for the sampled validation nodes.
            emb_matrix = emb_matrix[node_l]
        #4. construct node weights from embeddings
        if name == 'common_neighbors':
            result_pair_list = eu.get_edge_list_from_cn(node_l, ori_test_graph, threshold=-1)
        else:
            adj_matrix = eu.get_recontructed_adj(emb_matrix)
            result_pair_list = eu.get_edge_list_from_adj(adj_matrix, threshold=-100000)
        #filter the result edge list from those appeared in train_graph
        #NOTE: THIS STEP IS IMPORTANT SINCE train_set HERE IS COMPLETE, NOT SAMPLED WHILE test_graph
        #IS SAMPLED SO THEY HAVE DIFFRENT LABELS. THIS DICTIONARY IS FOR NODE TRANSLATION
        filtered_pair_list = [
            pair for pair in result_pair_list
            if not train_graph.has_edge(node_l[pair[0]], node_l[pair[1]])
        ]
        #5. compute MAP and precision curve
        MAP[k] = eu.compute_map(filtered_pair_list, trimed_test_graph, max_k=-1)
        precision_curve[k], _, auc[k] = eu.compute_precision_curves(
            filtered_pair_list, trimed_test_graph, max_k=1024, a=True)
    return MAP, precision_curve, auc
def getEmbeddings(self, relationships):
    """Build a graph from the py4j ``relationships`` payload, embed it
    with DeepWalk, and return all embeddings as a single string with one
    "word<TAB>v0 v1 ..." line per vertex.
    """
    G = graph.load_py4jclient(relationships)

    print("Number of nodes: {}".format(len(G.nodes())))
    num_walks = len(G.nodes()) * self.args.number_walks
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * self.args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size < self.args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(
            G, num_paths=self.args.number_walks,
            path_length=self.args.walk_length,
            alpha=0, rand=random.Random(self.args.seed))
        print("Training...")
        model = Word2Vec(walks,
                         size=self.args.representation_size,
                         window=self.args.window_size,
                         min_count=0, sg=1, hs=1,
                         workers=self.args.workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk."
              .format(data_size, self.args.max_memory_data_size))
        print("Walking...")
        walks_filebase = self.args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G, walks_filebase,
            num_paths=self.args.number_walks,
            path_length=self.args.walk_length,
            alpha=0,
            rand=random.Random(self.args.seed),
            num_workers=self.args.workers)
        print("Counting vertex frequency...")
        if not self.args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(
                walk_files, self.args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())
        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus,
                         vocabulary_counts=vertex_counts,
                         size=self.args.representation_size,
                         window=self.args.window_size,
                         min_count=0, trim_rule=None,
                         workers=self.args.workers)

    # Perf fix: assemble the payload with str.join instead of quadratic
    # `to_return = to_return + ...` concatenation inside the loop.
    lines = []
    for word, vec in zip(model.wv.vocab, model.wv.vectors):
        vector_str = " ".join([str(x) for x in vec])
        lines.append(word + "\t" + vector_str + "\n")
    to_return = "".join(lines)
    print(to_return)
    return to_return
# NOTE(review): this fragment opens mid-statement — the call these keyword
# arguments (lr / weight_decay) belong to (presumably an optimizer or model
# constructor) lies outside the visible chunk.
lr=learning_rate, weight_decay=weight_decay)
#print(model.parameters())
# Move the model and every tensor onto the GPU.
model.cuda()
features = features.cuda()
adj = adj.cuda()
labels = labels.cuda()
idx_train = idx_train.cuda()
idx_val = idx_val.cuda()
idx_test = idx_test.cuda()
# One seeded DeepWalk of length 20 per node.
G = graph.from_numpy(adjn, undirected=True)
walks_sq = graph.build_deepwalk_corpus(G, num_paths=1, path_length=20, alpha=0, rand=random.Random(0))
#print(walks.__next__())
walks_sq = np.array(walks_sq)
#print(walks.shape)
#inputs = np.empty([2708, 20,1433], dtype = int)
# Gather node features along each walk: walks[i][j] holds the feature
# vector of the j-th node on node i's walk.  2708 x 1433 — presumably the
# Cora citation graph; TODO confirm.
walks = torch.empty([2708, 20, 1433], dtype=torch.float)
walks = walks.cuda()
#print(features[walks_sq[0][0]])
for i in range(0, 2078):
    # NOTE(review): 2078 looks like a transposition of 2708 (the size
    # allocated above) — rows 2078..2707 of `walks` would remain
    # uninitialized garbage from torch.empty; confirm and fix upstream.
    for j in range(0, 20):
        walks[i][j] = features[walks_sq[i][j]]
def deepwalk_get_feature(args, adj_indices, result_path):
    """Return a trained DeepWalk model for the edge list ``adj_indices``,
    reusing a cached model at ``result_path + '.model'`` when available.

    Saves the word2vec-format features and the model next to
    ``result_path``.  Returns [] when the graph has fewer than 10 nodes.
    """
    model_path = result_path + '.model'
    if os.path.exists(model_path):
        # A previous run already trained this model; just load it.
        return Word2Vec.load(model_path)

    G = graph.load_edgelist(adj_indices, undirected=args.undirected)
    print(G)
    if len(G) < 10:
        print('输出随机游走点太少')
        return []

    node_count = len(G.nodes())
    print("Number of nodes: {}".format(node_count))

    walk_count = node_count * args.number_walks
    print("Number of walks: {}".format(walk_count))

    corpus_size = walk_count * args.walk_length
    print("Data size (walks*length): {}".format(corpus_size))

    if corpus_size < args.max_memory_data_size:
        # In-memory path: walk once, train Word2Vec directly.
        print("Walking...")
        corpus = graph.build_deepwalk_corpus(G,
                                             num_paths=args.number_walks,
                                             path_length=args.walk_length,
                                             alpha=0,
                                             rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(corpus,
                         size=args.representation_size,
                         window=args.window_size,
                         min_count=0, sg=1, hs=1,
                         workers=args.workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk."
              .format(corpus_size, args.max_memory_data_size))
        print("Walking...")
        walk_basename = args.dataset + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G, walk_basename,
            num_paths=args.number_walks,
            path_length=args.walk_length,
            alpha=0,
            rand=random.Random(args.seed),
            num_workers=args.workers)

        print("Counting vertex frequency...")
        if args.vertex_freq_degree:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())
        else:
            vertex_counts = serialized_walks.count_textfiles(
                walk_files, args.workers)

        print("Training...")
        model = Skipgram(sentences=serialized_walks.WalksCorpus(walk_files),
                         vocabulary_counts=vertex_counts,
                         size=args.representation_size,
                         window=args.window_size,
                         min_count=0, trim_rule=None,
                         workers=args.workers)

    model.wv.save_word2vec_format(result_path + '.feature')
    model.save(model_path)
    return model
def process(args):
    """Weighted-edge-list DeepWalk driver: repeatedly walks the graph and
    incrementally trains a Skipgram model, reporting progress on stderr.

    Only the 'w_edgelist' input format is supported; the disk-backed
    branch below is dead code (the condition is hard-wired to True).
    """
    #if args.format == "adjlist":
    #    G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    #elif args.format == "edgelist":
    #    G = graph.load_edgelist(args.input, undirected=args.undirected)
    #elif args.format == "mat":
    #    G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    if args.format == "w_edgelist":
        G = graph.load_weighted_edgelist(args.input, undirected=args.undirected)
    else:
        raise Exception(
            "Unknown file format: '%s'. This version supports only 'w_edgelist'"
            % args.format)
    print("Number of nodes: {}".format(len(G.nodes())))
    num_walks = len(G.nodes()) * args.number_walks
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))
    # Hard-wired to the in-memory incremental branch; the else clause is
    # the original disk-backed fallback kept for reference.
    if True:
        print("Initailizing...")
        vertex_counts = G.degree(nodes=G.iterkeys())
        #model = Word2Vec(None, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers)
        model = Skipgram(sentences=None,
                         vocabulary_counts=vertex_counts,
                         size=args.representation_size,
                         window=args.window_size,
                         min_count=0,
                         workers=args.workers,
                         sg=args.sg)
        print("Walking & Training...")
        sys.stderr.write("\rprogress: 0.00 [0/%d] %%" % (args.number_walks + 1))
        for i in xrange(args.number_walks):
            sys.stderr.write(
                "\rprogress: %.2f %% [%d/%d] (walk step)  " %
                ((i) * 100. / (args.number_walks + 1), i + 1,
                 args.number_walks + 1))
            sys.stderr.flush()
            # NOTE(review): each iteration regenerates a full corpus with
            # num_paths=args.number_walks (so number_walks^2 walks per node
            # overall) — confirm this is intended rather than num_paths=1.
            walks = graph.build_deepwalk_corpus(G,
                                                num_paths=args.number_walks,
                                                path_length=args.walk_length,
                                                alpha=0.,
                                                rand=random.Random(args.seed),
                                                workers=args.workers)
            sys.stderr.write(
                "\rprogress: %.2f %% [%d/%d] (train step) " %
                ((i + .5) * 100. / (args.number_walks + 1), i + 1,
                 args.number_walks + 1))
            sys.stderr.flush()
            #model.build_vocab(walks)
            model.train(walks)
        sys.stderr.write("\rprogress: 100.00 %%\n")
        sys.stderr.flush()
    else:
        print(
            "Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk."
            .format(data_size, args.max_memory_data_size))
        print("Walking...")
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G, walks_filebase,
            num_paths=args.number_walks,
            path_length=args.walk_length,
            alpha=0.1,
            rand=random.Random(args.seed),
            num_workers=args.workers)
        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(
                walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())
        print("Training...")
        model = Skipgram(
            sentences=serialized_walks.combine_files_iter(walk_files),
            vocabulary_counts=vertex_counts,
            size=args.representation_size,
            window=args.window_size,
            min_count=0,
            workers=args.workers)
        model.save_word2vec_format(args.output)
def main(graph_fname, node_vec_fname, options):
    '''\
    %prog [options] <graph_fname> <node_vec_fname> <path_vec_fname>

    graph_fname: the graph file
        It can be a file contained edges per line (e.g., res/karate_club_edges.txt)
        or a pickled graph file.
    node_vec_fname: the output file for nodes' vectors
    '''
    # Fix: the original mixed Python-2-only `print expr` statements with
    # print(...) calls; single-argument print() behaves identically under
    # Python 2 and 3, so all output now uses the call form.
    print('Load a road Graph...')
    # g = loader.load_a_HIN(graph_fname)
    G = graph.load_edgelist(graph_fname, undirected=True)

    print('Generate random walks...')
    print("Number of nodes: {}".format(len(G.nodes())))
    num_walks = len(G.nodes()) * options.walk_num
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * options.walk_length
    print("Data size (walks*length): {}".format(data_size))

    print("Walking...")
    # NOTE(review): walk generation is hard-seeded with Random(0), so the
    # corpus is deterministic across runs — confirm that is intended.
    walks = graph.build_deepwalk_corpus(G,
                                        num_paths=options.walk_num,
                                        path_length=options.walk_length,
                                        alpha=0,
                                        rand=random.Random(0))

    # Persist the walks twice: as a JSON dump and as space-separated
    # node-id lines (one walk per line).
    tmp_walk_fname = "tmp_walk_fname.txt"
    tmp_walk_json = "tmp_walk_fname.json"
    with open(tmp_walk_json, 'w+') as tmp_walks:
        tmp_walks.write(json.dumps(walks))
    with open(tmp_walk_fname, 'w') as f:
        for walk in walks:
            f.write('%s\n' % ' '.join(map(str, walk)))
    print("Walking done...")

    model = MP2Vec(
        size=options.dim,
        window=options.window,
        neg=options.neg,
        num_processes=options.num_processes,
        alpha=options.alpha,
        same_w=True,
        normed=False,
    )

    neighbors = None  # {node_osmid: [<node_osmid>, <node_osmid>, ...]}
    if options.correct_neg:
        # Precompute k-hop neighborhoods; presumably used by MP2Vec.train
        # to correct negative sampling — verify against MP2Vec internals.
        for id_ in G:
            G._get_k_hop_neighborhood(id_, options.window)
        neighbors = G.k_hop_neighbors[options.window]

    model.train(G, walks, k_hop_neighbors=neighbors)

    print('Dump vectors...')
    model.dump_to_file(node_vec_fname, type_='node')
    return 0
def process(args):
    """Run DeepWalk on the input graph, skipping nodes listed in
    args.excludlist, and save the embeddings in word2vec text format.

    Chooses between an in-memory corpus (small data) and walks streamed
    through files on disk (data_size >= args.max_memory_data_size).
    """
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input,
                               variable_name=args.matfile_variable_name,
                               undirected=args.undirected)
    else:
        raise Exception(
            "Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'"
            % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))

    # Optional exclusion list: one integer node id per line.
    # Fixes vs. original: the path was wrapped in a no-op format() call,
    # the file handle was leaked (bare open().readlines()), and the count
    # recomputed len(set(...)) of a value that was already a set.
    if os.path.isfile(args.excludlist):
        with open(args.excludlist) as exclud_file:
            list_exclud = set(int(line) for line in exclud_file)
        num_exlud = len(list_exclud)
    else:
        num_exlud = 0
        list_exclud = []
    if num_exlud > 0:
        print("Number of nodes excluded from the walk: {}".format(num_exlud))

    # Excluded nodes do not start walks, hence the reduced walk count.
    num_walks = (len(G.nodes()) - num_exlud) * args.number_walks
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        # Small corpus: build all walks in memory and train directly.
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, list_exclud=list_exclud,
                                            num_paths=args.number_walks,
                                            path_length=args.walk_length,
                                            alpha=0,
                                            rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks,
                         size=args.representation_size,
                         window=args.window_size,
                         min_count=0,
                         workers=args.workers)
    else:
        # Large corpus: write walks to disk and stream them into training.
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk."
              .format(data_size, args.max_memory_data_size))
        print("Walking...")
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G, list_exclud, walks_filebase,
            num_paths=args.number_walks,
            path_length=args.walk_length,
            alpha=0,
            rand=random.Random(args.seed),
            num_workers=args.workers)

        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(
                walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        model = Skipgram(
            sentences=serialized_walks.combine_files_iter(walk_files),
            vocabulary_counts=vertex_counts,
            size=args.representation_size,
            window=args.window_size,
            min_count=0,
            workers=args.workers)

    model.wv.save_word2vec_format(args.output)