def read_graph(self):
    print("read_graph\n")
    G = nx.Graph()
    matrix = graph.load_matfile('blogcatalog.mat', variable_name='network')
    if issparse(matrix):
        cx = matrix.tocoo()
        # add an edge for every non-zero entry of the adjacency matrix
        for i, j, v in zip(cx.row, cx.col, cx.data):
            G.add_edge(i, j)
    # treat the graph as unweighted: force every edge weight to 1
    for edge in G.edges():
        G[edge[0]][edge[1]]['weight'] = 1
    # G = G.to_undirected()
    return G
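# A minimal, self-contained sketch of the sparse-matrix-to-graph conversion
# used in read_graph above, on a toy SciPy matrix instead of blogcatalog.mat
# (the function name and the toy matrix are illustrative, not part of the repo).
def _demo_sparse_to_graph():
    import networkx as nx
    from scipy.sparse import coo_matrix

    # 3-node path graph (0-1-2) as a symmetric sparse adjacency matrix
    matrix = coo_matrix(([1, 1, 1, 1], ([0, 1, 1, 2], [1, 0, 2, 1])), shape=(3, 3))
    G = nx.Graph()
    for i, j, v in zip(matrix.row, matrix.col, matrix.data):
        G.add_edge(i, j, weight=1)
    return G  # edges: (0, 1), (1, 2)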
def process(args):
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * args.number_walks
    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length, alpha=0,
                                            rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size, window=args.window_size,
                         min_count=0, sg=1, hs=1, workers=args.workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk."
              .format(data_size, args.max_memory_data_size))
        print("Walking...")
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase,
                                                          num_paths=args.number_walks,
                                                          path_length=args.walk_length,
                                                          alpha=0, rand=random.Random(args.seed),
                                                          num_workers=args.workers)
        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                         size=args.representation_size, window=args.window_size,
                         min_count=0, trim_rule=None, workers=args.workers)

    model.wv.save_word2vec_format(args.output, binary=False)
    print('saved!')
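# A minimal sketch of the truncated random walks that graph.build_deepwalk_corpus
# produces: from every node, take `number_walks` uniform walks of `walk_length`
# steps. This is an illustration in plain networkx, not the repo's own
# implementation (which also supports a restart probability alpha).
def _demo_random_walks(nx_graph, number_walks=2, walk_length=5, seed=0):
    import random
    rand = random.Random(seed)
    walks = []
    for _ in range(number_walks):
        nodes = list(nx_graph.nodes())
        rand.shuffle(nodes)  # DeepWalk shuffles the start order on each pass
        for start in nodes:
            walk = [start]
            while len(walk) < walk_length:
                neighbors = list(nx_graph[walk[-1]])
                if not neighbors:
                    break  # dead end: truncate the walk
                walk.append(rand.choice(neighbors))
            walks.append([str(n) for n in walk])  # Word2Vec expects string tokens
    return walks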
def process(args):
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

    # G = graphConstruction.buildGraphAPA()

    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * args.number_walks
    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length, alpha=0,
                                            rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size, window=args.window_size,
                         min_count=0, workers=args.workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk."
              .format(data_size, args.max_memory_data_size))
        print("Walking...")
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase,
                                                          num_paths=args.number_walks,
                                                          path_length=args.walk_length,
                                                          alpha=0, rand=random.Random(args.seed),
                                                          num_workers=args.workers)
        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files),
                         vocabulary_counts=vertex_counts,
                         size=args.representation_size,
                         window=args.window_size, min_count=0, workers=args.workers)

    model.save_word2vec_format(args.output)
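# Compatibility note (assumption: the variant above targets gensim < 4.0).
# Under gensim >= 4.0 the Word2Vec keyword `size` was renamed to
# `vector_size`, and `save_word2vec_format` lives on `model.wv` rather than
# on the model itself, so the equivalent calls would look like:
#
#     model = Word2Vec(walks, vector_size=args.representation_size,
#                      window=args.window_size, min_count=0,
#                      workers=args.workers)
#     model.wv.save_word2vec_format(args.output)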
def process(args):
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * args.number_walks
    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    print("Walking...")
    start = time.time()
    walks_filebase = args.output + ".txt"
    walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase,
                                                      num_paths=args.number_walks,
                                                      path_length=args.walk_length,
                                                      alpha=0, rand=random.Random(args.seed),
                                                      num_workers=args.workers)
    # print("Counting vertex frequency...")
    # if not args.vertex_freq_degree:
    #     vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
    # else:
    #     # use degree distribution for frequency in tree
    #     vertex_counts = G.degree(nodes=G.iterkeys())
    end = time.time()
    exe_time = end - start
    print("--------- walking time: {:.5f} -----------".format(exe_time))
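# An aside on the timing above: time.time() measures wall-clock time and can
# jump if the system clock changes; time.perf_counter() is the monotonic
# choice for benchmarking. A minimal sketch (the wrapper name is hypothetical;
# fn stands in for write_walks_to_disk):
def _demo_timed(fn, *fn_args, **fn_kwargs):
    import time
    start = time.perf_counter()
    result = fn(*fn_args, **fn_kwargs)
    print("--------- elapsed: {:.5f} -----------".format(time.perf_counter() - start))
    return result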
def process(args):
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected,
                                attr_file_name=args.sensitive_attr_file,
                                test_links_ratio=args.test_links,
                                test_links_file=args.test_links_file,
                                train_links_file=args.train_links_file)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

    if args.heuristic_wrb_for_wbr is not None:
        wrb, err = graph.compute_heuristic_wrb(G, float(args.heuristic_wrb_for_wbr))
        print(wrb, err)
        return

    if (args.weighted is not None) and (args.weighted != 'unweighted'):
        G = graph.set_weights(G, args.weighted)

    if args.just_write_graph:
        with open('wgraph.out', 'w') as fout:
            if args.weighted == 'unweighted':
                for v in G:
                    s = len(G[v])
                    for u in G[v]:
                        fout.write(str(v) + ' ' + str(u) + ' ' + str(1 / s) + '\n')
            elif args.weighted.startswith('random_walk'):
                for v in G:
                    for u, w in zip(G[v], G.edge_weights[v]):
                        fout.write(str(v) + ' ' + str(u) + ' ' + str(w) + '\n')
            else:
                raise Exception('just-write-graph is not supported for this weighting method')
        return None

    num_walks = len(G.nodes()) * args.number_walks
    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length,
                                            p_modified=args.pmodified,
                                            alpha=0, rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size, window=args.window_size,
                         min_count=0, sg=1, hs=1, workers=args.workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk."
              .format(data_size, args.max_memory_data_size))
        print("Walking...")
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase,
                                                          num_paths=args.number_walks,
                                                          path_length=args.walk_length,
                                                          p_modified=args.pmodified,
                                                          alpha=0, rand=random.Random(args.seed),
                                                          num_workers=args.workers)
        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                         size=args.representation_size, window=args.window_size,
                         min_count=0, trim_rule=None, workers=args.workers)

    model.wv.save_word2vec_format(args.output)
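# A minimal sketch of the "unweighted" branch above: each edge (v, u) is
# written with weight 1/deg(v), i.e. the uniform random-walk transition
# probability out of v. Standalone with networkx; the function name is
# hypothetical, and isolated nodes are assumed absent (as in the code above,
# deg(v) = 0 would divide by zero).
def _demo_write_transition_probs(nx_graph, path='wgraph.out'):
    with open(path, 'w') as fout:
        for v in nx_graph:
            s = len(nx_graph[v])  # degree of v
            for u in nx_graph[v]:
                fout.write('{} {} {}\n'.format(v, u, 1 / s))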
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', help="input graph")
    parser.add_argument('--output', help="output result")
    parser.add_argument('--name', help="name of the method")
    parser.add_argument('--round', type=int, default=1, help="number of evaluation rounds")
    parser.add_argument('--u', type=float, help='hyper parameter 1')
    parser.add_argument('--v', type=float, help='hyper parameter 2')
    parser.add_argument('--all', action='store_true', help='validate all methods')
    args = parser.parse_args()

    num_shuffle = args.round
    ori_graph = graph.load_matfile(file_=args.input)
    ori_graph.make_undirected()
    ori_graph.make_consistent()
    train_ratio = 0.8
    sample_node = 1024
    map_round = [None] * num_shuffle
    curve_round = [None] * num_shuffle
    auc_round = [None] * num_shuffle
    print('start validating link prediction...')

    if not args.all:
        with open(args.output, 'w') as file:
            for round_id in range(num_shuffle):
                map_round[round_id], curve_round[round_id], auc_round[round_id] = evaluatePrediction(
                    ori_graph, [args.name], train_ratio, sample_node, [args.u], [args.v])
                map_round[round_id] = map_round[round_id][0]
                curve_round[round_id] = curve_round[round_id][0]
                auc_round[round_id] = auc_round[round_id][0]
                file.write(str(map_round[round_id]))
                print('MAP:{} AUC:{}'.format(map_round[round_id], auc_round[round_id]))
                for i in curve_round[round_id]:
                    file.write(" {}".format(i))
                file.write("\n")
            # summary line: mean and standard deviation of MAP over all rounds
            file.write(str(numpy.mean(map_round)) + ' ' + str(numpy.std(map_round)))
    else:
        uargs = [190, 1, 4]
        vargs = [0.4, 1, 4]
        with open(args.output, 'w') as file:
            for round_id in range(num_shuffle):
                map_round[round_id], curve_round[round_id], auc_round[round_id] = evaluatePrediction(
                    ori_graph, ['manela', 'deepwalk', 'node2vec'],
                    train_ratio, sample_node, uargs, vargs)
                # one line per method: MAP, AUC, then the precision curve
                for m, a, curve in zip(map_round[round_id], auc_round[round_id], curve_round[round_id]):
                    file.write(str(m))
                    file.write(" " + str(a))
                    for i in curve:
                        file.write(" {}".format(i))
                    file.write("\n")
    print("saved to file: {}".format(args.output))
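# A minimal sketch of the summary line written at the end of the
# single-method branch above: per-round MAP scores reduced to mean and
# standard deviation. The function name and the sample values are made up.
def _demo_summarize_rounds(map_round=(0.31, 0.29, 0.33)):
    import numpy
    return str(numpy.mean(map_round)) + ' ' + str(numpy.std(map_round))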
def main():
    '''Main method of the pre_experiment program.'''
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', help="whether to use default values", action="store_true")
    parser.add_argument('-c', help="whether to do continued training (embeddings given by --embpath)",
                        action="store_true")
    parser.add_argument('--embpath', help="the embedding path for continued training")
    parser.add_argument('--path', help="path of the graph")
    parser.add_argument('--output', help="output path of the embeddings")
    parser.add_argument('--dimension', type=int, default=128, help="dimension of embeddings")
    parser.add_argument('--updates', type=int, help="number of updates")
    parser.add_argument('--alpha', type=float, default=0.025, help="initial learning rate")
    parser.add_argument('--negative', type=int, default=5, help="negative sampling number")
    parser.add_argument('--neglen', type=int, default=10**8,
                        help="the maximum table size used for negative sampling")
    parser.add_argument('--ratio', type=float,
                        help="the ratio of the numbers of 1st and 2nd degree nodes")
    parser.add_argument('--fmax', default=1, type=float,
                        help="the maximum ratio of the # of 1st deg nodes participating in updates "
                             "to the total # of 1st deg nodes, when ratio=0.5")
    # parser.add_argument('-p', help="whether to use poisson process", action="store_true")
    parser.add_argument('--window', type=int, help="the update window in poisson update mode")
    parser.add_argument('--timeslot', default=1000, type=int,
                        help="the number of timeslots used to simulate a poisson process")
    parser.add_argument('--seed', default=1, type=int,
                        help="the random seed of a Distributed instance")
    args = parser.parse_args()

    G = graph.load_matfile(file_=args.path)
    d = ds.Distributed(G)
    if args.d:
        print('using default settings')
        d.defaultArgs()
    else:
        d.setArgs(alpha=args.alpha,
                  numUpdates=args.updates,
                  numNegSampling=args.negative,
                  maxNeglen=args.neglen,
                  representSize=args.dimension,
                  outputPath=args.output,
                  ratio=args.ratio,
                  fmax=args.fmax,
                  c=args.c,
                  cpath=args.embpath,
                  poisson=True,
                  window=args.window,
                  timeslot=args.timeslot,
                  seed=args.seed)
    d.process()
    d.save2File()
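# A minimal sketch of simulating event arrivals over discrete timeslots, since
# --timeslot above is documented as simulating a Poisson process. How the
# Distributed class uses it internally is not shown here; this only
# illustrates the standard trick of drawing per-slot event counts, and the
# function name and rate are made up.
def _demo_poisson_slots(rate_per_slot=0.5, timeslot=1000, seed=1):
    import numpy
    rng = numpy.random.RandomState(seed)
    counts = rng.poisson(rate_per_slot, size=timeslot)  # events in each slot
    return counts.sum()  # total events across all slots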
def process(args):
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))

    # format() coerces a missing (None) path to the string 'None', which isfile() rejects safely
    if os.path.isfile(format(args.excludlist)):
        list_exclud = open(args.excludlist).readlines()
        list_exclud = set(int(x) for x in list_exclud)
        num_exlud = len(list_exclud)
    else:
        num_exlud = 0
        list_exclud = []

    if num_exlud > 0:
        print("Number of nodes excluded from the walk: {}".format(num_exlud))

    num_walks = (len(G.nodes()) - num_exlud) * args.number_walks
    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, list_exclud=list_exclud,
                                            num_paths=args.number_walks,
                                            path_length=args.walk_length, alpha=0,
                                            rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size, window=args.window_size,
                         min_count=0, workers=args.workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk."
              .format(data_size, args.max_memory_data_size))
        print("Walking...")
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, list_exclud, walks_filebase,
                                                          num_paths=args.number_walks,
                                                          path_length=args.walk_length,
                                                          alpha=0, rand=random.Random(args.seed),
                                                          num_workers=args.workers)
        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files),
                         vocabulary_counts=vertex_counts,
                         size=args.representation_size,
                         window=args.window_size, min_count=0, workers=args.workers)

    model.wv.save_word2vec_format(args.output)
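# A minimal sketch of the exclusion-list parsing above, with one robustness
# tweak: blank lines are skipped before int() conversion. The file format
# (one integer node id per line) matches what the code above expects; the
# function name is hypothetical.
def _demo_load_exclusion_list(path):
    with open(path) as f:
        return {int(line) for line in f if line.strip()}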