def load_graph(self, input_address, output_name="g1_out.embeddings", number_walks=10,
               walk_length=40, max_memory_data_size=1000000000,
               matfile_variable_name="network", format='adjlist', undirected=True,
               representation_size=16, workers=1, window_size=5,
               vertex_freq_degree=False, seed=0):
    """Load a graph, run DeepWalk, and save node embeddings.

    Small corpora are walked and trained in memory; larger ones are streamed
    through walk files on disk. Embeddings are written to
    ``./dataset/<output_name>`` in word2vec text format.

    :param input_address: path to the input graph file.
    :param output_name: basename for the embeddings (and on-disk walk files).
    :param number_walks: walks started per node.
    :param walk_length: nodes per walk.
    :param max_memory_data_size: walks*length threshold before spilling to disk.
    :param matfile_variable_name: variable name inside a .mat input file.
    :param format: one of 'adjlist', 'edgelist', 'mat'.
    :param undirected: treat edges as undirected.
    :param representation_size: embedding dimensionality.
    :param workers: parallel worker count.
    :param window_size: skip-gram context window.
    :param vertex_freq_degree: use node degrees (not walk counts) as vocab frequencies.
    :param seed: RNG seed for walk generation.
    :raises Exception: on an unknown ``format``.
    """
    if format == "adjlist":
        G = graph.load_adjacencylist(input_address, undirected=undirected)
    elif format == "edgelist":
        G = graph.load_edgelist(input_address, undirected=undirected)
    elif format == "mat":
        G = graph.load_matfile(input_address, variable_name=matfile_variable_name,
                               undirected=undirected)
    else:
        raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'" % format)

    print("Number of nodes: {}".format(len(G.nodes())))
    num_walks = len(G.nodes()) * number_walks
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size < max_memory_data_size:
        # In-memory path: build the whole walk corpus, then train skip-gram
        # with hierarchical softmax (sg=1, hs=1).
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=number_walks,
                                            path_length=walk_length, alpha=0,
                                            rand=random.Random(seed))
        print("Training...")
        model = Word2Vec(walks, size=representation_size, window=window_size,
                         min_count=0, sg=1, hs=1, workers=workers)
    else:
        # BUG FIX: this message previously contained a raw newline inside the
        # string literal, which is a SyntaxError; reconstructed on one line.
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(
            data_size, max_memory_data_size))
        print("Walking...")
        walks_filebase = output_name + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase,
                                                          num_paths=number_walks,
                                                          path_length=walk_length,
                                                          alpha=0,
                                                          rand=random.Random(seed),
                                                          num_workers=workers)
        print("Counting vertex frequency...")
        if not vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, workers)
        else:
            # use degree distribution for frequency in tree
            # NOTE(review): G.iterkeys() is Python 2 only — confirm runtime.
            vertex_counts = G.degree(nodes=G.iterkeys())
        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                         size=representation_size, window=window_size, min_count=0,
                         trim_rule=None, workers=workers)

    model.wv.save_word2vec_format("./dataset/{}".format(output_name))
def process(args):
    """DeepWalk driver: load a graph, generate random walks, train node
    embeddings, and write them to ``args.output``.

    Corpora that fit under ``args.max_memory_data_size`` are walked and
    trained in memory; larger ones are streamed through walk files on disk.
    """
    # Dispatch table keeps the loader choice in one place.
    loaders = {
        "adjlist": lambda: graph.load_adjacencylist(args.input, undirected=args.undirected),
        "edgelist": lambda: graph.load_edgelist(args.input, undirected=args.undirected),
        "mat": lambda: graph.load_matfile(args.input,
                                          variable_name=args.matfile_variable_name,
                                          undirected=args.undirected),
    }
    if args.format not in loaders:
        raise Exception(
            "Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)
    G = loaders[args.format]()

    node_total = len(G.nodes())
    print("Number of nodes: {}".format(node_total))
    num_walks = node_total * args.number_walks
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        # Everything fits: build the walk corpus in memory and train directly.
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length, alpha=0,
                                            rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size,
                         window=args.window_size, min_count=0, workers=args.workers)
    else:
        # Too big for memory: dump walks to disk and stream them into Skipgram.
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(
            data_size, args.max_memory_data_size))
        print("Walking...")
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase,
                                                          num_paths=args.number_walks,
                                                          path_length=args.walk_length,
                                                          alpha=0,
                                                          rand=random.Random(args.seed),
                                                          num_workers=args.workers)
        print("Counting vertex frequency...")
        if args.vertex_freq_degree:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())
        else:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        print("Training...")
        model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files),
                         vocabulary_counts=vertex_counts, size=args.representation_size,
                         window=args.window_size, min_count=0, workers=args.workers)

    model.save_word2vec_format(args.output)
def _read_input_graph(args):
    """Load the input graph in whichever supported format was requested."""
    if args.format == "adjlist":
        return graph.load_adjacencylist(args.input, undirected=args.undirected)
    if args.format == "edgelist":
        return graph.load_edgelist(args.input, undirected=args.undirected)
    if args.format == "mat":
        return graph.load_matfile(args.input, variable_name=args.matfile_variable_name,
                                  undirected=args.undirected)
    raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)


def process(args):
    """End-to-end DeepWalk: read graph, walk, train, save embeddings.

    Chooses between an in-memory corpus and disk-backed walk files based on
    the estimated corpus size versus ``args.max_memory_data_size``.
    """
    G = _read_input_graph(args)

    n_nodes = len(G.nodes())
    print("Number of nodes: {}".format(n_nodes))
    num_walks = n_nodes * args.number_walks
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size >= args.max_memory_data_size:
        # Disk-backed path: serialize walks, then stream them into Skipgram.
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(data_size, args.max_memory_data_size))
        print("Walking...")
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G, walks_filebase, num_paths=args.number_walks,
            path_length=args.walk_length, alpha=0,
            rand=random.Random(args.seed), num_workers=args.workers)
        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())
        print("Training...")
        model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files),
                         vocabulary_counts=vertex_counts,
                         size=args.representation_size, window=args.window_size,
                         min_count=0, workers=args.workers)
    else:
        # In-memory path: whole corpus at once.
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length, alpha=0,
                                            rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size,
                         window=args.window_size, min_count=0, workers=args.workers)

    model.save_word2vec_format(args.output)
def process(args):
    """Weighted-edgelist DeepWalk variant that interleaves walk generation
    and incremental training instead of building one big corpus up front.

    Only the 'w_edgelist' input format is supported; progress is reported on
    stderr. NOTE: uses ``xrange``/``dict.iterkeys`` — Python 2 only.
    """
    # Loaders for adjlist/edgelist/mat are intentionally disabled in this variant.
    if args.format == "w_edgelist":
        G = graph.load_weighted_edgelist(args.input, undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'. This version supports only 'w_edgelist'" % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))
    num_walks = len(G.nodes()) * args.number_walks
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if True:  # in-memory path is forced on; the disk branch below is dead code kept from upstream
        print("Initailizing...")
        # Vocabulary frequencies come from node degrees (walks don't exist yet).
        vertex_counts = G.degree(nodes=G.iterkeys())
        model = Skipgram(sentences=None, vocabulary_counts=vertex_counts,
                         size=args.representation_size, window=args.window_size,
                         min_count=0, workers=args.workers, sg=args.sg)
        print("Walking & Training...")
        sys.stderr.write("\rprogress: 0.00 [0/%d] %%" % (args.number_walks + 1))
        for i in xrange(args.number_walks):
            sys.stderr.write("\rprogress: %.2f %% [%d/%d] (walk step) "
                             % ((i) * 100. / (args.number_walks + 1), i + 1,
                                args.number_walks + 1))
            sys.stderr.flush()
            # NOTE(review): each iteration regenerates num_paths=args.number_walks
            # walks, so number_walks**2 walks are trained in total — confirm intended.
            walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                                path_length=args.walk_length, alpha=0.,
                                                rand=random.Random(args.seed),
                                                workers=args.workers)
            sys.stderr.write("\rprogress: %.2f %% [%d/%d] (train step) "
                             % ((i + .5) * 100. / (args.number_walks + 1), i + 1,
                                args.number_walks + 1))
            sys.stderr.flush()
            #model.build_vocab(walks)
            model.train(walks)
        sys.stderr.write("\rprogress: 100.00 %%\n")
        sys.stderr.flush()
    else:
        # BUG FIX: this message previously contained a raw newline inside the
        # string literal, which is a SyntaxError; reconstructed on one line.
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(
            data_size, args.max_memory_data_size))
        print("Walking...")
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase,
                                                          num_paths=args.number_walks,
                                                          path_length=args.walk_length,
                                                          alpha=0.1,
                                                          rand=random.Random(args.seed),
                                                          num_workers=args.workers)
        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())
        print("Training...")
        model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files),
                         vocabulary_counts=vertex_counts, size=args.representation_size,
                         window=args.window_size, min_count=0, workers=args.workers)

    model.save_word2vec_format(args.output)
def process(edges_list, undirected=True, number_walks=10, walk_length=40, window_size=5,
            workers=1, dimensions=64, max_memory_data_size=1000000000, seed=0,
            vertex_freq_degree=False, output_name="karate.embeddings"):
    """Run DeepWalk on an edge-list file and return the trained model.

    Generalized: ``output_name`` (new, defaulted for backward compatibility)
    replaces the hard-coded "karate.embeddings" basename used for on-disk
    walk files, so this can run on graphs other than karate without filename
    collisions.

    :param edges_list: path to the edge-list input file.
    :param undirected: treat edges as undirected.
    :param number_walks: walks started per node.
    :param walk_length: nodes per walk.
    :param window_size: skip-gram context window.
    :param workers: parallel worker count.
    :param dimensions: embedding dimensionality.
    :param max_memory_data_size: walks*length threshold before spilling to disk.
    :param seed: RNG seed for walk generation.
    :param vertex_freq_degree: use node degrees (not walk counts) as vocab frequencies.
    :param output_name: basename used for the on-disk walk files.
    :return: trained Word2Vec (in-memory path) or Skipgram (disk path) model.
    """
    G = graph.load_edgelist(edges_list, undirected=undirected)
    num_walks = len(G.nodes()) * number_walks
    data_size = num_walks * walk_length

    if data_size < max_memory_data_size:
        # In-memory path: build the whole corpus, then train.
        walks = graph.build_deepwalk_corpus(G, num_paths=number_walks,
                                            path_length=walk_length, alpha=0,
                                            rand=random.Random(seed))
        model = Word2Vec(walks, size=dimensions, window=window_size,
                         min_count=0, workers=workers)
    else:
        # Disk-backed path: serialize walks and stream them into Skipgram.
        walks_filebase = output_name + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G, walks_filebase, num_paths=number_walks, path_length=walk_length,
            alpha=0, rand=random.Random(seed), num_workers=workers)
        if not vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())
        model = Skipgram(
            sentences=serialized_walks.combine_files_iter(walk_files),
            vocabulary_counts=vertex_counts, size=dimensions,
            window=window_size, min_count=0, workers=workers)
    return model
def run_dw(matrix, num_walks=100, walk_length=5, representation_size=32, window_size=2,
           undirected=True, seed=0, workers=1):
    """Run DeepWalk on an adjacency matrix and return an embeddings array.

    :param matrix: iterable of rows with a ``.nonzero()`` method (e.g. a numpy
        2-D array or scipy sparse rows) — row i's nonzero columns are node i's
        neighbors.
    :param num_walks: walks started per node.
    :param walk_length: nodes per walk.
    :param representation_size: embedding dimensionality.
    :param window_size: skip-gram context window.
    :param undirected: symmetrize the graph.
    :param seed: RNG seed (also seeds global random/numpy state).
    :param workers: parallel worker count.
    :return: numpy array of shape (n_nodes, representation_size).
    """
    random.seed(seed)
    np.random.seed(seed)
    # Row index becomes the node id; neighbors are the row's nonzero columns.
    adj_list = []
    for n, edges in enumerate(matrix):
        adj_list.append([n] + edges.nonzero()[0].tolist())
    # (removed leftover debug print of the full adjacency list)
    G = graph.from_adjlist(adj_list)
    if undirected:
        G.make_undirected()

    print("Number of nodes: {}".format(len(G.nodes())))
    # BUG FIX: the total was previously stored back into `num_walks` and then
    # passed as num_paths, generating nodes*num_walks walks PER NODE instead
    # of num_walks per node (quadratic blow-up vs. every other variant).
    total_walks = len(G.nodes()) * num_walks
    print("Number of walks: {}".format(total_walks))
    data_size = total_walks * walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size < 1000000000:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=num_walks,
                                            path_length=walk_length, alpha=0,
                                            rand=random.Random(seed))
        print("Training...")
        model = Word2Vec(walks, size=representation_size, window=window_size,
                         min_count=0, sg=1, hs=1, workers=workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk."
              .format(data_size, 1000000000))
        print("Walking...")
        # BUG FIX: the walk-file basename was str(adj_list) — the repr of the
        # entire adjacency list — which is not a usable filename.
        walks_filebase = "run_dw.walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G, walks_filebase, num_paths=num_walks, path_length=walk_length,
            alpha=0, rand=random.Random(seed), num_workers=workers)
        print("Counting vertex frequency...")
        vertex_counts = serialized_walks.count_textfiles(walk_files, workers)
        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                         size=representation_size, window=window_size, min_count=0,
                         trim_rule=None, workers=workers, seed=seed)

    # Materialize the learned vectors in node-id order.
    embeddings = np.zeros((len(G.nodes()), representation_size))
    for i in range(len(G.nodes())):
        embeddings[i] = model.wv.get_vector(str(i))
    return embeddings
def process(params, save=True):
    """Run DeepWalk driven by a parameter dict.

    :param params: dict of training parameters (input/output paths, walk and
        model hyper-parameters such as number_walks, walk_length,
        representation_size, window_size, workers, seed, ...).
    :param save: if True, save the trained embeddings to params["output"];
        otherwise load and return previously saved embeddings.
    :return: loaded word vectors when ``save`` is False, else None.
    :raises Exception: on an unknown input format.
    """
    if params["format"] == "adjlist":
        G = graph.load_adjacencylist(params["input"], undirected=params["undirected"])
    elif params["format"] == "edgelist":
        G = graph.load_edgelist(params["input"], undirected=params["undirected"])
    elif params["format"] == "mat":
        G = graph.load_matfile(params["input"],
                               variable_name=params["matfile_variable_name"],
                               undirected=params["undirected"])
    else:
        print("输入格式有误,当前输入格式为 %s" % (params["format"]))
        # BUG FIX: the message previously rendered as "...'edgelist', mat" —
        # the quotes around 'mat' were lost by accidental string splitting.
        raise Exception(
            "Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', "
            "'mat'" % params["format"])

    print("Number of node :{}".format(len(G.nodes())))
    num_walks = len(G.nodes()) * params["number_walks"]
    print("Number of walks:{}".format(num_walks))
    data_size = num_walks * params["walk_length"]
    print("Data size (walks*length):{}".format(data_size))

    if data_size < params["max_memory_data_size"]:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(
            G, num_paths=params.get("number_walks", 10),
            path_length=params.get("walk_length", 40),
            alpha=params.get("alpha", 0),
            rand=random.Random(params.get("seed", 0)))
        print("Training...")
        # BUG FIX: key was misspelled "window_siz", so a caller-supplied
        # window size was silently ignored and the default always used.
        model = Word2Vec(walks, size=params.get("representation_size", 64),
                         window=params.get("window_size", 5),
                         min_count=params.get("min_count", 0),
                         sg=params.get("sg", 1), hs=params.get("hs", 1),
                         workers=params.get("workers", 1))
    else:
        # BUG FIX: this message previously contained a raw newline inside the
        # string literal (a SyntaxError); reconstructed on one line with the
        # spacing/typo ("t disk") repaired.
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk."
              .format(data_size, params.get("max_memory_data_size")))
        print("walking...")
        walks_filebase = params["output"] + ".walks"
        walks_files = wk.write_walks_to_disk(
            G, walks_filebase, num_paths=params.get("number_walks", 10),
            path_length=params.get("walk_length", 40),
            alpha=params.get("alpha", 0),
            rand=random.Random(params.get("seed", 0)),
            num_workers=params.get("workers", 1))
        print("Counting vertex frequecy...")
        # BUG FIX: the branch was inverted — vertex_freq_degree=True is
        # supposed to mean "use the degree distribution as frequencies"
        # (as in every other variant), not "count walk files".
        if not params["vertex_freq_degree"]:
            vertex_counts = wk.count_textfiles(walks_files, params["workers"])
        else:
            # use degree distribution for vertex frequency
            vertex_counts = G.degree(nodes=G.iterkeys())
        print("Training...")
        walks_corpus = wk.WalksCorpus(walks_files)  # streamed walk corpus
        # BUG FIX: key was misspelled "windows_size" (with a stray default of
        # 80), so the real window size was never read; default now matches
        # the in-memory branch.
        model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                         size=params.get("representation_size"),
                         window=params.get("window_size", 5),
                         min_count=params.get("min_count", 0),
                         trim_rule=params.get("trim_rule", None),
                         workers=params.get("workers", 8))

    if save:
        model.wv.save_word2vec_format(params["output"])  # persist embeddings
    else:
        models = model.wv.load_word2vec_format(params["output"])  # reload embeddings
        return models
def process(args):
    """Weighted-edgelist DeepWalk variant with interleaved walking/training.

    Functionally the same design as the other 'w_edgelist' variant in this
    file: build the model from degree frequencies first, then repeatedly walk
    and train. NOTE: uses ``xrange``/``dict.iterkeys`` — Python 2 only.
    """
    # Loaders for adjlist/edgelist/mat are intentionally disabled in this variant.
    if args.format == "w_edgelist":
        G = graph.load_weighted_edgelist(args.input, undirected=args.undirected)
    else:
        raise Exception(
            "Unknown file format: '%s'. This version supports only 'w_edgelist'" % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))
    num_walks = len(G.nodes()) * args.number_walks
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if True:  # in-memory path is forced on; the disk branch below is dead code kept from upstream
        print("Initailizing...")
        # Vocabulary frequencies come from node degrees (walks don't exist yet).
        vertex_counts = G.degree(nodes=G.iterkeys())
        model = Skipgram(sentences=None, vocabulary_counts=vertex_counts,
                         size=args.representation_size, window=args.window_size,
                         min_count=0, workers=args.workers, sg=args.sg)
        print("Walking & Training...")
        sys.stderr.write("\rprogress: 0.00 [0/%d] %%" % (args.number_walks + 1))
        for i in xrange(args.number_walks):
            sys.stderr.write(
                "\rprogress: %.2f %% [%d/%d] (walk step) " %
                ((i) * 100. / (args.number_walks + 1), i + 1, args.number_walks + 1))
            sys.stderr.flush()
            # NOTE(review): each iteration regenerates num_paths=args.number_walks
            # walks, so number_walks**2 walks are trained in total — confirm intended.
            walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                                path_length=args.walk_length, alpha=0.,
                                                rand=random.Random(args.seed),
                                                workers=args.workers)
            sys.stderr.write(
                "\rprogress: %.2f %% [%d/%d] (train step) " %
                ((i + .5) * 100. / (args.number_walks + 1), i + 1, args.number_walks + 1))
            sys.stderr.flush()
            #model.build_vocab(walks)
            model.train(walks)
        sys.stderr.write("\rprogress: 100.00 %%\n")
        sys.stderr.flush()
    else:
        # BUG FIX: this message previously contained a raw newline inside the
        # string literal, which is a SyntaxError; reconstructed on one line.
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk."
              .format(data_size, args.max_memory_data_size))
        print("Walking...")
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G, walks_filebase, num_paths=args.number_walks,
            path_length=args.walk_length, alpha=0.1,
            rand=random.Random(args.seed), num_workers=args.workers)
        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())
        print("Training...")
        model = Skipgram(
            sentences=serialized_walks.combine_files_iter(walk_files),
            vocabulary_counts=vertex_counts, size=args.representation_size,
            window=args.window_size, min_count=0, workers=args.workers)

    model.save_word2vec_format(args.output)
def process(args): # Build "(Node, Layer)" map if args.floor != "": floorFile = open(args.floor, 'r') for line in floorFile: nd, layer = line.strip().split()[:2] nd = int(nd) layer = int(layer) #print nd, layer if nd not in graph.Graph.nodePos: graph.Graph.nodeList.append(graph.NodeType(nd,layer)) graph.Graph.nodePos[nd] = len(graph.Graph.nodeList)-1 # read input Graph if args.format == "adjlist": G = graph.load_adjacencylist(args.input, undirected=args.undirected) elif args.format == "edgelist": G = graph.load_edgelist(args.input, undirected=args.undirected) elif args.format == "mat": G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected) else: raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format) timelog = "" print("Number of nodes: {}".format(len(G.nodes()))) num_walks = len(G.nodes()) * args.number_walks print("Number of walks: {}".format(num_walks)) data_size = num_walks * args.walk_length print("Data size (walks*length): {}".format(data_size)) # Centrality calculation >> store in File ''' centrality = nxGraph(args.input) print centrality fo = open("closeness.txt","wb") for k in centrality.keys(): fo.write("{} {}\n".format(k,centrality[k])) fo.close() ''' #exit() lsfile = open(args.LSfile, 'r') calculateBC(lsfile) #exit() #building (Unit)Metapath Table MPList = [] graph.Graph.mpath = [] if args.metapath != "": mpfile = open(args.metapath, 'r') for line in mpfile: MPList.append(int(line.strip().split()[0])) print "(Unit)Metapath: {}".format(MPList) while len(graph.Graph.mpath) < args.walk_length: graph.Graph.mpath.extend(MPList) args.walk_length = len(graph.Graph.mpath) print "(Full)Metapath: {}\nargs.walk_length: {}".format(graph.Graph.mpath, args.walk_length) tStart = time.time() if data_size < args.max_memory_data_size: print("Walking...") walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks, path_length=args.walk_length, alpha=0, 
rand=random.Random()) tEnd = time.time() print "Walking takes {} seconds".format(round(tEnd - tStart, 3)) timelog = "{}, {}".format( timelog, round(tEnd-tStart, 3) ) print "Number of walks generated: {}".format(len(walks)) tStart = time.time() print("Training...") model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers) tEnd = time.time() print "Training takes {} seconds".format(round(tEnd - tStart, 3)) timelog = "{}, {}, ,{}".format( timelog, round(tEnd-tStart, 3), len(walks) ) else: print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(data_size, args.max_memory_data_size)) print("Walking...") walks_filebase = args.output + ".walks" walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks, path_length=args.walk_length, alpha=0, rand=random.Random(args.seed), num_workers=args.workers) print("Counting vertex frequency...") if not args.vertex_freq_degree: vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers) else: # use degree distribution for frequency in tree vertex_counts = G.degree(nodes=G.iterkeys()) print("Training...") model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files), vocabulary_counts=vertex_counts, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers) model.save_word2vec_format(args.output) with open(args.output, 'r') as f: timelog = "{}, {}\n".format( timelog, f.readline().split()[0] ) with open(args.timelog, 'ab') as tl: tl.write(timelog)