def process(args): if args.format == "adjlist": G = graph.load_adjacencylist(args.input, undirected=args.undirected) elif args.format == "edgelist": G = graph.load_edgelist(args.input, undirected=args.undirected) elif args.format == "mat": G = graph.load_matfile( args.input, variable_name=args.matfile_variable_name, undirected=args.undirected) else: raise Exception( "Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format) print("Number of nodes: {}".format(len(G.nodes()))) num_walks = len(G.nodes()) * args.number_walks print("Number of walks: {}".format(num_walks)) data_size = num_walks * args.walk_length print("Data size (walks*length): {}".format(data_size)) if data_size < args.max_memory_data_size: print("Walking...") walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks, path_length=args.walk_length, alpha=0, rand=random.Random(args.seed)) print("Training...") model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers) else: print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format( data_size, args.max_memory_data_size)) print("Walking...") walks_filebase = args.output + ".walks" walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks, path_length=args.walk_length, alpha=0, rand=random.Random(args.seed), num_workers=args.workers) print("Counting vertex frequency...") if not args.vertex_freq_degree: vertex_counts = serialized_walks.count_textfiles( walk_files, args.workers) else: # use degree distribution for frequency in tree vertex_counts = G.degree(nodes=G.iterkeys()) print("Training...") model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files), vocabulary_counts=vertex_counts, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers) model.save_word2vec_format(args.output)
def process(args): if args.format == "adjlist": G = graph.load_adjacencylist(args.input, undirected=args.undirected) elif args.format == "edgelist": G = graph.load_edgelist(args.input, undirected=args.undirected) elif args.format == "mat": G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected) else: raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format) print("Number of nodes: {}".format(len(G.nodes()))) num_walks = len(G.nodes()) * args.number_walks print("Number of walks: {}".format(num_walks)) data_size = num_walks * args.walk_length print("Data size (walks*length): {}".format(data_size)) if data_size < args.max_memory_data_size: print("Walking...") walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks, path_length=args.walk_length, alpha=0, rand=random.Random(args.seed)) print("Training...") model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers) else: print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(data_size, args.max_memory_data_size)) print("Walking...") walks_filebase = args.output + ".walks" walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks, path_length=args.walk_length, alpha=0, rand=random.Random(args.seed), num_workers=args.workers) print("Counting vertex frequency...") if not args.vertex_freq_degree: vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers) else: # use degree distribution for frequency in tree vertex_counts = G.degree(nodes=G.iterkeys()) print("Training...") model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files), vocabulary_counts=vertex_counts, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers) model.save_word2vec_format(args.output)
def process(args): #if args.format == "adjlist": # G = graph.load_adjacencylist(args.input, undirected=args.undirected) #elif args.format == "edgelist": # G = graph.load_edgelist(args.input, undirected=args.undirected) #elif args.format == "mat": # G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected) if args.format == "w_edgelist": G = graph.load_weighted_edgelist(args.input, undirected=args.undirected) else: raise Exception("Unknown file format: '%s'. This version supports only 'w_edgelist'" % args.format) print("Number of nodes: {}".format(len(G.nodes()))) num_walks = len(G.nodes()) * args.number_walks print("Number of walks: {}".format(num_walks)) data_size = num_walks * args.walk_length print("Data size (walks*length): {}".format(data_size)) if True: print("Initailizing...") vertex_counts = G.degree(nodes=G.iterkeys()) #model = Word2Vec(None, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers) model = Skipgram(sentences=None, vocabulary_counts=vertex_counts, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers, sg=args.sg) print("Walking & Training...") sys.stderr.write("\rprogress: 0.00 [0/%d] %%" % (args.number_walks+1)) for i in xrange(args.number_walks): sys.stderr.write("\rprogress: %.2f %% [%d/%d] (walk step) " % ((i)*100./(args.number_walks+1), i+1, args.number_walks+1)) sys.stderr.flush() walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks, path_length=args.walk_length, alpha=0., rand=random.Random(args.seed), workers=args.workers) sys.stderr.write("\rprogress: %.2f %% [%d/%d] (train step) " % ((i+.5)*100./(args.number_walks+1), i+1, args.number_walks+1)) sys.stderr.flush() #model.build_vocab(walks) model.train(walks) sys.stderr.write("\rprogress: 100.00 %%\n") sys.stderr.flush() else: print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(data_size, args.max_memory_data_size)) print("Walking...") walks_filebase = args.output + ".walks" walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks, path_length=args.walk_length, alpha=0.1, rand=random.Random(args.seed), num_workers=args.workers) print("Counting vertex frequency...") if not args.vertex_freq_degree: vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers) else: # use degree distribution for frequency in tree vertex_counts = G.degree(nodes=G.iterkeys()) print("Training...") model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files), vocabulary_counts=vertex_counts, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers) model.save_word2vec_format(args.output)
def process(edges_list, undirected=True, number_walks=10, walk_length=40,
            window_size=5, workers=1, dimensions=64,
            max_memory_data_size=1000000000, seed=0, vertex_freq_degree=False):
    G = graph.load_edgelist(edges_list, undirected=undirected)
    #print("Number of nodes: {}".format(len(G.nodes())))
    num_walks = len(G.nodes()) * number_walks
    #print("Number of walks: {}".format(num_walks))
    data_size = num_walks * walk_length
    #print("Data size (walks*length): {}".format(data_size))

    if data_size < max_memory_data_size:
        # print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=number_walks,
                                            path_length=walk_length, alpha=0,
                                            rand=random.Random(seed))
        # print("Training...")
        model = Word2Vec(walks, size=dimensions, window=window_size,
                         min_count=0, workers=workers)
    else:
        # print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(data_size, max_memory_data_size))
        # print("Walking...")
        walks_filebase = "karate.embeddings" + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase,
                                                          num_paths=number_walks,
                                                          path_length=walk_length,
                                                          alpha=0,
                                                          rand=random.Random(seed),
                                                          num_workers=workers)
        # print("Counting vertex frequency...")
        if not vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())
        # print("Training...")
        model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files),
                         vocabulary_counts=vertex_counts, size=dimensions,
                         window=window_size, min_count=0, workers=workers)

    #model.save_word2vec_format("karate.embeddings")
    return model
# Note: `walk_files` is a free variable here; this helper is meant to be
# defined where the list of walk files from write_walks_to_disk is in scope.
def generator():
    return serialized_walks.combine_files_iter(walk_files)
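A usage sketch for the keyword-argument variant of process() above. The
"karate.edgelist" filename and the node id "1" are placeholders; the vector
lookups rely on the old gensim Word2Vec API that these snippets already use,
where walks are tokenized as node-id strings.

# Hypothetical usage of the keyword-argument process() variant.
model = process("karate.edgelist", number_walks=10, walk_length=40,
                dimensions=64, workers=4)

# Old gensim API: embeddings are indexed by string tokens.
vector = model["1"]                        # 64-dimensional embedding for node 1
similar = model.most_similar("1", topn=5)  # nearest nodes in embedding space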
def process(args): #if args.format == "adjlist": # G = graph.load_adjacencylist(args.input, undirected=args.undirected) #elif args.format == "edgelist": # G = graph.load_edgelist(args.input, undirected=args.undirected) #elif args.format == "mat": # G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected) if args.format == "w_edgelist": G = graph.load_weighted_edgelist(args.input, undirected=args.undirected) else: raise Exception( "Unknown file format: '%s'. This version supports only 'w_edgelist'" % args.format) print("Number of nodes: {}".format(len(G.nodes()))) num_walks = len(G.nodes()) * args.number_walks print("Number of walks: {}".format(num_walks)) data_size = num_walks * args.walk_length print("Data size (walks*length): {}".format(data_size)) if True: print("Initailizing...") vertex_counts = G.degree(nodes=G.iterkeys()) #model = Word2Vec(None, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers) model = Skipgram(sentences=None, vocabulary_counts=vertex_counts, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers, sg=args.sg) print("Walking & Training...") sys.stderr.write("\rprogress: 0.00 [0/%d] %%" % (args.number_walks + 1)) for i in xrange(args.number_walks): sys.stderr.write( "\rprogress: %.2f %% [%d/%d] (walk step) " % ((i) * 100. / (args.number_walks + 1), i + 1, args.number_walks + 1)) sys.stderr.flush() walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks, path_length=args.walk_length, alpha=0., rand=random.Random(args.seed), workers=args.workers) sys.stderr.write( "\rprogress: %.2f %% [%d/%d] (train step) " % ((i + .5) * 100. / (args.number_walks + 1), i + 1, args.number_walks + 1)) sys.stderr.flush() #model.build_vocab(walks) model.train(walks) sys.stderr.write("\rprogress: 100.00 %%\n") sys.stderr.flush() else: print( "Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk." .format(data_size, args.max_memory_data_size)) print("Walking...") walks_filebase = args.output + ".walks" walk_files = serialized_walks.write_walks_to_disk( G, walks_filebase, num_paths=args.number_walks, path_length=args.walk_length, alpha=0.1, rand=random.Random(args.seed), num_workers=args.workers) print("Counting vertex frequency...") if not args.vertex_freq_degree: vertex_counts = serialized_walks.count_textfiles( walk_files, args.workers) else: # use degree distribution for frequency in tree vertex_counts = G.degree(nodes=G.iterkeys()) print("Training...") model = Skipgram( sentences=serialized_walks.combine_files_iter(walk_files), vocabulary_counts=vertex_counts, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers) model.save_word2vec_format(args.output)
def process(args): # Build "(Node, Layer)" map if args.floor != "": floorFile = open(args.floor, 'r') for line in floorFile: nd, layer = line.strip().split()[:2] nd = int(nd) layer = int(layer) #print nd, layer if nd not in graph.Graph.nodePos: graph.Graph.nodeList.append(graph.NodeType(nd,layer)) graph.Graph.nodePos[nd] = len(graph.Graph.nodeList)-1 # read input Graph if args.format == "adjlist": G = graph.load_adjacencylist(args.input, undirected=args.undirected) elif args.format == "edgelist": G = graph.load_edgelist(args.input, undirected=args.undirected) elif args.format == "mat": G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected) else: raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format) timelog = "" print("Number of nodes: {}".format(len(G.nodes()))) num_walks = len(G.nodes()) * args.number_walks print("Number of walks: {}".format(num_walks)) data_size = num_walks * args.walk_length print("Data size (walks*length): {}".format(data_size)) # Centrality calculation >> store in File ''' centrality = nxGraph(args.input) print centrality fo = open("closeness.txt","wb") for k in centrality.keys(): fo.write("{} {}\n".format(k,centrality[k])) fo.close() ''' #exit() lsfile = open(args.LSfile, 'r') calculateBC(lsfile) #exit() #building (Unit)Metapath Table MPList = [] graph.Graph.mpath = [] if args.metapath != "": mpfile = open(args.metapath, 'r') for line in mpfile: MPList.append(int(line.strip().split()[0])) print "(Unit)Metapath: {}".format(MPList) while len(graph.Graph.mpath) < args.walk_length: graph.Graph.mpath.extend(MPList) args.walk_length = len(graph.Graph.mpath) print "(Full)Metapath: {}\nargs.walk_length: {}".format(graph.Graph.mpath, args.walk_length) tStart = time.time() if data_size < args.max_memory_data_size: print("Walking...") walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks, path_length=args.walk_length, alpha=0, rand=random.Random()) tEnd = time.time() print "Walking takes {} seconds".format(round(tEnd - tStart, 3)) timelog = "{}, {}".format( timelog, round(tEnd-tStart, 3) ) print "Number of walks generated: {}".format(len(walks)) tStart = time.time() print("Training...") model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers) tEnd = time.time() print "Training takes {} seconds".format(round(tEnd - tStart, 3)) timelog = "{}, {}, ,{}".format( timelog, round(tEnd-tStart, 3), len(walks) ) else: print("Data size {} is larger than limit (max-memory-data-size: {}). 
Dumping walks to disk.".format(data_size, args.max_memory_data_size)) print("Walking...") walks_filebase = args.output + ".walks" walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks, path_length=args.walk_length, alpha=0, rand=random.Random(args.seed), num_workers=args.workers) print("Counting vertex frequency...") if not args.vertex_freq_degree: vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers) else: # use degree distribution for frequency in tree vertex_counts = G.degree(nodes=G.iterkeys()) print("Training...") model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files), vocabulary_counts=vertex_counts, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers) model.save_word2vec_format(args.output) with open(args.output, 'r') as f: timelog = "{}, {}\n".format( timelog, f.readline().split()[0] ) with open(args.timelog, 'ab') as tl: tl.write(timelog)