def process(args):
    """Build a DeepWalk corpus on disk and train a skip-gram model on it.

    Loads the input graph, writes random walks to disk, then trains a
    Word2Vec-style model and runs its skip-gram training loop.

    Args:
        args: parsed argparse namespace providing format, input, undirected,
            output, number_walks, walk_length, seed, representation_size,
            window_size.

    Raises:
        Exception: if ``args.format`` is not 'adjlist' or 'edgelist'.
    """
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        # BUG FIX: this branch previously called load_adjacencylist, which
        # parses the file with the wrong format despite the error message
        # below advertising 'edgelist' support.
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    else:
        raise Exception(
            "unknown file format: '%s'. valid formats: 'adjlist', 'edgelist'"
            % args.format)

    print("number of nodes: {}".format(len(G.nodes())))

    # Each node is the starting point of several walks.
    num_walks = len(G.nodes()) * args.number_walks
    print("number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length
    print("data size (walk*length): {}".format(data_size))

    print("walking...")
    walk_file = walks.write_walks_to_disk(
        G, args.output,
        num_paths=args.number_walks,
        path_length=args.walk_length,
        alpha=0,
        rand=random.Random(args.seed))

    model = Word2Vec(walk_file, args.output,
                     emb_dimension=args.representation_size,
                     window_size=args.window_size,
                     min_count=0)
    print("Training...")
    model.skip_gram_train()
def process(args):
    """Train DeepWalk embeddings on the rating graph and evaluate them.

    Builds a graph from the training records, learns node embeddings via
    random walks + Word2Vec, then scores rating predictions on the held-out
    test file (MSE, accuracy, confusion matrix).
    """
    # Graph construction from the training set.
    nodedict = graph.records_to_graph()
    G = graph.load_adjacencylist("out.adj", undirected=True)

    # DeepWalk corpus and skip-gram embedding model.
    corpus = graph.build_deepwalk_corpus(
        G,
        num_paths=args.number_walks,
        path_length=args.walk_length,
        alpha=0,
        rand=random.Random(args.seed),
    )
    model = Word2Vec(
        corpus,
        size=args.representation_size,
        window=args.window_size,
        min_count=0,
        workers=args.workers,
    )

    # Evaluation on the test ratings; each data line holds (user, movie, rating).
    with open("./data/test_user_ratings.dat") as fin:
        fin.readline()  # skip the header row
        groundtruth = [line.strip().split("\t")[:3] for line in fin]

    tr = []
    pr = []
    for g in groundtruth:
        tr.append(int(round(float(g[2]))))
        pr.append(predict_rating(model, nodedict, "u" + g[0], "m" + g[1]))

    print("MSE = %f" % mean_squared_error(tr, pr))
    print("accuracy = %f" % accuracy_score(tr, pr))
    cm = confusion_matrix(tr, pr, labels=range(1, 6))
    print(cm)
def process(args):
    # Train DeepWalk embeddings on the rating graph and evaluate rating
    # predictions (MSE, accuracy, confusion matrix) on the test set.
    # NOTE(review): Python 2 code — uses print statements and file.next().
    # Create a graph from the training set
    nodedict = graph.records_to_graph()
    # print(args)

    # Build the model using DeepWalk and Word2Vec
    G = graph.load_adjacencylist("out.adj", undirected=True)
    # YOUR CODE HERE
    # print(args.number_walks)
    # walk = graph.build_deepwalk_corpus(G, 2, 4, alpha=0,rand=random.Random(0))
    # NOTE(review): seeds with the constant 0 rather than args.seed — confirm
    # this is intentional.
    walk = graph.build_deepwalk_corpus(G, args.number_walks, args.walk_length, alpha=0,rand=random.Random(0))
    print len(walk)
    model = Word2Vec(walk, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers)
    print model
    # Namespace(csv_to_graph=True, loo=True, max_memory_data_size=1000000000, number_walks=10, representation_size=64, seed=0, walk_length=40, window_size=5, workers=1)

    # Perform some evaluation of the model on the test dataset
    with open("./data/test_user_ratings.dat") as fin:
        # Skip the header line of the ratings file.
        fin.next()
        groundtruth = [line.strip().split("\t")[:3] for line in fin]  # (user, movie, rating)
    # True ratings, rounded to the nearest integer class.
    tr = [int(round(float(g[2]))) for g in groundtruth]
    # print(groundtruth)
    # Predicted ratings; node ids are prefixed "u" for users, "m" for movies.
    pr = [predict_rating(model, nodedict, "u"+g[0], "m"+g[1]) for g in groundtruth]
    # print(pr)
    print "MSE = %f" % mean_squared_error(tr, pr)
    print "accuracy = %f" % accuracy_score(tr, pr)
    cm = confusion_matrix(tr, pr, labels=range(1,6))
    print cm
def process(args):
    """Run DeepWalk: load a graph, generate random walks, train skip-gram.

    Small corpora are walked in memory; larger ones are serialized to disk
    and streamed into the trainer. The embeddings are saved to
    ``args.output`` in text word2vec format.

    Raises:
        Exception: if ``args.format`` is not 'adjlist', 'edgelist', or 'mat'.
    """
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * args.number_walks
    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        # Corpus is small enough to hold every walk in memory.
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length, alpha=0,
                                            rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size,
                         window=args.window_size, min_count=0, sg=1, hs=1,
                         workers=args.workers)
    else:
        # BUG FIX: this message was a single string literal broken across a
        # raw newline (a syntax error); it is now one literal.
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(data_size, args.max_memory_data_size))
        print("Walking...")
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase,
                                                          num_paths=args.number_walks,
                                                          path_length=args.walk_length, alpha=0,
                                                          rand=random.Random(args.seed),
                                                          num_workers=args.workers)

        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            # NOTE(review): iterkeys() is a Python 2 dict API; assumes the
            # project graph class provides it — confirm on Python 3.
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                         size=args.representation_size,
                         window=args.window_size, min_count=0, trim_rule=None,
                         workers=args.workers)

    model.wv.save_word2vec_format(args.output, binary=False)
    print('saved!')
def process(args):
    """Learn DeepWalk node embeddings for the input graph and save them.

    Walks are built in memory when the corpus fits under the configured
    limit; otherwise they are serialized to disk and streamed into the
    skip-gram trainer. Embeddings go to ``args.output``.
    """
    fmt = args.format
    if fmt == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif fmt == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif fmt == "mat":
        G = graph.load_matfile(args.input,
                               variable_name=args.matfile_variable_name,
                               undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'" % fmt)

    node_count = len(G.nodes())
    print("Number of nodes: {}".format(node_count))
    num_walks = node_count * args.number_walks
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        sentences = graph.build_deepwalk_corpus(
            G,
            num_paths=args.number_walks,
            path_length=args.walk_length,
            alpha=0,
            rand=random.Random(args.seed),
        )
        print("Training...")
        model = Word2Vec(
            sentences,
            size=args.representation_size,
            window=args.window_size,
            min_count=0,
            workers=args.workers,
        )
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(data_size, args.max_memory_data_size))
        print("Walking...")
        walk_basename = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G,
            walk_basename,
            num_paths=args.number_walks,
            path_length=args.walk_length,
            alpha=0,
            rand=random.Random(args.seed),
            num_workers=args.workers,
        )

        print("Counting vertex frequency...")
        if args.vertex_freq_degree:
            # Approximate corpus frequencies with node degrees.
            vertex_counts = G.degree(nodes=G.iterkeys())
        else:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)

        print("Training...")
        model = Skipgram(
            sentences=serialized_walks.combine_files_iter(walk_files),
            vocabulary_counts=vertex_counts,
            size=args.representation_size,
            window=args.window_size,
            min_count=0,
            workers=args.workers,
        )

    model.save_word2vec_format(args.output)
def process(args):
    """Generate DeepWalk random walks for the input graph, write them to
    disk, and report how long the walking phase took."""
    fmt = args.format
    if fmt == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif fmt == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif fmt == "mat":
        G = graph.load_matfile(args.input,
                               variable_name=args.matfile_variable_name,
                               undirected=args.undirected)
    else:
        raise Exception(
            "Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'" % fmt)

    n_nodes = len(G.nodes())
    print("Number of nodes: {}".format(n_nodes))
    num_walks = n_nodes * args.number_walks
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    print("Walking...")
    start = time.time()
    walks_filebase = args.output + ".txt"
    walk_files = serialized_walks.write_walks_to_disk(
        G,
        walks_filebase,
        num_paths=args.number_walks,
        path_length=args.walk_length,
        alpha=0,
        rand=random.Random(args.seed),
        num_workers=args.workers,
    )
    end = time.time()
    exe_time = end - start
    print("--------- walking time: {:.5f} -----------".format(exe_time))
def process(args):
    # Train DeepWalk embeddings on the rating graph and evaluate rating
    # predictions (MSE, accuracy, confusion matrix) on the test set.
    # NOTE(review): Python 2 code — uses print statements and file.next().
    # Create a graph from the training set
    nodedict = graph.records_to_graph()
    # print(args)

    # Build the model using DeepWalk and Word2Vec
    G = graph.load_adjacencylist("out.adj", undirected=True)
    # YOUR CODE HERE
    # print(args.number_walks)
    # walk = graph.build_deepwalk_corpus(G, 2, 4, alpha=0,rand=random.Random(0))
    # NOTE(review): seeds with the constant 0 rather than args.seed — confirm
    # this is intentional.
    walk = graph.build_deepwalk_corpus(G, args.number_walks, args.walk_length,
                                       alpha=0, rand=random.Random(0))
    print len(walk)
    model = Word2Vec(walk, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers)
    print model
    # Namespace(csv_to_graph=True, loo=True, max_memory_data_size=1000000000, number_walks=10, representation_size=64, seed=0, walk_length=40, window_size=5, workers=1)

    # Perform some evaluation of the model on the test dataset
    with open("./data/test_user_ratings.dat") as fin:
        # Skip the header line of the ratings file.
        fin.next()
        groundtruth = [line.strip().split("\t")[:3] for line in fin]  # (user, movie, rating)
    # True ratings, rounded to the nearest integer class.
    tr = [int(round(float(g[2]))) for g in groundtruth]
    # print(groundtruth)
    # Predicted ratings; node ids are prefixed "u" for users, "m" for movies.
    pr = [
        predict_rating(model, nodedict, "u" + g[0], "m" + g[1])
        for g in groundtruth
    ]
    # print(pr)
    print "MSE = %f" % mean_squared_error(tr, pr)
    print "accuracy = %f" % accuracy_score(tr, pr)
    cm = confusion_matrix(tr, pr, labels=range(1, 6))
    print cm
def process(args):
    """DeepWalk with modified (attribute-aware) random walks.

    Loads the graph, optionally computes a heuristic wrb value, optionally
    reweights edges, optionally just dumps the weighted graph to
    'wgraph.out', then builds walks (in memory or on disk) and trains a
    skip-gram model, saving embeddings to ``args.output``.

    Raises:
        Exception: for an unknown ``args.format`` or an unsupported
            weighting method in just-write-graph mode.
    """
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected,
                                attr_file_name=args.sensitive_attr_file,
                                test_links_ratio=args.test_links,
                                test_links_file=args.test_links_file,
                                train_links_file=args.train_links_file)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

    # Heuristic-only mode: print the computed value and stop.
    if args.heuristic_wrb_for_wbr is not None:
        wrb, err = graph.compute_heuristic_wrb(G, float(args.heuristic_wrb_for_wbr))
        print(wrb, err)
        return

    if (args.weighted is not None) and (args.weighted != 'unweighted'):
        G = graph.set_weights(G, args.weighted)

    if args.just_write_graph:
        # Dump the (weighted) edge list and stop — no training.
        with open('wgraph.out', 'w') as fout:
            if args.weighted == 'unweighted':
                # Uniform transition probability 1/degree per out-edge.
                for v in G:
                    s = len(G[v])
                    for u in G[v]:
                        fout.write(str(v) + ' ' + str(u) + ' ' + str(1/s) + '\n')
            elif args.weighted.startswith('random_walk'):
                for v in G:
                    for u, w in zip(G[v], G.edge_weights[v]):
                        fout.write(str(v) + ' ' + str(u) + ' ' + str(w) + '\n')
            else:
                raise Exception('just-write-graph is not supported for this weighting method')
        return None

    num_walks = len(G.nodes()) * args.number_walks
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length,
                                            p_modified=args.pmodified,
                                            alpha=0, rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size,
                         window=args.window_size, min_count=0, sg=1, hs=1,
                         workers=args.workers)
    else:
        # BUG FIX: this message was a single string literal broken across a
        # raw newline (a syntax error); it is now one literal.
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(data_size, args.max_memory_data_size))
        print("Walking...")
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase,
                                                          num_paths=args.number_walks,
                                                          path_length=args.walk_length,
                                                          p_modified=args.pmodified,
                                                          alpha=0,
                                                          rand=random.Random(args.seed),
                                                          num_workers=args.workers)

        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            # NOTE(review): iterkeys() is a Python 2 dict API; assumes the
            # project graph class provides it — confirm on Python 3.
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                         size=args.representation_size,
                         window=args.window_size, min_count=0, trim_rule=None,
                         workers=args.workers)

    model.wv.save_word2vec_format(args.output)
# Script: load two graphs (a "model" and a "data" graph) and print their
# binary adjacency matrices and diagonal degree matrices side by side.
# NOTE(review): Python 2 syntax (print statements).
import numpy as np
from numpy import linalg as LA

parser = argparse.ArgumentParser(description='Criar bolas por vértices.')
parser.add_argument('--grafoModel', nargs='?', required=True, help='Input graph file')
parser.add_argument('--grafoData', nargs='?', required=True, help='Input graph file')
args = parser.parse_args()

# Load both graphs as undirected adjacency lists.
Gm = graph.load_adjacencylist(args.grafoModel, undirected=True)
Gd = graph.load_adjacencylist(args.grafoData, undirected=True)

# Binary (0/1) adjacency matrices for each graph.
Am = editDistance.binaryMatrix(Gm)
Ad = editDistance.binaryMatrix(Gd)
# Diagonal degree matrices derived from the binary matrices.
Dm = editDistance.diagonalDegreeMatrixFromBinaryMatrix(Am)
Dd = editDistance.diagonalDegreeMatrixFromBinaryMatrix(Ad)
#d = graph.diagonalDegreeMatrix(m)

print "Matriz binária (Am)"
print Am
print "Matriz binária (Ad)"
print Ad
print "DiagonalDegreeMatrix (Dm)"
print Dm
print "DiagonalDegreeMatrix (Dd)"
print Dd
# Script: load two graphs, convert them to dicts, build per-vertex lists via
# calculos.geraListas, and print them.
import graph
import algoritmos
import calculos
import argparse

parser = argparse.ArgumentParser(description='Executa testes arvores.')
parser.add_argument('--grafo1', nargs='?', required=True, help='Input graph file')
parser.add_argument('--grafo2', nargs='?', required=True, help='Input graph file')
args = parser.parse_args()

print (" - Carregando matriz de adjacência para Grafo (na memória)...")
G1 = graph.load_adjacencylist(args.grafo1,undirected=True)
print (" - Carregando matriz de adjacência para Grafo (na memória)...")
G2 = graph.load_adjacencylist(args.grafo2,undirected=True)

# Convert graphs to plain dict representations.
print (" - Convertendo grafo para Dict (na memória)...")
dictG1 = G1.gToDict()
dictG2 = G2.gToDict()

print ("Criando listas...")
# NOTE(review): geraListas returns a pair (lists, vertices?) — second element
# unused here; confirm against calculos.
l1,v1 = calculos.geraListas(dictG1,1)
l2,v2 = calculos.geraListas(dictG2,1)

print ("Listas v1:")
calculos.printDataVertice(l1)
# NOTE(review): the visible chunk ends here — the printing of l2 presumably
# follows beyond this view.
print ("Listas v2:")
#resultadosConsolidados = [] #for r in resultados: # resultadosConsolidados.append({'label': r[1], 'arvore': r[0]}) #return resultadosConsolidados rand = random.Random() parser = argparse.ArgumentParser(description='Criar bolas por vértices.') parser.add_argument('--grafo', nargs='?', required=True, help='Input graph file') args = parser.parse_args() print " - Carregando matriz de adjacência para Grafo (na memória)..." G = graph.load_adjacencylist(args.grafo, undirected=True) print " - Convertendo grafo para Dict (na memória)..." dictG = G.gToDict() print " - Gerando árvore..." t0 = time() montaArvores(dictG) t1 = time() print('Árvores geradas em {}m'.format((t1 - t0) / 60))
def process(args):
    """DeepWalk with an optional exclusion list of walk-start nodes.

    Node ids listed one-per-line in ``args.excludlist`` (if the file exists)
    are excluded from the walks. Embeddings are saved to ``args.output``.

    Raises:
        Exception: if ``args.format`` is not 'adjlist', 'edgelist', or 'mat'.
    """
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    else:
        raise Exception(
            "Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'"
            % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))

    # Read the exclusion list if one was provided and exists on disk.
    # str() mirrors the original format() call so a non-string value (e.g.
    # None) is stringified rather than raising inside isfile().
    if os.path.isfile(str(args.excludlist)):
        # BUG FIX: the file handle was previously leaked by
        # open(...).readlines(); also removed the redundant
        # list -> set -> set re-conversions.
        with open(args.excludlist) as f:
            list_exclud = {int(x) for x in f}
        num_exlud = len(list_exclud)
    else:
        num_exlud = 0
        list_exclud = []

    if num_exlud > 0:
        print("Number of nodes excluded from the walk: {}".format(num_exlud))

    # Excluded nodes do not start walks.
    num_walks = (len(G.nodes()) - num_exlud) * args.number_walks
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, list_exclud=list_exclud,
                                            num_paths=args.number_walks,
                                            path_length=args.walk_length, alpha=0,
                                            rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size,
                         window=args.window_size, min_count=0,
                         workers=args.workers)
    else:
        print(
            "Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk."
            .format(data_size, args.max_memory_data_size))
        print("Walking...")
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G, list_exclud, walks_filebase, num_paths=args.number_walks,
            path_length=args.walk_length, alpha=0,
            rand=random.Random(args.seed), num_workers=args.workers)

        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(
                walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            # NOTE(review): iterkeys() is a Python 2 dict API; assumes the
            # project graph class provides it — confirm on Python 3.
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        model = Skipgram(
            sentences=serialized_walks.combine_files_iter(walk_files),
            vocabulary_counts=vertex_counts,
            size=args.representation_size,
            window=args.window_size, min_count=0, workers=args.workers)

    model.wv.save_word2vec_format(args.output)
# Script: load a graph from an adjacency-list file and print its adjacency
# list; the "ball" construction calls are left commented out below.
# NOTE(review): Python 2 syntax (print statement).
import graph
import algoritmos
import argparse
import numpy as np
from numpy import linalg as LA

parser = argparse.ArgumentParser(description='Criar bolas por vértices.')
parser.add_argument('--input', nargs='?', required=True, help='Input graph file')
parser.add_argument('--deepth', nargs='?', required=True, type=int, help='Deepth')
args = parser.parse_args()

G = graph.load_adjacencylist(args.input, undirected=True)
# NOTE(review): printAdjList() presumably prints and may return None — the
# outer print would then also emit 'None'; confirm against the graph class.
print G.printAdjList()

#print "Com arestas:"
#algoritmos.montaBolaComArestasUltimaCamada(G,3,args.deepth).printAdjList()
#print "Sem arestas:"
#algoritmos.montaBolaSemArestasUltimaCamada(G,3,args.deepth).printAdjList()