def main(args):
    """Read a graph, train the selected embedding model, and optionally
    train/evaluate a node classifier on the learned embeddings.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed CLI settings: input/output paths, graph format, method name
        ('node2vec' | 'line' | 'deepWalk' | 'tadw' | 'gcn' | 'grarep'),
        and per-method hyper-parameters.

    Raises
    ------
    ValueError
        If ``args.method`` is not one of the supported methods (the original
        silently left ``model`` unbound and crashed later with NameError).
    """
    t1 = time.time()
    g = Graph()
    # NOTE: on-disk name keeps the original (misspelled) filename so external
    # consumers of this file keep working; only the local variable is fixed.
    singular_node_file = "singluar_nodes.txt"
    print("Reading...")
    if args.graph_format == 'adjlist':
        g.read_adjlist(filename=args.input)
    elif args.graph_format == 'edgelist':
        g.read_edgelist(filename=args.input, weighted=args.weighted,
                        directed=args.directed)
    if args.method == 'node2vec':
        model = node2vec.Node2vec(graph=g, path_length=args.walk_length,
                                  num_paths=args.number_walks,
                                  dim=args.representation_size,
                                  workers=args.workers, p=args.p, q=args.q,
                                  window=args.window_size)
    elif args.method == 'line':
        # With a label file and auto-save enabled, LINE can keep the
        # best-scoring embeddings during training.
        if args.label_file and not args.no_auto_save:
            model = line.LINE(g, epoch=args.epochs,
                              rep_size=args.representation_size,
                              order=args.order, label_file=args.label_file,
                              clf_ratio=args.clf_ratio)
        else:
            model = line.LINE(g, epoch=args.epochs,
                              rep_size=args.representation_size,
                              order=args.order)
    elif args.method == 'deepWalk':
        # DeepWalk is node2vec with uniform (dw=True) walk transitions.
        model = node2vec.Node2vec(graph=g, path_length=args.walk_length,
                                  num_paths=args.number_walks,
                                  dim=args.representation_size,
                                  workers=args.workers,
                                  window=args.window_size, dw=True)
    elif args.method == 'tadw':
        # assert args.label_file != ''
        assert args.feature_file != ''
        # g.read_node_label(args.label_file)
        g.read_node_features(args.feature_file)
        # Dump the ids in g.sgl_node_list (populated by read_node_features).
        # BUGFIX: use a context manager so the handle is closed even if a
        # write fails (original leaked it); 'w' suffices — we never read back.
        with open(singular_node_file, "w") as fout:
            for node_idx in g.sgl_node_list:
                fout.write("{}\n".format(node_idx))
        model = tadw.TADW(graph=g, dim=args.representation_size,
                          lamb=args.lamb)
        # model = tadw_gpu.TADW_GPU(graph=g, dim=args.representation_size, lamb=args.lamb)
    elif args.method == 'gcn':
        assert args.label_file != ''
        assert args.feature_file != ''
        g.read_node_label(args.label_file)
        g.read_node_features(args.feature_file)
        model = gcnAPI.GCN(graph=g, dropout=args.dropout,
                           weight_decay=args.weight_decay,
                           hidden1=args.hidden, epochs=args.epochs,
                           clf_ratio=args.clf_ratio)
    elif args.method == 'grarep':
        model = GraRep(graph=g, Kstep=args.kstep,
                       dim=args.representation_size)
    else:
        # BUGFIX: fail fast instead of a later NameError on `model`.
        raise ValueError("unknown method: {}".format(args.method))
    t2 = time.time()
    print("time: ", t2 - t1)
    if args.method != 'gcn':
        # GCN is end-to-end supervised and exposes no standalone embeddings.
        print("Saving embeddings...")
        model.save_embeddings(args.output)
    if args.label_file and args.method != 'gcn':
        vectors = model.vectors
        X, Y = read_node_label(args.label_file)
        print("Training classifier using {:.2f}% nodes...".format(args.clf_ratio * 100))
        clf = Classifier(vectors=vectors, clf=LogisticRegression())
        clf.split_train_evaluate(X, Y, args.clf_ratio)
def main(args):
    """Pipeline driver: load a graph (plus node attributes for ANE methods),
    prepare link-prediction test data, learn node embeddings with the chosen
    method, then evaluate on the requested downstream task(s).

    Parameters
    ----------
    args : argparse.Namespace
        Parsed CLI settings: graph/attribute/label file paths, ``method``,
        ``task`` ('lp' | 'nc' | 'lp_and_nc'), and per-method hyper-parameters.
    """
    g = Graph()  # see graph.py for commonly-used APIs and use g.G to access NetworkX APIs
    print(f'Summary of all settings: {args}')

    # ---------------------------------------STEP1: load data-----------------------------------------------------
    print('\nSTEP1: start loading data......')
    t1 = time.time()
    # load graph structure info; by default, treat as undirected and unweighted graph ------
    if args.graph_format == 'adjlist':
        g.read_adjlist(path=args.graph_file, directed=args.directed)
    elif args.graph_format == 'edgelist':
        g.read_edgelist(path=args.graph_file, weighted=args.weighted, directed=args.directed)
    # load node attribute info ------
    # attributed-network-embedding (ANE) methods additionally need an attribute file
    is_ane = (args.method == 'abrw' or args.method == 'tadw' or args.method == 'gcn' or
              args.method == 'sagemean' or args.method == 'sagegcn' or
              args.method == 'attrpure' or args.method == 'attrcomb' or
              args.method == 'asne' or args.method == 'aane')
    if is_ane:
        assert args.attribute_file != ''
        g.read_node_attr(args.attribute_file)
    # load node label info------ (labels are read later, in STEP4, for the nc task)
    t2 = time.time()
    print(f'STEP1: end loading data; time cost: {(t2-t1):.2f}s')

    # ---------------------------------------STEP2: prepare data----------------------------------------------------
    print('\nSTEP2: start preparing data for link pred task......')
    t1 = time.time()
    test_node_pairs = []
    test_edge_labels = []
    if args.task == 'lp' or args.task == 'lp_and_nc':
        # remove a fraction of edges; the removed edges become candidate positive test links
        edges_removed = g.remove_edge(ratio=args.link_remove)
        num_test_links = 0
        limit_percentage = 0.2  # at most, use 0.2 randomly removed links for testing
        num_test_links = int(min(len(edges_removed),
                                 len(edges_removed) / args.link_remove * limit_percentage))
        edges_removed = random.sample(edges_removed, num_test_links)
        # balance_ratio=1.0: sample as many negative (non-edge) pairs as positives
        test_node_pairs, test_edge_labels = generate_edges_for_linkpred(
            graph=g, edges_removed=edges_removed, balance_ratio=1.0)
    t2 = time.time()
    print(f'STEP2: end preparing data; time cost: {(t2-t1):.2f}s')

    # -----------------------------------STEP3: upstream embedding task-------------------------------------------------
    print('\nSTEP3: start learning embeddings......')
    print(f'the graph: {args.graph_file}; \nthe model used: {args.method}; \nthe # of edges used during embedding (edges maybe removed if lp task): {g.get_num_edges()}; \nthe # of nodes: {g.get_num_nodes()}; \nthe # of isolated nodes: {g.get_num_isolates()}; \nis directed graph: {g.get_isdirected()}')
    t1 = time.time()
    model = None
    # method dispatch; each branch imports its module lazily so unused methods cost nothing
    if args.method == 'abrw':
        from libnrl import abrw  # ANE method; (Adaptive) Attributed Biased Random Walk
        model = abrw.ABRW(graph=g, dim=args.dim, topk=args.ABRW_topk, beta=args.ABRW_beta,
                          beta_mode=args.ABRW_beta_mode, alpha=args.ABRW_alpha,
                          number_walks=args.number_walks, walk_length=args.walk_length,
                          window=args.window_size, workers=args.workers)
    elif args.method == 'aane':
        from libnrl import aane  # ANE method
        model = aane.AANE(graph=g, dim=args.dim, lambd=args.AANE_lamb, rho=args.AANE_rho,
                          maxiter=args.AANE_maxiter,
                          mode='comb')  # mode: 'comb' struc and attri or 'pure' struc
    elif args.method == 'tadw':
        from libnrl import tadw  # ANE method
        model = tadw.TADW(graph=g, dim=args.dim, lamb=args.TADW_lamb, maxiter=args.TADW_maxiter)
    elif args.method == 'attrpure':
        from libnrl import attrpure  # NE method simply use svd or pca for dim reduction
        model = attrpure.ATTRPURE(graph=g, dim=args.dim, mode='pca')  # mode: pca or svd
    elif args.method == 'attrcomb':
        from libnrl import attrcomb  # ANE method
        model = attrcomb.ATTRCOMB(graph=g, dim=args.dim, comb_with='deepwalk',
                                  number_walks=args.number_walks, walk_length=args.walk_length,
                                  window=args.window_size, workers=args.workers,
                                  comb_method=args.AttrComb_mode
                                  )  # comb_method: concat, elementwise-mean, elementwise-max
    elif args.method == 'deepwalk':
        from libnrl import node2vec  # PNE method; including deepwalk and node2vec
        model = node2vec.Node2vec(graph=g, path_length=args.walk_length,
                                  num_paths=args.number_walks, dim=args.dim,
                                  workers=args.workers, window=args.window_size, dw=True)
    elif args.method == 'node2vec':
        from libnrl import node2vec  # PNE method; including deepwalk and node2vec
        model = node2vec.Node2vec(graph=g, path_length=args.walk_length,
                                  num_paths=args.number_walks, dim=args.dim,
                                  workers=args.workers, window=args.window_size,
                                  p=args.Node2Vec_p, q=args.Node2Vec_q)
    elif args.method == 'grarep':
        from libnrl import grarep  # PNE method
        model = grarep.GraRep(graph=g, Kstep=args.GraRep_kstep, dim=args.dim)
    elif args.method == 'line':
        # if auto_save, use label to justify the best embeddings by looking at micro / macro-F1 score
        from libnrl import line  # PNE method
        model = line.LINE(graph=g, epoch=args.epochs, rep_size=args.dim, order=args.LINE_order,
                          batch_size=args.batch_size, negative_ratio=args.LINE_negative_ratio,
                          label_file=args.label_file, clf_ratio=args.label_reserved,
                          auto_save=True, best='micro')
    elif args.method == 'asne':
        from libnrl import asne  # ANE method
        model = asne.ASNE(graph=g, dim=args.dim, alpha=args.ASNE_lamb,
                          learning_rate=args.learning_rate, batch_size=args.batch_size,
                          epoch=args.epochs, n_neg_samples=10)
    elif args.method == 'sagemean':  # parameters for graphsage models are in 'graphsage' -> '__init__.py'
        from libnrl.graphsage import graphsageAPI  # ANE method
        model = graphsageAPI.graphSAGE(graph=g, sage_model='mean', is_supervised=False)
    elif args.method == 'sagegcn':  # other choices: graphsage_seq, graphsage_maxpool, graphsage_meanpool, n2v
        from libnrl.graphsage import graphsageAPI  # ANE method
        model = graphsageAPI.graphSAGE(graph=g, sage_model='gcn', is_supervised=False)
    else:
        print('method not found...')
        exit(0)
    t2 = time.time()
    print(f'STEP3: end learning embeddings; time cost: {(t2-t1):.2f}s')
    if args.save_emb:
        # model.save_embeddings(args.emb_file + time.strftime(' %Y%m%d-%H%M%S', time.localtime()))
        model.save_embeddings(args.emb_file)
        print(f'Save node embeddings in file: {args.emb_file}')

    # ---------------------------------------STEP4: downstream task-----------------------------------------------
    print('\nSTEP4: start evaluating ......: ')
    t1 = time.time()
    vectors = model.vectors
    del model, g  # release the trained model and graph before evaluation to reduce memory
    # ------lp task
    if args.task == 'lp' or args.task == 'lp_and_nc':
        print(f'Link Prediction task; the number of testing links {len(test_edge_labels)} i.e. at most 2*0.2*all_positive_links)')
        # similarity/distance metric as clf; basically, lp is a binary clf problem
        ds_task = lpClassifier(vectors=vectors)
        ds_task.evaluate(test_node_pairs, test_edge_labels)
    # ------nc task
    if args.task == 'nc' or args.task == 'lp_and_nc':
        X, Y = read_node_label_downstream(args.label_file)
        print(f'Node Classification task; the percentage of labels for testing: {((1-args.label_reserved)*100):.2f}%')
        # use Logistic Regression as clf; we may choose SVM or more advanced ones
        ds_task = ncClassifier(vectors=vectors, clf=LogisticRegression())
        ds_task.split_train_evaluate(X, Y, args.label_reserved)
    t2 = time.time()
    print(f'STEP4: end evaluating; time cost: {(t2-t1):.2f}s')
# NOTE(review): top-level script fragment. It relies on `dataset`,
# `train_fts_ratio`, `g`, `args`, and the model modules (node2vec, line,
# tadw, ...) being defined earlier in the file — not visible in this
# excerpt; confirm against the full file.
# Override CLI args with hard-coded experiment settings.
args.input = os.path.join(os.getcwd(), 'data', dataset,
                          '{}_{}_edges.txt'.format(dataset, train_fts_ratio))
args.weighted = False
args.directed = False
args.epochs = 1000

# Load the graph in the requested format.
if args.graph_format == 'adjlist':
    g.read_adjlist(filename=args.input)
elif args.graph_format == 'edgelist':
    g.read_edgelist(filename=args.input, weighted=args.weighted, directed=args.directed)

# Build the embedding model for the chosen method.
if args.method == 'node2vec':
    model = node2vec.Node2vec(graph=g, path_length=args.walk_length,
                              num_paths=args.number_walks, dim=args.representation_size,
                              workers=args.workers, p=args.p, q=args.q,
                              window=args.window_size)
elif args.method == 'line':
    # With a label file and auto-save enabled, LINE keeps the best-scoring embeddings.
    if args.label_file and not args.no_auto_save:
        model = line.LINE(g, epoch=args.epochs, rep_size=args.representation_size,
                          order=args.order, label_file=args.label_file,
                          clf_ratio=args.clf_ratio)
    else:
        model = line.LINE(g, epoch=args.epochs, rep_size=args.representation_size,
                          order=args.order)
elif args.method == 'deepWalk':
    # DeepWalk = node2vec with uniform walks (dw=True).
    model = node2vec.Node2vec(graph=g, path_length=args.walk_length,
                              num_paths=args.number_walks, dim=args.representation_size,
                              workers=args.workers, window=args.window_size, dw=True)
elif args.method == 'tadw':
    # TADW requires both node labels and node features.
    assert args.label_file != ''
    assert args.feature_file != ''
    g.read_node_label(args.label_file)
    g.read_node_features(args.feature_file)
    model = tadw.TADW(graph=g, dim=args.representation_size, lamb=args.lamb)
elif args.method == 'gcn':
    # GCN also requires labels and features; the model construction presumably
    # follows below this excerpt — TODO confirm against the full file.
    assert args.label_file != ''
    assert args.feature_file != ''
def main(args):
    """Train a node-embedding model on the input graph and, when labels are
    available, train/evaluate a node classifier on the embeddings.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed CLI settings: input/output paths, graph format, method name
        ('node2vec' | 'line' | 'deepWalk' | 'tadw' | 'gcn' | 'grarep'),
        and per-method hyper-parameters.

    Raises
    ------
    ValueError
        If ``args.method`` is not a supported method (the original silently
        left ``model`` unbound and crashed later with NameError).
    """
    t1 = time.time()
    g = Graph()
    # BUGFIX: the rest of this file uses Python 3 (f-strings, print()); the
    # Python-2 `print "..."` statements here were a syntax error under py3.
    print("Reading...")
    if args.graph_format == 'adjlist':
        g.read_adjlist(filename=args.input)
    elif args.graph_format == 'edgelist':
        g.read_edgelist(filename=args.input, weighted=args.weighted,
                        directed=args.directed)
    if args.method == 'node2vec':
        model = node2vec.Node2vec(graph=g, path_length=args.walk_length,
                                  num_paths=args.number_walks,
                                  dim=args.representation_size,
                                  workers=args.workers, p=args.p, q=args.q,
                                  window=args.window_size)
    elif args.method == 'line':
        if args.label_file:
            # BUGFIX: original read `auto_stop=args.no - auto_stop` — a
            # subtraction involving an undefined name (NameError at runtime).
            # Presumably the CLI flag is `--no-auto-stop`, so auto_stop is
            # enabled when the flag is absent — TODO confirm against the CLI.
            model = line.LINE(g, lr=args.lr, batch_size=args.batch_size,
                              epoch=args.epochs,
                              rep_size=args.representation_size,
                              order=args.order, label_file=args.label_file,
                              clf_ratio=args.clf_ratio,
                              auto_stop=not args.no_auto_stop)
        else:
            model = line.LINE(g, lr=args.lr, batch_size=args.batch_size,
                              epoch=args.epochs,
                              rep_size=args.representation_size,
                              order=args.order)
    elif args.method == 'deepWalk':
        # DeepWalk is node2vec with uniform (dw=True) walk transitions.
        model = node2vec.Node2vec(graph=g, path_length=args.walk_length,
                                  num_paths=args.number_walks,
                                  dim=args.representation_size,
                                  workers=args.workers,
                                  window=args.window_size, dw=True)
    elif args.method == 'tadw':
        assert args.label_file != ''
        assert args.feature_file != ''
        g.read_node_label(args.label_file)
        g.read_node_features(args.feature_file)
        model = tadw.TADW(graph=g, dim=args.representation_size,
                          lamb=args.lamb)
    elif args.method == 'gcn':
        assert args.label_file != ''
        assert args.feature_file != ''
        g.read_node_label(args.label_file)
        g.read_node_features(args.feature_file)
        model = gcnAPI.GCN(graph=g, dropout=args.dropout,
                           weight_decay=args.weight_decay,
                           hidden1=args.hidden, epochs=args.epochs,
                           clf_ratio=args.clf_ratio)
    elif args.method == 'grarep':
        model = GraRep(graph=g, Kstep=args.kstep,
                       dim=args.representation_size)
    else:
        # BUGFIX: fail fast instead of a later NameError on `model`.
        raise ValueError("unknown method: {}".format(args.method))
    t2 = time.time()
    print(t2 - t1)
    if args.method != 'gcn':
        # GCN is end-to-end supervised and exposes no standalone embeddings.
        print("Saving embeddings...")
        model.save_embeddings(args.output)
    if args.label_file and args.method != 'gcn':
        vectors = model.vectors
        X, Y = read_node_label(args.label_file)
        print("Training classifier using {:.2f}% nodes...".format(
            args.clf_ratio * 100))
        clf = Classifier(vectors=vectors, clf=LogisticRegression())
        clf.split_train_evaluate(X, Y, args.clf_ratio)