def main(args):
    """Read a graph (with labels and node features), learn TADW embeddings,
    save them, and evaluate with a logistic-regression node classifier.
    """
    start = time.time()
    g = Graph()
    print("Reading...")
    # The input can be supplied in either adjacency-list or edge-list form.
    if args.graph_format == 'adjlist':
        g.read_adjlist(filename=args.input)
    elif args.graph_format == 'edgelist':
        g.read_edgelist(filename=args.input, weighted=args.weighted,
                        directed=args.directed)
    # TADW needs both node labels and node features.
    g.read_node_label(args.label_file)
    g.read_node_features(args.feature_file)
    model = tadw.TADW(graph=g, dim=args.representation_size, lamb=args.lamb)
    elapsed = time.time() - start
    print(elapsed)
    print("Saving embeddings...")
    model.save_embeddings(args.output)
    # Evaluate the learned vectors on the node-classification task.
    embedding_vectors = model.vectors
    X, Y = read_node_label(args.label_file)
    print("Training classifier using {:.2f}% nodes...".format(args.clf_ratio * 100))
    clf = Classifier(vectors=embedding_vectors, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, args.clf_ratio)
def evaluate_embeddings(embeddings):
    """Evaluate node embeddings on the wiki label file with a
    logistic-regression classifier trained on 80% of the nodes.
    """
    node_ids, node_labels = read_node_label('../data/wiki/wiki_labels.txt')
    train_fraction = 0.8
    print("Training classifier using {:.2f}% nodes...".format(
        train_fraction * 100))
    classifier = Classifier(embeddings=embeddings, clf=LogisticRegression())
    classifier.split_train_evaluate(node_ids, node_labels, train_fraction)
def node_classification(session, bs, seqne, sequences, seq_len, node_n,
                        samp_idx, label, ratio):
    """Encode all sequences in batches of size ``bs``, average the encoder
    outputs per node, then train/evaluate a logistic-regression classifier.

    Args:
        session: TensorFlow session used to run the encoder.
        bs: batch size for encoding.
        seqne: model exposing ``encoder_output``, ``input_seqs``,
            ``dropout`` and ``keep_prob`` placeholders.
        sequences: node-id sequences to encode.
        seq_len: sequence length, forwarded to the reduction helpers.
        node_n: total number of nodes (for the per-node average).
        samp_idx, label, ratio: sample indices, labels and train ratio
            forwarded to ``Classifier.split_train_evaluate``.

    Returns:
        The micro-averaged F1 score (the macro score is computed but discarded).
    """
    enc_sum_dict = {}  # accumulator: summed encoder hidden states per node
    node_cnt = {}      # accumulator: contribution counts per node (for averaging)
    s_idx, e_idx = 0, bs
    # Process all full batches of size bs.
    while e_idx < len(sequences):
        # NOTE(review): keep_prob is fed 0 at inference time; a dropout keep
        # probability of 1.0 would normally be expected here — confirm what
        # seqne.keep_prob actually controls.
        batch_enc = session.run(seqne.encoder_output,
                                feed_dict={seqne.input_seqs: sequences[s_idx: e_idx],
                                           seqne.dropout: 0,
                                           seqne.keep_prob: 0})
        enc_sum_dict, node_cnt = reduce_seq2seq_hidden_add(enc_sum_dict, node_cnt,
                                                           sequences,
                                                           batch_enc.astype('float32'),
                                                           seq_len, s_idx)
        s_idx, e_idx = e_idx, e_idx + bs
    # Handle the trailing partial batch, if any.
    if s_idx < len(sequences):
        batch_enc = session.run(seqne.encoder_output,
                                feed_dict={seqne.input_seqs: sequences[s_idx: len(sequences)],
                                           seqne.dropout: 0,
                                           seqne.keep_prob: 0})
        enc_sum_dict, node_cnt = reduce_seq2seq_hidden_add(enc_sum_dict, node_cnt,
                                                           sequences,
                                                           batch_enc.astype('float32'),
                                                           seq_len, s_idx)
    # Convert summed hidden states into per-node means.
    node_enc_mean = reduce_seq2seq_hidden_avg(sum_dict=enc_sum_dict,
                                              count_dict=node_cnt,
                                              node_num=node_n)
    lr = Classifier(vectors=node_enc_mean, clf=LogisticRegression())
    f1_micro, f1_macro = lr.split_train_evaluate(samp_idx, label, ratio)
    return f1_micro
def __init__(self, graph, rep_size=128, batch_size=1000, epoch=10,
             negative_ratio=5, order=3, label_file=None, clf_ratio=0.5,
             auto_stop=True):
    """Train LINE embeddings, optionally evaluating after every epoch.

    With ``order == 3`` two models are trained (first- and second-order,
    half the dimensions each); otherwise a single model of the requested
    order is used. When ``label_file`` is given, embeddings are evaluated
    after each epoch and, with ``auto_stop``, training stops early when the
    micro-F1 drops below the best seen so far.

    Fixes vs. original:
    - Python 2 ``print 'Auto stop!'`` statements replaced with ``print()``
      calls, consistent with the rest of the codebase.
    - ``rep_size // 2`` instead of ``rep_size / 2`` so the dimension stays
      an integer on Python 3 (identical result on Python 2 ints).
    - The duplicated per-epoch evaluation block is factored into one helper.
    """
    self.rep_size = rep_size
    self.order = order
    self.best_result = 0
    self.vectors = {}

    def _evaluate_and_maybe_stop():
        """Evaluate current embeddings; return True to stop training early."""
        self.get_embeddings()
        X, Y = read_node_label(label_file)
        clf = Classifier(vectors=self.vectors, clf=LogisticRegression())
        result = clf.split_train_evaluate(X, Y, clf_ratio)
        if result['micro'] < self.best_result and auto_stop:
            # NOTE(review): self.last_vectors is never assigned in this
            # method; presumably get_embeddings() maintains it — verify.
            self.vectors = self.last_vectors
            print('Auto stop!')
            return True
        if result['micro'] > self.best_result:
            self.best_result = result['micro']
        return False

    if order == 3:
        # Split the representation between a first- and a second-order model;
        # their embeddings are concatenated by get_embeddings().
        self.model1 = _LINE(graph, rep_size // 2, batch_size, negative_ratio, order=1)
        self.model2 = _LINE(graph, rep_size // 2, batch_size, negative_ratio, order=2)
        for _ in range(epoch):
            self.model1.train_one_epoch()
            self.model2.train_one_epoch()
            if label_file and _evaluate_and_maybe_stop():
                return
    else:
        self.model = _LINE(graph, rep_size, batch_size, negative_ratio,
                           order=self.order)
        for _ in range(epoch):
            self.model.train_one_epoch()
            if label_file and _evaluate_and_maybe_stop():
                return
    self.get_embeddings()
def classify(vectors, args):
    """Evaluate node embeddings with a logistic-regression classifier.

    Returns the score dict from ``split_train_evaluate``, or an all-zero
    ``defaultdict`` when the dataset has no label file.
    """
    label_path = args.classifydir + '_labels.txt'
    # No labels for this dataset: report zero for every metric asked for.
    if not os.path.isfile(label_path):
        return defaultdict(lambda: 0)
    X, Y = read_node_label(label_path)
    print("Training classifier using {:.2f}% nodes...".format(
        args.train_percent * 100))
    classifier = Classifier(
        vectors=vectors,
        clf=LogisticRegression(solver="lbfgs", max_iter=4000))
    return classifier.split_train_evaluate(X, Y, args.train_percent)
def main(args):
    """Run the evaluation tasks selected via CLI flags (modularity, graph
    reconstruction, clustering, link prediction, node classification) on a
    file of precomputed node embeddings.
    """
    node_embeddings = load_embeddings(args.embedding_file)
    if args.label_file:
        labels = read_node_label(args.label_file)
    if args.modularity:
        print("Modularity")
        modularity(args, node_embeddings, args.min_k, args.max_k)
    if args.reconstruction:
        print("Graph reconstruction")
        reconstr(args, node_embeddings, args.k_nbrs)
    if args.clustering:
        print("Clustering")
        # NOTE(review): `labels` is only bound when args.label_file is set;
        # clustering/classification would raise NameError without it — confirm
        # the CLI enforces that combination.
        clustering(node_embeddings, labels, args.exp_times)
    if args.link_prediction:
        print("Link prediction")
        link_prediction(args.input, node_embeddings)
    if args.classification:
        X = list(labels.keys())
        Y = list(labels.values())
        print("Node classification")
        # Parsed but unused below: the loop iterates the fixed 0.1..0.9 grid.
        clf_ratio_list = args.clf_ratio.strip().split(',')
        result_list = {}
        train_ratio = np.asarray(range(1, 10)) * .1
        for clf_ratio in train_ratio:  # clf_ratio_list:
            result_per_test = []
            # Repeat each split args.exp_times times to average out shuffles.
            for ti in range(args.exp_times):
                clf = Classifier(vectors=node_embeddings, clf=LogisticRegression())
                myresult = clf.split_train_evaluate(X, Y, float(clf_ratio))
                result_per_test.append(myresult)
            result_list[clf_ratio] = result_per_test
        print('-------------------')
        for clf_ratio in train_ratio:
            print('Train percent:', clf_ratio)
            results = result_list[clf_ratio]
            for index, result in enumerate(results):
                print('Shuffle #%d: ' % (index + 1), result)
            # Average every metric across the repeated runs.
            avg_score = defaultdict(float)
            for score_dict in results:
                for metric, score in score_dict.items():
                    avg_score[metric] += score
            for metric in avg_score:
                avg_score[metric] /= len(results)
            print('Average score:', dict(avg_score))
            print('-------------------')
def main(args):
    """Learn xNetMF structural representations for the input graph, then run
    the TADW pipeline (read graph/labels/features, train, save, evaluate).

    BUG FIX: the edge list was read from ``agrs.input`` (a NameError at
    runtime); corrected to ``args.input``.
    """
    print("xnetmf", "begin...")
    t1 = time.time()
    print("Reading...")
    nx_graph = nx.read_edgelist(args.input, nodetype=int, comments="%")
    adj_matrix = nx.adjacency_matrix(nx_graph).todense()
    print(adj_matrix)
    g = Graph(adj_matrix)
    rep_method = RepMethod(
        max_layer=2
    )  # Learn representations with xNetMF. Can adjust parameters (e.g. as in REGAL)
    representations = src.xnetmf.get_representations(g, rep_method)
    print(representations)
    print(representations.shape)
    print("TAWD", "begin...")
    print("Reading...")
    # Re-read the graph into the project Graph structure for TADW.
    if args.graph_format == 'adjlist':
        g.read_adjlist(filename=args.input)
    elif args.graph_format == 'edgelist':
        g.read_edgelist(filename=args.input, weighted=args.weighted,
                        directed=args.directed)
    g.read_node_label(args.label_file)
    g.read_node_features(args.feature_file)
    model = xtadw.TADW(graph=g, dim=args.representation_size, lamb=args.lamb)
    t2 = time.time()
    print(t2 - t1)
    print("Saving embeddings...")
    model.save_embeddings(args.output)
    # Evaluate the learned embeddings on node classification.
    vectors = model.vectors
    X, Y = read_node_label(args.label_file)
    print("Training classifier using {:.2f}% nodes...".format(args.clf_ratio * 100))
    clf = Classifier(vectors=vectors, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, args.clf_ratio)
def node_classification(embeddings, label_path, name, size):
    """Evaluate embeddings at train ratios 0.1..0.9 and log the macro-F1 of
    each run to ``results/<name>_classification_<size>.txt``.

    FIX: the results file is now opened with a ``with`` block so the handle
    is always closed (the original leaked it).

    Returns nothing; results are written to the file and collected in a
    local list only.
    """
    X, Y = read_node_label(embeddings, label_path,)
    all_ratio = []
    with open('results/%s_classification_%d.txt' % (name, size), 'w') as f_c:
        for tr_frac in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
            print(" Training classifier using {:.2f}% nodes...".format(tr_frac * 100))
            clf = Classifier(embeddings=embeddings, clf=LogisticRegression(), name=name)
            results = clf.split_train_evaluate(X, Y, tr_frac)
            avg = 'macro'
            f_c.write(name + ' train percentage: ' + str(tr_frac) +
                      ' F1-' + avg + ' ' + str('%0.5f' % results[avg]))
            all_ratio.append(results[avg])
            f_c.write('\n')
# results_file = open('0127_best_result_wiki.txt', 'w') # results_file.write(" %s\n" % best_result) # # np.savetxt('best_result_wiki.out', best_result, delimiter='\t') # X is an array clf_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] node_size = len(vectors) train_x = np.array([vectors[x] for x in X]) reshaped_train_x = train_x.reshape((train_x.shape[0], args.kstep, node_size)) train_x = low_encoder.predict(reshaped_train_x) for clf_one in clf_list: print "Training classifier using {:.2f}% nodes...".format(clf_one * 100) clf = Classifier(vectors=train_x, clf=LogisticRegression()) clf.split_train_evaluate(X, Y, clf_one) # y_lable = np.array(Y) # print(train_x.shape) # print(type(train_x)) # print(type(y_lable)) # # np.savetxt('train_x.out', train_x, delimiter='\t') # X is an array # np.savetxt('train_Y.out', y_lable.astype(int), delimiter='\t') # X is an array # # # print(results) # results_file = open('citeseer_result_0.3.txt', 'w') # for item in results: # results_file.write("%s\n" % item)
Author : haxu
date: 2019/4/3
-------------------------------------------------
Change Activity:
    2019/4/3:
-------------------------------------------------
"""
__author__ = 'haxu'

import networkx as nx
from deepwalk import DeepWalk
from classify import read_node_label, Classifier
from sklearn.linear_model import LogisticRegression

if __name__ == '__main__':
    # Read the wiki edge list as a directed graph with integer edge weights.
    G = nx.read_edgelist('../data/Wiki_edgelist.txt',
                         create_using=nx.DiGraph(), nodetype=None,
                         data=[('weight', int)])
    # Train DeepWalk embeddings (80 walks of length 30 per node).
    model = DeepWalk(G, walk_length=30, num_walks=80, workers=4)
    model.train(window_size=5, iter=3)
    embeddings = model.get_embeddings()
    # Evaluate with a logistic-regression classifier on an 80% train split.
    X, Y = read_node_label('../data/wiki_labels.txt')
    tr_frac = 0.8
    clf = Classifier(embeddings=embeddings, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, tr_frac)
# Train the autoencoder for one epoch, one batch at a time (Python 2 script:
# uses xrange and generator .next()).
for b in tqdm(xrange(batches_per_epoch)):
    # sen_input = sen_gen.next()
    neg_input = neg_gen.next()
    sen_input2 = sen_gen2.next()
    # Drop the last column (purpose not visible here — presumably an id;
    # verify against the generator) and reshape to (batch, kstep, node_size).
    sen_input = sen_input2[:, 0:-1].reshape(
        (args.batch_size, args.kstep, node_size))
    neg_input = neg_input.reshape(
        (args.batch_size, args.neg_size, args.kstep, node_size))
    # Targets are all ones; the model computes a max-margin loss internally.
    batch_loss, batch_max_margin_loss = model_auto.train_on_batch(
        [sen_input, neg_input], np.ones((args.batch_size, 1)))
    # Accumulate the per-epoch averages incrementally.
    loss += batch_loss / batches_per_epoch
    max_margin_loss += batch_max_margin_loss / batches_per_epoch

tr_time = time.time() - t0

###############################################################################
# Classification evaluation: encode all node vectors through the trained
# low-dimensional encoder, then train/evaluate a logistic-regression classifier.
node_size = len(vectors)
train_x = np.array([vectors[x] for x in X])
reshaped_train_x = train_x.reshape((train_x.shape[0], args.kstep, node_size))
train_x = low_encoder.predict(reshaped_train_x)

print "Training classifier using {:.2f}% nodes...".format(args.clf_ratio * 100)
clf = Classifier(vectors=train_x, clf=LogisticRegression())
clf.split_train_evaluate(X, Y, args.clf_ratio)
def main(args):
    """Read the input graph, train an SDNE model for the requested method,
    save the embeddings, and (when a label file is given) evaluate them with
    a logistic-regression node classifier.

    Changes vs. original:
    - The three ``sdne*`` branches constructed byte-identical models, so they
      are collapsed into one membership test. NOTE(review): if the variants
      were meant to use different hyper-parameters, restore per-branch args.
    - A large block of dead commented-out code (node2vec/line/deepWalk/tadw/
      gcn/grarep/lle/hope/lap/gf construction) was removed.
    """
    t1 = time.time()
    g = Graph()
    print("Reading...")
    if args.graph_format == 'adjlist':
        g.read_adjlist(filename=args.input)
    elif args.graph_format == 'edgelist':
        g.read_edgelist(filename=args.input, weighted=args.weighted,
                        directed=args.directed)

    if args.method in ('sdne', 'sdne_binary_loss', 'sdne_meta_path'):
        # encoder_list is a string like "[1000, 128]"; parse it safely.
        encoder_layer_list = ast.literal_eval(args.encoder_list)
        model = sdne.SDNE(g, encoder_layer_list=encoder_layer_list,
                          alpha=args.alpha, beta=args.beta,
                          nu1=args.nu1, nu2=args.nu2,
                          batch_size=args.bs, epoch=args.epochs,
                          learning_rate=args.lr)

    t2 = time.time()
    print('cost time is : {}'.format(t2 - t1))
    if args.method != 'gcn':
        print("Saving embeddings...")
        model.save_embeddings(args.output)
    if args.label_file and args.method != 'gcn':
        vectors = model.vectors
        X, Y = read_node_label(args.label_file)
        print("Training classifier using {:.2f}% nodes...".format(
            args.clf_ratio * 100))
        clf = Classifier(vectors=vectors, clf=LogisticRegression())
        clf.split_train_evaluate(X, Y, args.clf_ratio)