def test_feed_dict(self):
    """Build model feed dicts for the dataset's train and test node splits.

    Returns:
        (train_feed, test_feed): the feed dicts produced by node_feed_dict
        for the train-split and test-split node ids, respectively.
    """
    _, _, train_split, test_split = utils.load_pdata(self.dataset)
    # Column 0 of each split holds the node ids.
    train_ids = train_split[:, 0]
    test_ids = test_split[:, 0]
    return self.node_feed_dict(train_ids), self.node_feed_dict(test_ids)
def feature_test(dataset, train_embeddings, test_embeddings):
    """Score node-classification accuracy of the given embeddings.

    Fits a top-k logistic-regression ranker on the train-split embeddings
    and their (1-based) labels, then predicts label sets for the test split.

    Args:
        dataset: one of 'cora', 'citeseer', 'pubmed'.
        train_embeddings: per-node embedding rows for the train split.
        test_embeddings: per-node embedding rows for the test split.

    Returns:
        accuracy_score of the predicted label sets on the test split.

    Raises:
        Exception: if dataset is not one of the known names.
    """
    # Class counts per dataset; a lookup table replaces the if/elif chain.
    n_classes_by_dataset = {'cora': 7, 'citeseer': 6, 'pubmed': 3}
    if dataset not in n_classes_by_dataset:
        raise Exception('Error : wrong dataset name')
    classes = n_classes_by_dataset[dataset]

    _, _, train_data, test_data = load_pdata(dataset)

    def _one_hot(data):
        # Column 1 holds 1-based class labels; build a rows x classes one-hot.
        label = np.zeros((data.shape[0], classes), dtype=int)
        for i in xrange(data.shape[0]):
            label[i][data[i][1] - 1] = 1
        return label

    def _to_label_lists(one_hot):
        # Sparse pass turns each one-hot row into its list of label indices.
        coo = sparse.coo_matrix(one_hot).tocoo()
        lists = [[] for _ in xrange(one_hot.shape[0])]
        for i, j in izip(coo.row, coo.col):
            lists[i].append(j)
        assert sum(len(l) for l in lists) == coo.nnz
        return lists

    test_label = _one_hot(test_data)    # e.g. 1000 * classes
    train_label = _one_hot(train_data)  # e.g. 120 * classes

    test_in = np.asarray(test_embeddings)
    train_in = np.asarray(train_embeddings)

    y_train = np.array(_to_label_lists(train_label))
    y_test = _to_label_lists(test_label)

    clf = TopKRanker(LogisticRegression())
    clf.fit(train_in, y_train)
    # Predict exactly as many labels per node as the ground truth has.
    top_k_list = [len(l) for l in y_test]
    preds = clf.predict(test_in, top_k_list)
    return accuracy_score(y_test, preds)
def compute_correlation(dataset, embeddings, rpr_matrix):
    """Correlate DTW and Euclidean embedding distances over graph edges.

    For every directed neighbor pair (self-loops skipped), computes the
    Euclidean distance and the fastdtw distance between the two nodes'
    embedding vectors, then reports Pearson and Spearman correlations.

    NOTE(review): rpr_matrix is accepted but never used in this body —
    confirm whether the DTW distance was meant to run on rpr_matrix rows.
    """
    graph, _, _, _ = load_pdata(dataset)
    euclid_dists = []
    dtw_dists = []
    for src in graph:
        for dst in graph[src]:
            if src == dst:
                continue  # ignore self-loops
            euclid_dists.append(np.linalg.norm(embeddings[src] - embeddings[dst]))
            dtw_d, _ = fastdtw(embeddings[src], embeddings[dst],
                               radius=1, dist=cost)
            dtw_dists.append(dtw_d)
    pear_rho, pear_p = pearsonr(dtw_dists, euclid_dists)
    spea_rho, spea_p = spearmanr(dtw_dists, euclid_dists)
    return "P ratio and p: {:.2f} + {:.2f}, S ratio and p: {:.2f} + {:.2f}".format(
        pear_rho, pear_p, spea_rho, spea_p)
# NOTE(review): collapsed fragment, left byte-identical. The first half is the
# tail of a classifier-evaluation function whose enclosing `def` is outside
# this view (it mirrors feature_test's ending: fit a TopKRanker on train
# embeddings, predict top-k label sets, score accuracy).
# The second half is the script entry point: load the dataset named in
# argv[1], gather the raw feature rows for the train/test node ids, and print
# feature_test's accuracy when the raw features are used as "embeddings".
# NOTE(review): mask_rate is parsed from argv[2] but never used here — confirm.
y_test[i].append(j) y_train = np.array(y_train) #y_test = np.array(y_test) clf = TopKRanker(LogisticRegression()) clf.fit(train_in, y_train) top_k_list = [len(l) for l in y_test] preds = clf.predict(test_in, top_k_list) acc = accuracy_score(y_test, preds) return acc if __name__ == '__main__': prefix = sys.argv[1] mask_rate = float(sys.argv[2]) G, feats, train_data, test_data = load_pdata(prefix) features = np.asarray(feats.todense()) test_id = test_data[:, 0] train_id = train_data[:, 0] feat_train = [] feat_test = [] for id_ in train_id: feat_train.append(features[id_]) for id_ in test_id: feat_test.append(features[id_]) acc_f = feature_test(prefix, feat_train, feat_test) print("feats: {:.3f}".format(acc_f))
# NOTE(review): collapsed script fragment, cut off mid-loop at the end — left
# byte-identical. It reads an embedding file path from argv[2] (skipping the
# header line and mapping node id -> line index in emb_dic), the class count
# from argv[4], and the dataset name from argv[3], then builds one-hot label
# matrices for the test split and (truncated here) the train split.
# Labels in column 1 appear to be 1-based — TODO confirm against load_pdata.
dic_p = sys.argv[2] emb_dic = {} with open(dic_p, 'r') as f: k = 0 for line in f: if k == 0: k += 1 continue else: word = line.strip().split()[0] word = int(word) emb_dic[word] = k k += 1 classes = int(sys.argv[4]) _, _, train_data, test_data = load_pdata(sys.argv[3]) index = test_data[:, 0] test_l = test_data[:, 1] test_label = [] for i in xrange(test_data.shape[0]): temp = [0] * classes temp[test_data[i][1] - 1] += 1 test_label.append(temp) test_label = np.array(test_label) #1000 * 6 train_index = train_data[:, 0] train_l = train_data[:, 1] train_label = [] for i in xrange(train_data.shape[0]): temp = [0] * classes temp[train_data[i][1] - 1] += 1
# main(): end-to-end training driver (code left byte-identical; collapsed
# formatting preserved).
#  - If FLAGS.preprocess: build the Rooted PageRank (RPR) matrix, training
#    pairs and degree vectors and dump them under ./var/; otherwise reload
#    them from disk, plus the (node, node) pairs from *_normal_walks.txt.
#  - Builds the minibatch iterator and the dense node-feature matrix, then
#    either a PretrainModel seeded with node2vec embeddings (FLAGS.PRETRAIN)
#    or a plain AggregateModel.
def main(): G = read_graph() if FLAGS.preprocess: print(" - Computing Rooted PageRank matrix...") rpr_matrix, pairs, rpr_arg = construct_rpr_matrix(G) utils.dump_to_disk(rpr_arg, './var/' + FLAGS.train_prefix + '_rpr_arg') print(" - RPR matrix completed.") degrees, degree_permuted = utils.create_degree(G) print(" - Dumping degree vectors to disk...") utils.dump_to_disk(degrees, './var/' + FLAGS.train_prefix + '_degrees') utils.dump_to_disk(degree_permuted, './var/' + FLAGS.train_prefix + '_degree_permuted') print(" - Degree vectors dumped.") else: print(" - Loading precomputed Rooted PageRank matrix...") rpr_file = './var/' + FLAGS.train_prefix + '_rpr.mat' rpr_matrix = sio.loadmat(rpr_file)['rpr_matrix'] rpr_arg = utils.load_pkl('./var/' + FLAGS.train_prefix + '_rpr_arg') print(" - RPR matrix loaded.") print(" - Loading Degree vectors...") degrees = utils.load_pkl('./var/' + FLAGS.train_prefix + '_degrees') degree_permuted = utils.load_pkl('./var/' + FLAGS.train_prefix + '_degree_permuted') print(" - Degree vectors loaded.") pairs = [] with open('./var/' + FLAGS.train_prefix + '_normal_walks.txt', 'r') as fp: for line in fp: n_pair = line.split() pairs.append((int(n_pair[0]), int(n_pair[1]))) print(" - Training pairs loaded") placeholders = construct_placeholders() minibatch = MinibatchIterator(G, placeholders, degrees, rpr_matrix, pairs, batchsize=FLAGS.batchsize, stru_rate=FLAGS.stru_rate, dataset=FLAGS.train_prefix) _, features, _, _ = utils.load_pdata(FLAGS.train_prefix) # TODO: maybe can be more efficiently written by sparse multipications features = np.asarray(features.todense()) if FLAGS.PRETRAIN: from gensim.models.keyedvectors import KeyedVectors n2v_embedding = './baselines/{}_{}.embeddings'.format( 'node2vec', FLAGS.train_prefix) n_model = KeyedVectors.load_word2vec_format(n2v_embedding, binary=False) pretrained = np.asarray( [n_model[str(node)] for node in xrange(rpr_matrix.shape[0])]) model = PretrainModel(placeholders, features, pretrained, 
# (model construction continues above/below) TF session/saver/summary setup,
# then the training loop: each minibatch runs one optimizer step; every
# FLAGS.verbose iterations either report the structural/Euclidean distance
# correlation (FLAGS.CORR) or push the train/test feed dicts through the
# model to extract embeddings for downstream evaluation.
len(G.nodes()), degree_permuted, rpr_matrix, rpr_arg, dropout=FLAGS.dropout, nodevec_dim=FLAGS.dim, lr=FLAGS.learning_rate, logging=True) else: model = AggregateModel(placeholders, features, len(G.nodes()), degree_permuted, rpr_matrix, rpr_arg, dropout=FLAGS.dropout, nodevec_dim=FLAGS.dim, lr=FLAGS.learning_rate, logging=True) config = tf.ConfigProto(log_device_placement=FLAGS.log_device_placement) config.gpu_options.allow_growth = True config.allow_soft_placement = True sess = tf.Session(config=config) saver = tf.train.Saver(max_to_keep=5) merged = tf.summary.merge_all() summary_writer = tf.summary.FileWriter(log_dir(), sess.graph) # Init variables sess.run(tf.global_variables_initializer()) # Train model total_steps = 0 average_time = 0.0 average_test = 0.0 test_steps = 0 epoch_test_acc = [0.0] for epoch in xrange(FLAGS.epoch): minibatch.shuffle() _iter = 0 print("Epoch : %02d" % (epoch + 1), "Batchs per epoch : %04d" % (len(pairs) / FLAGS.batchsize)) while not minibatch.end(): feed_dict = minibatch.next_minibatch_feed_dict() t = time.time() # training step outs = sess.run( [merged, model.opt_op, model.loss, model.embeddings], feed_dict=feed_dict) train_cost = outs[2] average_time = (average_time * total_steps + time.time() - t) / (total_steps + 1) if _iter % FLAGS.verbose == 0: if FLAGS.CORR: all_feed = minibatch.all_feed_dict() out = sess.run([ model.train_inputs_all, model.train_inputs_f, model.embed, model.loss ], feed_dict=all_feed) str_corr = test.compute_correlation( FLAGS.train_prefix, out[1], rpr_matrix) print("Epoch: ", '%02d' % (epoch + 1), "iter: ", '%03d' % _iter, "loss: ", "{:.3f}".format(train_cost), "corr: ", str_corr, "train time: ", "{:.3f}".format(average_time)) else: train_feed, test_feed = minibatch.test_feed_dict() out_train = sess.run([ model.train_inputs_all, model.train_inputs_f, model.embed ], feed_dict=train_feed) t1 = time.time() out_test = sess.run([ model.train_inputs_all, model.train_inputs_f, model.embed ], feed_dict=test_feed) 
# Per-evaluation bookkeeping: running test-time average, current/best
# classification accuracy per epoch, then periodic checkpointing.
# NOTE(review): `if epoch % FLAGS.save_per_epoch:` saves on epochs that are
# NOT multiples of save_per_epoch — this condition looks inverted (probably
# meant `== 0`); confirm before relying on the saved checkpoints.
average_test = (average_test * test_steps + time.time() - t1) / (test_steps + 1) test_steps += 1 acc_f = test.feature_test(FLAGS.train_prefix, out_train[1], out_test[1]) epoch_test_acc.append(acc_f) print("Epoch: ", '%02d' % (epoch + 1), "iter: ", '%03d' % _iter, "loss: ", "{:.3f}".format(train_cost), "now acc: ", "{:.3f}".format(epoch_test_acc[-1]), "best acc: ", "{:.3f}".format(max(epoch_test_acc)), "train time: ", "{:.3f}".format(average_time), "test time: ", "{:.3f}".format(average_test)) _iter += 1 total_steps += 1 if epoch % FLAGS.save_per_epoch: saver.save(sess, os.path.join(log_dir(), 'model.ckpt'), epoch) print("Optimization finished !")