def gae_for(args): print("Using {} dataset".format(args.dataset_str)) adj, features = load_data(args.dataset_str) n_nodes, feat_dim = features.shape # Store original adjacency matrix (without diagonal entries) for later adj_orig = adj adj_orig = adj_orig - sp.dia_matrix( (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape) adj_orig.eliminate_zeros() adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges( adj) adj = adj_train # Some preprocessing adj_norm = preprocess_graph(adj) adj_label = adj_train + sp.eye(adj_train.shape[0]) # adj_label = sparse_to_tuple(adj_label) adj_label = torch.FloatTensor(adj_label.toarray()) pos_weight = torch.Tensor( [float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()]) norm = adj.shape[0] * adj.shape[0] / float( (adj.shape[0] * adj.shape[0] - adj.sum()) * 2) model = GCNModelVAE(feat_dim, args.hidden1, args.hidden2, args.dropout) optimizer = optim.Adam(model.parameters(), lr=args.lr) hidden_emb = None for epoch in range(args.epochs): t = time.time() model.train() optimizer.zero_grad() recovered, mu, logvar = model(features, adj_norm) loss = loss_function(preds=recovered, labels=adj_label, mu=mu, logvar=logvar, n_nodes=n_nodes, norm=norm, pos_weight=pos_weight) loss.backward() cur_loss = loss.item() optimizer.step() hidden_emb = mu.data.numpy() roc_curr, ap_curr = get_roc_score(hidden_emb, adj_orig, val_edges, val_edges_false) print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(cur_loss), "val_ap=", "{:.5f}".format(ap_curr), "time=", "{:.5f}".format(time.time() - t)) print("Optimization Finished!") roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false) print('Test ROC score: ' + str(roc_score)) print('Test AP score: ' + str(ap_score))
def gae_for(args): print("Using {} dataset".format(args.dataset_str)) adj, features, y_test, tx, ty, test_maks, true_labels = load_data( args.dataset_str) n_nodes, feat_dim = features.shape # Store original adjacency matrix (without diagonal entries) for later adj_orig = adj adj_orig = adj_orig - sp.dia_matrix( (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape) adj_orig.eliminate_zeros() adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges( adj) adj = adj_train # Before proceeding further, make the structure for doing deepWalk if args.dw == 1: print('Using deepWalk regularization...') G = load_edgelist_from_csr_matrix(adj_orig, undirected=True) print("Number of nodes: {}".format(len(G.nodes()))) num_walks = len(G.nodes()) * args.number_walks print("Number of walks: {}".format(num_walks)) data_size = num_walks * args.walk_length print("Data size (walks*length): {}".format(data_size)) # Some preprocessing adj_norm = preprocess_graph(adj) adj_label = adj_train + sp.eye(adj_train.shape[0]) # adj_label = sparse_to_tuple(adj_label) adj_label = torch.FloatTensor(adj_label.toarray()) pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum() norm = adj.shape[0] * adj.shape[0] / float( (adj.shape[0] * adj.shape[0] - adj.sum()) * 2) if args.model == 'gcn_vae': model = GCNModelVAE(feat_dim, args.hidden1, args.hidden2, args.dropout) else: model = GCNModelAE(feat_dim, args.hidden1, args.hidden2, args.dropout) optimizer = optim.Adam(model.parameters(), lr=args.lr) if args.dw == 1: sg = SkipGram(args.hidden2, adj.shape[0]) optimizer_dw = optim.Adam(sg.parameters(), lr=args.lr_dw) # Construct the nodes for doing random walk. Doing it before since the seed is fixed nodes_in_G = list(G.nodes()) chunks = len(nodes_in_G) // args.number_walks random.Random().shuffle(nodes_in_G) hidden_emb = None for epoch in tqdm(range(args.epochs)): t = time.time() model.train() optimizer.zero_grad() z, mu, logvar = model(features, adj_norm) # After back-propagating gae loss, now do the deepWalk regularization if args.dw == 1: sg.train() if args.full_number_walks > 0: walks = build_deepwalk_corpus(G, num_paths=args.full_number_walks, path_length=args.walk_length, alpha=0, rand=random.Random(SEED)) else: walks = build_deepwalk_corpus_iter( G, num_paths=args.number_walks, path_length=args.walk_length, alpha=0, rand=random.Random(SEED), chunk=epoch % chunks, nodes=nodes_in_G) for walk in walks: if args.context == 1: # Construct the pairs for predicting context node # for each node, treated as center word curr_pair = (int(walk[center_node_pos]), []) for center_node_pos in range(len(walk)): # for each window position for w in range(-args.window_size, args.window_size + 1): context_node_pos = center_node_pos + w # make soure not jump out sentence if context_node_pos < 0 or context_node_pos >= len( walk ) or center_node_pos == context_node_pos: continue context_node_idx = walk[context_node_pos] curr_pair[1].append(int(context_node_idx)) else: # first item in the walk is the starting node curr_pair = (int(walk[0]), [ int(context_node_idx) for context_node_idx in walk[1:] ]) if args.ns == 1: neg_nodes = [] pos_nodes = set(walk) while len(neg_nodes) < args.walk_length - 1: rand_node = random.randint(0, n_nodes - 1) if rand_node not in pos_nodes: neg_nodes.append(rand_node) neg_nodes = torch.from_numpy(np.array(neg_nodes)).long() # Do actual prediction src_node = torch.from_numpy(np.array([curr_pair[0]])).long() tgt_nodes = 
torch.from_numpy(np.array(curr_pair[1])).long() optimizer_dw.zero_grad() log_pos = sg(src_node, tgt_nodes, neg_sample=False) if args.ns == 1: loss_neg = sg(src_node, neg_nodes, neg_sample=True) loss_dw = log_pos + loss_neg else: loss_dw = log_pos loss_dw.backward(retain_graph=True) cur_dw_loss = loss_dw.item() optimizer_dw.step() loss = loss_function(preds=model.dc(z), labels=adj_label, mu=mu, logvar=logvar, n_nodes=n_nodes, norm=norm, pos_weight=pos_weight) loss.backward() cur_loss = loss.item() optimizer.step() hidden_emb = mu.data.numpy() roc_curr, ap_curr = get_roc_score(hidden_emb, adj_orig, val_edges, val_edges_false) if args.dw == 1: tqdm.write( "Epoch: {}, train_loss_gae={:.5f}, train_loss_dw={:.5f}, val_ap={:.5f}, time={:.5f}" .format(epoch + 1, cur_loss, cur_dw_loss, ap_curr, time.time() - t)) else: tqdm.write( "Epoch: {}, train_loss_gae={:.5f}, val_ap={:.5f}, time={:.5f}". format(epoch + 1, cur_loss, ap_curr, time.time() - t)) if (epoch + 1) % 10 == 0: tqdm.write("Evaluating intermediate results...") kmeans = KMeans(n_clusters=args.n_clusters, random_state=0).fit(hidden_emb) predict_labels = kmeans.predict(hidden_emb) cm = clustering_metrics(true_labels, predict_labels) cm.evaluationClusterModelFromLabel(tqdm) roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false) tqdm.write('ROC: {}, AP: {}'.format(roc_score, ap_score)) np.save('logs/emb_epoch_{}.npy'.format(epoch + 1), hidden_emb) tqdm.write("Optimization Finished!") roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false) tqdm.write('Test ROC score: ' + str(roc_score)) tqdm.write('Test AP score: ' + str(ap_score)) kmeans = KMeans(n_clusters=args.n_clusters, random_state=0).fit(hidden_emb) predict_labels = kmeans.predict(hidden_emb) cm = clustering_metrics(true_labels, predict_labels) cm.evaluationClusterModelFromLabel(tqdm) if args.plot == 1: cm.plotClusters(tqdm, hidden_emb, true_labels)
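
# build_deepwalk_corpus / build_deepwalk_corpus_iter above come from the
# DeepWalk codebase. Below is a simplified sketch of the core
# truncated-random-walk corpus construction, assuming G exposes G.nodes() and
# neighbor lookup via G[node] (networkx-style); the real helpers additionally
# support a restart probability `alpha` and chunked iteration over shuffled
# nodes.
def random_walk_sketch(G, start, path_length, rand=random):
    # Hop to a uniformly random neighbor until the walk reaches path_length.
    walk = [start]
    while len(walk) < path_length:
        neighbors = list(G[walk[-1]])
        if not neighbors:
            break
        walk.append(rand.choice(neighbors))
    return walk


def build_deepwalk_corpus_sketch(G, num_paths, path_length, rand=random):
    # num_paths passes over all nodes, each pass in a fresh shuffled order.
    walks = []
    nodes = list(G.nodes())
    for _ in range(num_paths):
        rand.shuffle(nodes)
        for node in nodes:
            walks.append(random_walk_sketch(G, node, path_length, rand=rand))
    return walks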
def gae_for(args): print("Using {} dataset".format(args.dataset_str)) adj, features = load_corpus(args.dataset_str) # n_nodes, feat_dim = features.shape # print(n_nodes, feat_dim) print(type(features)) print(adj) # print(adj[0], adj[1]) features = sp.identity(features.shape[0]) # featureless # print(adj.shape) # print(features.shape) # Some preprocessing features = preprocess_features(features) adj_norm = preprocess_adj(adj) num_supports = 1 # model_func = GCN adj_norm = torch.FloatTensor(adj_norm.toarray()) features = torch.FloatTensor(features.toarray()) n_nodes, feat_dim = features.shape print(n_nodes, feat_dim) print(type(features)) print(type(adj_norm)) print(features.shape) print(adj_norm.shape) # n_nodes, feat_dim = features.shape # print(n_nodes, feat_dim) # Store original adjacency matrix (without diagonal entries) for later # adj_orig = adj # adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape) # adj_orig.eliminate_zeros() # modified/added by hollis # adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj) # Remove diagonal elements # adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape) # adj.eliminate_zeros() # Check that diag is zero: # assert np.diag(adj.todense()).sum() == 0 # adj_train = sp.csr_matrix(adj) # adj_train = adj_train + adj_train.T # Some preprocessing # adj_norm = normalize_adj(adj) # adj_label = adj_train + sp.eye(adj_train.shape[0]) # adj_label = sparse_to_tuple(adj_label) # adj_label = torch.FloatTensor(adj_label.toarray()) # adj_label = np.array(adj_label, dtype=float) # adj_label = torch.FloatTensor(adj_label) pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum() # added by hollis # pos_weight = torch.from_numpy(np.array(pos_weight)) norm = adj.shape[0] * adj.shape[0] / float( (adj.shape[0] * adj.shape[0] - adj.sum()) * 2) # model = GCNModelVAE(feat_dim, args.hidden1, args.hidden2, args.dropout) model = GCNModelVAE(feat_dim, args.hidden1, args.dropout) optimizer = optim.Adam(model.parameters(), lr=args.lr) hidden_emb = None for epoch in range(args.epochs): print("in epoch") t = time.time() model.train() optimizer.zero_grad() # recovered, mu, logvar = model(features, adj_norm) print("before model") recovered, mu, logvar = model(features, adj_norm) print("before loss") loss = loss_function(preds=recovered, labels=adj_norm, mu=mu, logvar=logvar, n_nodes=n_nodes, norm=norm) #loss = loss_function(preds=recovered, labels=adj_label, # mu=mu, logvar=logvar, n_nodes=n_nodes, # norm=norm, pos_weight=pos_weight) print("befor backword") loss.backward() cur_loss = loss.item() optimizer.step() hidden_emb = mu.data.numpy() hidden_emb = np.array(hidden_emb) if epoch == 1: fni = "./result/emb_init.txt" hidden_emb = np.array(hidden_emb) np.savetxt(fni, hidden_emb) if epoch == args.epochs - 1: fnf = "./result/emb.txt" hidden_emb = np.array(hidden_emb) np.savetxt(fnf, hidden_emb) #roc_curr, ap_curr = get_roc_score(hidden_emb, adj_orig, val_edges, val_edges_false) print( "Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(cur_loss), # "val_ap=", "{:.5f}".format(ap_curr), "time=", "{:.5f}".format(time.time() - t)) print("Optimization Finished!")
def gae_for(args, position):
    print("Using {} dataset".format(args.dataset_str))
    Q, X = load_data()

    # Hard-coded paths to the prebuilt kNN graph and GeM descriptors for
    # revisited Oxford. Disabled alternatives in this variant pointed at fused
    # query/index/distractor features, truncating to a cut_size for the 1M
    # distractor set, and building the graph on the fly with
    # gen_graph_index / gen_graph (k=5, k_qe=3, do_qe=False).
    prebuild = "/media/chundi/3b6b0f74-0ac7-42c7-b76b-00c65f5b3673/revisitop/cnnimageretrieval-pytorch/data/test/matlab_data/GEM_wDis_prebuild.bin"
    Q_features = "/media/chundi/3b6b0f74-0ac7-42c7-b76b-00c65f5b3673/revisitop/cnnimageretrieval-pytorch/data/test/matlab_data/roxford5k_GEM_lw_query_feats.npy"
    X_features = "/media/chundi/3b6b0f74-0ac7-42c7-b76b-00c65f5b3673/revisitop/cnnimageretrieval-pytorch/data/test/matlab_data/roxford5k_GEM_index.npy"
    D_features = "/media/chundi/3b6b0f74-0ac7-42c7-b76b-00c65f5b3673/revisitop/cnnimageretrieval-pytorch/data/test/matlab_data/roxford5k_GEM_Dis.npy"
    adj, features, adj_Q, features_Q = load_from_prebuild(
        prebuild, Q_features, X_features, D_features, k=5)

    # Build the evaluation graph the same way as the training graph: stack the
    # query rows on top of the index adjacency and left-pad with zero columns
    # so the result is square over queries + index.
    features_all = np.concatenate([features_Q, features])
    features_all = torch.from_numpy(features_all)
    adj_all = sp.vstack((adj_Q, adj))
    zeros = sp.csr_matrix((adj_all.shape[0], Q.shape[1]))
    adj_all = sp.hstack((zeros, adj_all))
    adj_all = sp.csr_matrix(adj_all)

    print("Making Symmetry")
    # nonzero() returns row indices in sorted order, so we can stop at the
    # first entry outside the query block.
    rows, columns = adj_all.nonzero()
    for i in range(rows.shape[0]):
        if rows[i] < Q.shape[1]:
            adj_all[columns[i], rows[i]] = adj_all[rows[i], columns[i]]
        else:
            break

    print("preprocessing adj_all")
    adj_all_norm = preprocess_graph(adj_all)

    features = torch.from_numpy(features)
    n_nodes, feat_dim = features.shape

    # Store the original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()
    adj = adj_orig

    print("Sampling validation")
    adj_train, adj_val, features, features_valid = mask_test_rows(adj, features)
    adj = adj_train

    # Some preprocessing
    print("preprocessing adj")
    adj_norm = preprocess_graph(adj)
    adj_label = adj_train + sp.eye(adj_train.shape[0])

    print("adj sum: " + str(adj.sum()))
    pos_weight = float(float(adj.shape[0]) * adj.shape[0] - adj.sum()) / adj.sum()
    print("top part: " + str(float(float(adj.shape[0]) * adj.shape[0] - adj.sum())))
    print("pos weight: " + str(pos_weight))
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    # Validation data processing: stack the held-out rows on top of the
    # zero-padded training adjacency.
    zero = sp.csr_matrix((adj_train.shape[0], adj_val.shape[0]))
    adj_train_ext = sp.hstack((zero, adj_train))
    adj_evaluate = sp.vstack((adj_val, adj_train_ext))
    adj_evaluate = sp.csr_matrix(adj_evaluate)
    rows, columns = adj_evaluate.nonzero()

    val_edges = []
    val_edges_false = []
    pos = {}
    print("getting positive edges")
    all_val = [i for i in range(len(rows)) if rows[i] < adj_val.shape[0]]
    for i in all_val:
        sys.stdout.write("\r sampling edges for validation: [" + str(i) + "]")
        val_edges.append((rows[i], columns[i]))
        if rows[i] not in pos:
            pos[rows[i]] = []
        pos[rows[i]].append(columns[i])

    step = 0
    neg_per_pos = 100
    # Negative-edge sampling for validation is disabled in this variant; one
    # alternative sampled neg_per_pos non-edges per positive row (optionally
    # in parallel via joblib) to fill val_edges_false.

    print("preprocessing adj_evaluate")
    adj_evaluate_norm = preprocess_graph(adj_evaluate)
    adj_label_evaluate = adj_evaluate + sp.eye(adj_evaluate.shape[0])
    features_evaluate = np.concatenate([features, features_valid])
    features_evaluate = torch.from_numpy(features_evaluate)
    # validation done

    # `mode` is assumed to be a module-level switch selecting the model family.
    if mode == "VAE":
        model = GCNModelVAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
        adj_label = torch.FloatTensor(adj_label.toarray())
        adj_label_evaluate = torch.FloatTensor(adj_label_evaluate.toarray())
    elif mode == "AE":
        model = GCNModelAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
    elif mode == "VAE_batch":
        model = GCNModelVAE_batch(feat_dim, args.hidden1, args.hidden2,
                                  args.dropout)
    elif mode == "AE_batch":
        model = GCNModelAE_batch(feat_dim, args.hidden1, args.hidden2,
                                 args.dropout).cuda()

    # Mini-batches are drawn over node ids; the batched loss below slices
    # adj_label with them (which assumes adj_label is still scipy-sparse,
    # i.e. one of the non-VAE modes).
    train_ids = torch.tensor(range(features.shape[0]), dtype=torch.long)
    train_dataset = TensorDataset(train_ids)
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True, num_workers=8, pin_memory=True)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    hidden_emb = None
    # NOTE: pos_weight is overwritten with 0.0 here, which zeroes out the
    # positive re-weighting in the batched BCE below.
    pos_weight = torch.from_numpy(np.array(0.0, dtype=np.float32))

    t = time.time()
    best = 0
    best_epoch = 0
    best_val_cost = 99999
    best_val_epoch = 0
    best_val_epoch_revop = 0
    prev_loss = 0
    prev_val_loss = 99999
    best_val_roc = 0
    best_val_ap = 0
    best_val_roc_revop = 0
    best_val_ap_revop = 0
    torch.set_num_threads(20)
    print("NUM THREADS USED")
    print(torch.get_num_threads())

    for epoch in range(args.epochs):
        model.train()
        lossVal = 0
        lossValNorm = 0
        backtime = time.time()
        for batchID, (inds,) in enumerate(train_loader):
            z = model(features, adj_norm)
            preds = F.relu(torch.mm(z[inds], z[inds].t()))
            label_batch = torch.FloatTensor(
                adj_label[inds, :][:, inds].toarray())
            cost = norm * F.binary_cross_entropy_with_logits(
                preds, label_batch, pos_weight=pos_weight)
            lossVal += cost.item()
            lossValNorm += 1
            optimizer.zero_grad()
            cost.backward(retain_graph=True)
            optimizer.step()
            if batchID >= 10:
                break
        sys.stdout.write("\r time taken to do epoch: "
                         + str(time.time() - backtime) + "\n")
        sys.stdout.flush()
        # Alternatives (disabled): sample row blocks, or rows plus their
        # positive neighbours, to train on square sub-adjacencies with
        # per-sample pos_weight and norm instead of the batched loop above.

        if (epoch + 1) % 1 == 0:
            model.eval()
            just_adj_evaluate = sparse_mx_to_torch_sparse_tensor(adj_evaluate)
            # Validation loss / ROC / AP over adj_evaluate is disabled in this
            # variant; only the retrieval mAP (revop) below is tracked.

            # Full-graph forward pass (queries + index) for retrieval scoring.
            if mode == "VAE":
                _, mu, _ = model(features_all, adj_all_norm)
            elif mode == "AE":
                mu = model(features_all, adj_all_norm)
            elif mode == "VAE_batch":
                mu = model(features_all, adj_all_norm, None, None, None, None,
                           None, just_mu=True, training=False)
            elif mode == "AE_batch":
                mu = model(features_all, adj_all_norm, None, None, None, None,
                           None, just_mu=True, training=False)
            hidden_emb = mu.data.numpy()
            revop_map = get_roc_score_matrix(hidden_emb, Q.shape[1])

            if best <= revop_map:
                emb = hidden_emb
                Q_end = Q.shape[1]
                best = revop_map
                best_epoch = epoch + 1
                # A disabled branch here dumped the ranked retrieval lists for
                # the best embedding to best_result.txt.
            if best_val_cost > -99.0:
                best_val_cost = -99.0
                best_val_epoch = epoch + 1
                best_val_epoch_revop = revop_map

            print("Epoch:", '%04d' % (epoch + 1),
                  "train_loss=", "{:.5f}".format(lossVal / lossValNorm),
                  "val_loss=", "{:.5f}".format(-99.0),
                  "revop=", "{:.5f}".format(revop_map),
                  "best_revop=", "{:.5f}".format(best),
                  "revop_at_best_val=", "{:.5f}".format(best_val_epoch_revop),
                  "time=", "{:.5f}".format(time.time() - t))
            prev_val_loss = -99.0
            prev_loss = -99.0
            t = time.time()

    print("Optimization Finished!")
    return best, best_val_epoch_revop, best_val_roc_revop, best_val_ap_revop
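
# sparse_mx_to_torch_sparse_tensor is used above but defined elsewhere. A
# minimal sketch of the standard scipy-to-torch conversion.
def sparse_mx_to_torch_sparse_tensor_sketch(sparse_mx):
    # COO layout exposes aligned (row, col) index arrays and values.
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    return torch.sparse_coo_tensor(indices, values,
                                   torch.Size(sparse_mx.shape))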
def GAEembedding(z, adj, args):
    '''
    GAE embedding for clustering.
    Param: z, adj
    Return: embedding learned from the graph
    '''
    # Typical defaults in the variant this derives from: model='gcn_vae',
    # dw=0, epochs=200, hidden1=32, hidden2=16, lr=0.01, dropout=0.,
    # walk_length=5, window_size=3, number_walks=5, full_number_walks=0,
    # lr_dw=0.001, context=0, ns=1, n_clusters=11, plot=0.

    # Features come from z (e.g. Louvain-based preprocessing upstream).
    features = torch.FloatTensor(z)

    n_nodes, feat_dim = features.shape

    # Store the original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
    adj = adj_train

    # The DeepWalk regularization path (args.dw == 1) of the gae_for variant
    # above is left out of this function.

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = torch.FloatTensor(adj_label.toarray())

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    if args.GAEmodel == 'gcn_vae':
        model = GCNModelVAE(feat_dim, args.GAEhidden1, args.GAEhidden2,
                            args.GAEdropout)
    else:
        model = GCNModelAE(feat_dim, args.GAEhidden1, args.GAEhidden2,
                           args.GAEdropout)
    if args.precisionModel == 'Double':
        model = model.double()
    optimizer = optim.Adam(model.parameters(), lr=args.GAElr)

    hidden_emb = None
    for epoch in tqdm(range(args.GAEepochs)):
        t = time.time()
        model.train()
        optimizer.zero_grad()
        z, mu, logvar = model(features, adj_norm)
        loss = loss_function(preds=model.dc(z), labels=adj_label,
                             mu=mu, logvar=logvar, n_nodes=n_nodes,
                             norm=norm, pos_weight=pos_weight)
        loss.backward()
        cur_loss = loss.item()
        optimizer.step()

        hidden_emb = mu.data.numpy()
        # TODO: validation-time link prediction
        # roc_curr, ap_curr = get_roc_score(hidden_emb, adj_orig, val_edges, val_edges_false)
        ap_curr = 0
        tqdm.write(
            "Epoch: {}, train_loss_gae={:.5f}, val_ap={:.5f}, time={:.5f}".format(
                epoch + 1, cur_loss, ap_curr, time.time() - t))

    tqdm.write("Optimization Finished!")

    roc_score, ap_score = get_roc_score(hidden_emb, adj_orig,
                                        test_edges, test_edges_false)
    tqdm.write('Test ROC score: ' + str(roc_score))
    tqdm.write('Test AP score: ' + str(ap_score))

    return hidden_emb
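
# get_roc_score is shared by the variants above and below. A sketch of the
# standard GAE link-prediction evaluation: score candidate edges by the
# sigmoid of embedding inner products, then compute ROC-AUC and average
# precision over held-out positive and negative edges. adj_orig is kept in
# the signature for compatibility but is not needed for the scores.
from sklearn.metrics import average_precision_score, roc_auc_score


def get_roc_score_sketch(emb, adj_orig, edges_pos, edges_neg):
    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    # Reconstructed edge logits are inner products of node embeddings.
    adj_rec = np.dot(emb, emb.T)
    preds_pos = [sigmoid(adj_rec[u, v]) for u, v in edges_pos]
    preds_neg = [sigmoid(adj_rec[u, v]) for u, v in edges_neg]

    preds_all = np.hstack([preds_pos, preds_neg])
    labels_all = np.hstack([np.ones(len(preds_pos)),
                            np.zeros(len(preds_neg))])
    return (roc_auc_score(labels_all, preds_all),
            average_precision_score(labels_all, preds_all))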
def gae_for(args): print("Using {} dataset".format(args.dataset_str)) adj, features = load_data(args.dataset_str) n_nodes, feat_dim = features.shape # Store original adjacency matrix (without diagonal entries) for later adj_orig = adj adj_orig = adj_orig - sp.dia_matrix( (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape) adj_orig.eliminate_zeros() adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges( adj) adj = adj_train # Some preprocessing adj_norm = preprocess_graph(adj) adj_label = adj_train + sp.eye(adj_train.shape[0]) # adj_label = sparse_to_tuple(adj_label) adj_label = torch.FloatTensor(adj_label.toarray()) pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum() norm = adj.shape[0] * adj.shape[0] / float( (adj.shape[0] * adj.shape[0] - adj.sum()) * 2) lst_result = [] for i in range(10): model = GCNModelVAE(feat_dim, args.hidden1, args.hidden2, args.dropout) optimizer = optim.Adam(model.parameters(), lr=args.lr) hidden_emb = None max_roc_ap = 0 for epoch in range(args.epochs): t = time.time() model.train() optimizer.zero_grad() recovered, mu, logvar = model(features, adj_norm) loss = loss_function(preds=recovered, labels=adj_label, mu=mu, logvar=logvar, n_nodes=n_nodes, norm=norm, pos_weight=pos_weight) loss.backward() cur_loss = loss.item() optimizer.step() hidden_emb = mu.data.numpy() roc_curr, ap_curr = get_roc_score(hidden_emb, adj_orig, val_edges, val_edges_false) roc_ap = roc_curr + ap_curr if max_roc_ap < roc_ap: max_roc_ap = roc_ap h_emb_best_model = hidden_emb print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(cur_loss), "val_ap=", "{:.5f}".format(ap_curr), "time=", "{:.5f}".format(time.time() - t)) roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false) print('Test ROC score: ' + str(roc_score)) print('Test AP score: ' + str(ap_score)) print("---------------------------------------") print("Optimization Finished!: ", i) roc_score, ap_score = get_roc_score(h_emb_best_model, adj_orig, test_edges, test_edges_false) # roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false) lst_result.append([i, roc_score, ap_score]) print('Test ROC score: ' + str(roc_score)) print('Test AP score: ' + str(ap_score)) lst_result = np.array(lst_result) csv_info = np.append( lst_result, [["mean", np.mean(lst_result[:, 1]), np.mean(lst_result[:, 2])]], axis=0) csv_info = np.append( csv_info, [["std", np.std(lst_result[:, 1]), np.std(lst_result[:, 2])]], axis=0) t = int(time.time()) folder = Path(os.path.join(os.getcwd(), "csv")) csv_name = "{}_{}_{}_{}_{}.csv".format(args.dataset_str, args.epochs, args.hidden1, args.hidden2, t) df = pd.DataFrame(csv_info, columns=['run', 'ROC', "AP"]) df.to_csv(os.path.join(folder, csv_name))
def GAEembedding(z, adj, args):
    '''
    GAE embedding for clustering.
    Param: z, adj
    Return: embedding learned from the graph
    '''
    # Features come from z (e.g. Louvain-based preprocessing upstream).
    features = torch.FloatTensor(z)

    n_nodes, feat_dim = features.shape

    # Store the original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
    adj = adj_train

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = torch.FloatTensor(adj_label.toarray())

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    if args.GAEmodel == 'gcn_vae':
        model = GCNModelVAE(feat_dim, args.GAEhidden1, args.GAEhidden2,
                            args.GAEdropout)
    else:
        model = GCNModelAE(feat_dim, args.GAEhidden1, args.GAEhidden2,
                           args.GAEdropout)
    if args.precisionModel == 'Double':
        model = model.double()
    optimizer = optim.Adam(model.parameters(), lr=args.GAElr)

    hidden_emb = None
    for epoch in tqdm(range(args.GAEepochs)):
        t = time.time()
        model.train()
        optimizer.zero_grad()
        z, mu, logvar = model(features, adj_norm)
        loss = loss_function(preds=model.dc(z), labels=adj_label,
                             mu=mu, logvar=logvar, n_nodes=n_nodes,
                             norm=norm, pos_weight=pos_weight)
        loss.backward()
        cur_loss = loss.item()
        optimizer.step()

        hidden_emb = mu.data.numpy()
        # TODO: validation-time link prediction
        # roc_curr, ap_curr = get_roc_score(hidden_emb, adj_orig, val_edges, val_edges_false)
        ap_curr = 0
        tqdm.write(
            "Epoch: {}, train_loss_gae={:.5f}, val_ap={:.5f}, time={:.5f}".format(
                epoch + 1, cur_loss, ap_curr, time.time() - t))

    tqdm.write("Optimization Finished!")

    roc_score, ap_score = get_roc_score(hidden_emb, adj_orig,
                                        test_edges, test_edges_false)
    tqdm.write('Test ROC score: ' + str(roc_score))
    tqdm.write('Test AP score: ' + str(ap_score))

    return hidden_emb
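
# A usage sketch for GAEembedding: embed a graph and cluster the result with
# KMeans. The random feature matrix, random adjacency, and argument values
# below are illustrative placeholders, not inputs from any of the projects
# above.
from argparse import Namespace

if __name__ == '__main__':
    rng = np.random.default_rng(0)
    z = rng.random((100, 16)).astype(np.float32)  # placeholder node features
    adj = sp.random(100, 100, density=0.05, format='csr', random_state=0)
    adj = adj + adj.T  # symmetrize
    adj.data[:] = 1.0
    adj.setdiag(0)
    adj.eliminate_zeros()

    args = Namespace(GAEmodel='gcn_vae', GAEhidden1=32, GAEhidden2=16,
                     GAEdropout=0.0, GAElr=0.01, GAEepochs=50,
                     precisionModel='Float')
    emb = GAEembedding(z, adj, args)
    labels = KMeans(n_clusters=10, random_state=0).fit_predict(emb)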