def gae_for(args): print("Using {} dataset".format(args.dataset_str)) adj, features = load_data(args.dataset_str) n_nodes, feat_dim = features.shape # Store original adjacency matrix (without diagonal entries) for later adj_orig = adj adj_orig = adj_orig - sp.dia_matrix( (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape) adj_orig.eliminate_zeros() adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges( adj) adj = adj_train # Some preprocessing adj_norm = preprocess_graph(adj) adj_label = adj_train + sp.eye(adj_train.shape[0]) # adj_label = sparse_to_tuple(adj_label) adj_label = torch.FloatTensor(adj_label.toarray()) pos_weight = torch.Tensor( [float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()]) norm = adj.shape[0] * adj.shape[0] / float( (adj.shape[0] * adj.shape[0] - adj.sum()) * 2) model = GCNModelVAE(feat_dim, args.hidden1, args.hidden2, args.dropout) optimizer = optim.Adam(model.parameters(), lr=args.lr) hidden_emb = None for epoch in range(args.epochs): t = time.time() model.train() optimizer.zero_grad() recovered, mu, logvar = model(features, adj_norm) loss = loss_function(preds=recovered, labels=adj_label, mu=mu, logvar=logvar, n_nodes=n_nodes, norm=norm, pos_weight=pos_weight) loss.backward() cur_loss = loss.item() optimizer.step() hidden_emb = mu.data.numpy() roc_curr, ap_curr = get_roc_score(hidden_emb, adj_orig, val_edges, val_edges_false) print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(cur_loss), "val_ap=", "{:.5f}".format(ap_curr), "time=", "{:.5f}".format(time.time() - t)) print("Optimization Finished!") roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false) print('Test ROC score: ' + str(roc_score)) print('Test AP score: ' + str(ap_score))
def eval_each_q(model, adj_all, features_all, q_length):
    adj_q = adj_all[:q_length, q_length:]
    adj_i = adj_all[q_length:, q_length:]
    features_q = features_all[:q_length, :]
    features_i = features_all[q_length:, :]
    rankings = []
    for i in range(q_length):
        adj_all_evaluation = sp.vstack((adj_q[i, :], adj_i))
        zeros = sp.csr_matrix((adj_all_evaluation.shape[0], 1))
        adj_all_evaluation = sp.hstack((zeros, adj_all_evaluation))
        adj_all_evaluation = sp.csr_matrix(adj_all_evaluation)
        features_all_evaluation = np.concatenate([features_q[i:i + 1, :], features_i])
        features_all_evaluation = torch.from_numpy(features_all_evaluation)
        rows, columns = adj_all_evaluation.nonzero()
        for j in range(rows.shape[0]):
            if rows[j] < 1:
                adj_all_evaluation[columns[j], rows[j]] = adj_all_evaluation[rows[j], columns[j]]
            else:
                break
        adj_all_evaluation = preprocess_graph(adj_all_evaluation)
        if mode == "VAE":
            _, mu, _ = model(features_all_evaluation, adj_all_evaluation)
        elif mode == "AE":
            _, mu = model(features_all_evaluation, adj_all_evaluation)
        elif mode == "VAE_batch":
            mu = model(features_all_evaluation, adj_all_evaluation,
                       None, None, None, None, None, just_mu=True, training=False)
        elif mode == "AE_batch":
            mu = model(features_all_evaluation, adj_all_evaluation,
                       None, None, None, None, None, just_mu=True, training=False)
        emb = mu.data.numpy()
        embX = emb[1:, :].T
        embQ = emb[:1, :].T
        revop_inner_prod = np.matmul(embX.T, embQ)
        revop_preds = np.argsort(-revop_inner_prod, axis=0)
        rankings.append(revop_preds)
        sys.stdout.write("\r" + "evaluating queries: [" + str(i) + "/" + str(q_length) + "]")
        sys.stdout.flush()
    rankings = np.array(rankings)
    rankings = np.reshape(rankings, (rankings.shape[0], rankings.shape[1]))
    rankings = rankings.T
    revop_map = eval_revop(rankings, silent=True)
    return revop_map
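# eval_each_q() and the training loops rely on preprocess_graph() to produce the
# normalized adjacency fed to the GCN. A minimal sketch of the usual symmetric
# normalization D^-1/2 (A + I) D^-1/2, assuming a SciPy sparse input and a
# sparse_mx_to_torch_sparse_tensor() helper (a sketch of that conversion appears
# further below); the project's real helper may differ in its return type.
import numpy as np
import scipy.sparse as sp


def preprocess_graph_sketch(adj):
    adj = sp.coo_matrix(adj)
    adj_ = adj + sp.eye(adj.shape[0])  # add self-loops
    rowsum = np.array(adj_.sum(1))
    degree_mat_inv_sqrt = sp.diags(np.power(rowsum, -0.5).flatten())
    adj_normalized = adj_.dot(degree_mat_inv_sqrt).transpose().dot(degree_mat_inv_sqrt).tocoo()
    return sparse_mx_to_torch_sparse_tensor(adj_normalized)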
def run_main(args):
    # Define parameters
    epochs = args.epochs
    dim_au_out = args.bottleneck  # 8, 16, 32, 64, 128, 256, 512
    dim_dnn_in = dim_au_out
    dim_dnn_out = 1
    select_drug = args.drug
    na = args.missing_value
    data_path = args.data_path
    label_path = args.label_path
    test_size = args.test_size
    valid_size = args.valid_size
    g_disperson = args.var_genes_disp
    model_path = args.source_model_path
    encoder_path = args.encoder_path
    log_path = args.logging_file
    batch_size = args.batch_size
    encoder_hdims = args.encoder_h_dims.split(",")
    preditor_hdims = args.predictor_h_dims.split(",")
    reduce_model = args.dimreduce
    prediction = args.predition
    sampling = args.sampling
    PCA_dim = args.PCA_dim

    encoder_hdims = list(map(int, encoder_hdims))
    preditor_hdims = list(map(int, preditor_hdims))
    load_model = bool(args.load_source_model)

    preditor_path = model_path + reduce_model + args.predictor + prediction + select_drug + '.pkl'

    # Read data
    data_r = pd.read_csv(data_path, index_col=0)
    label_r = pd.read_csv(label_path, index_col=0)
    label_r = label_r.fillna(na)

    now = time.strftime("%Y-%m-%d-%H-%M-%S")
    ut.save_arguments(args, now)

    # Initialize logging and stderr redirection
    out_path = log_path + now + ".err"
    log_path = log_path + now + ".log"

    out = open(out_path, "w")
    sys.stderr = out

    logging.basicConfig(
        level=logging.INFO,  # logging level printed to the console
        filename=log_path,
        filemode='a',  # 'w' rewrites the log file on every run; 'a' (the default) appends
        format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'  # log format
    )
    logging.getLogger('matplotlib.font_manager').disabled = True
    logging.info(args)

    # data = data_r

    # Filter out na values
    selected_idx = label_r.loc[:, select_drug] != na

    if g_disperson is not None:
        hvg, adata = ut.highly_variable_genes(data_r, min_disp=g_disperson)
        # Rename columns if duplications exist
        data_r.columns = adata.var_names
        # Extract hvgs
        data = data_r.loc[selected_idx, hvg]
    else:
        data = data_r.loc[selected_idx, :]

    # Do PCA if PCA_dim != 0
    if PCA_dim != 0:
        data = PCA(n_components=PCA_dim).fit_transform(data)
    else:
        data = data

    # Extract labels
    label = label_r.loc[selected_idx, select_drug]

    # Scaling data
    mmscaler = preprocessing.MinMaxScaler()
    lbscaler = preprocessing.MinMaxScaler()

    data = mmscaler.fit_transform(data)
    label = label.values.reshape(-1, 1)

    if prediction == "regression":
        label = lbscaler.fit_transform(label)
        dim_model_out = 1
    else:
        le = LabelEncoder()
        label = le.fit_transform(label)
        dim_model_out = 2
        # label = label.values.reshape(-1, 1)

    logging.info(np.std(data))
    logging.info(np.mean(data))

    # Split train/valid/test sets
    X_train_all, X_test, Y_train_all, Y_test = train_test_split(
        data, label, test_size=test_size, random_state=42)
    X_train, X_valid, Y_train, Y_valid = train_test_split(
        X_train_all, Y_train_all, test_size=valid_size, random_state=42)

    # Sampling method
    if sampling is None:
        X_train, Y_train = sam.nosampling(X_train, Y_train)
        logging.info("nosampling")
    elif sampling == "upsampling":
        X_train, Y_train = sam.upsampling(X_train, Y_train)
        logging.info("upsampling")
    elif sampling == "downsampling":
        X_train, Y_train = sam.downsampling(X_train, Y_train)
        logging.info("downsampling")
    elif sampling == "SMOTE":
        X_train, Y_train = sam.SMOTEsampling(X_train, Y_train)
        logging.info("SMOTE")
    else:
        logging.info("not a legal sampling method")

    logging.info(data.shape)
    logging.info(label.shape)
    # logging.info(X_train.shape, Y_train.shape)
    # logging.info(X_test.shape, Y_test.shape)
    logging.info(X_train.max())
    logging.info(X_train.min())

    # Select the training device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Assuming that we are on a CUDA machine, this should print a CUDA device:
    logging.info(device)
    if torch.cuda.is_available():
        torch.cuda.set_device(device)

    # Construct datasets and data loaders
    X_trainTensor = torch.FloatTensor(X_train).to(device)
    X_validTensor = torch.FloatTensor(X_valid).to(device)
    X_testTensor = torch.FloatTensor(X_test).to(device)
    X_allTensor = torch.FloatTensor(data).to(device)

    if prediction == "regression":
        Y_trainTensor = torch.FloatTensor(Y_train).to(device)
        Y_trainallTensor = torch.FloatTensor(Y_train_all).to(device)
        Y_validTensor = torch.FloatTensor(Y_valid).to(device)
    else:
        Y_trainTensor = torch.LongTensor(Y_train).to(device)
        Y_trainallTensor = torch.LongTensor(Y_train_all).to(device)
        Y_validTensor = torch.LongTensor(Y_valid).to(device)

    train_dataset = TensorDataset(X_trainTensor, X_trainTensor)
    valid_dataset = TensorDataset(X_validTensor, X_validTensor)
    test_dataset = TensorDataset(X_testTensor, X_testTensor)
    all_dataset = TensorDataset(X_allTensor, X_allTensor)

    X_trainDataLoader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    X_validDataLoader = DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=True)
    X_allDataLoader = DataLoader(dataset=all_dataset, batch_size=batch_size, shuffle=True)

    # Construct TensorDatasets for the predictor
    trainreducedDataset = TensorDataset(X_trainTensor, Y_trainTensor)
    validreducedDataset = TensorDataset(X_validTensor, Y_validTensor)

    trainDataLoader_p = DataLoader(dataset=trainreducedDataset, batch_size=batch_size, shuffle=True)
    validDataLoader_p = DataLoader(dataset=validreducedDataset, batch_size=batch_size, shuffle=True)

    dataloaders_train = {'train': trainDataLoader_p, 'val': validDataLoader_p}

    if bool(args.pretrain) != False:
        dataloaders_pretrain = {
            'train': X_trainDataLoader,
            'val': X_validDataLoader
        }
        if reduce_model == "VAE":
            encoder = VAEBase(input_dim=data.shape[1],
                              latent_dim=dim_au_out,
                              h_dims=encoder_hdims)
        else:
            encoder = AEBase(input_dim=data.shape[1],
                             latent_dim=dim_au_out,
                             h_dims=encoder_hdims)

        if torch.cuda.is_available():
            encoder.cuda()

        logging.info(encoder)
        encoder.to(device)
        optimizer_e = optim.Adam(encoder.parameters(), lr=1e-2)
        loss_function_e = nn.MSELoss()
        exp_lr_scheduler_e = lr_scheduler.ReduceLROnPlateau(optimizer_e)

        if reduce_model == "AE":
            encoder, loss_report_en = t.train_AE_model(
                net=encoder,
                data_loaders=dataloaders_pretrain,
                optimizer=optimizer_e,
                loss_function=loss_function_e,
                n_epochs=epochs,
                scheduler=exp_lr_scheduler_e,
                save_path=encoder_path)
        elif reduce_model == "VAE":
            encoder, loss_report_en = t.train_VAE_model(
                net=encoder,
                data_loaders=dataloaders_pretrain,
                optimizer=optimizer_e,
                n_epochs=epochs,
                scheduler=exp_lr_scheduler_e,
                save_path=encoder_path)

        logging.info("Pretraining finished")

    # Train the predictor model
    if args.predictor == "DNN":
        if reduce_model == "AE":
            model = PretrainedPredictor(input_dim=X_train.shape[1],
                                        latent_dim=dim_au_out,
                                        h_dims=encoder_hdims,
                                        hidden_dims_predictor=preditor_hdims,
                                        output_dim=dim_model_out,
                                        pretrained_weights=encoder_path,
                                        freezed=bool(args.freeze_pretrain))
        elif reduce_model == "VAE":
            model = PretrainedVAEPredictor(input_dim=X_train.shape[1],
                                           latent_dim=dim_au_out,
                                           h_dims=encoder_hdims,
                                           hidden_dims_predictor=preditor_hdims,
                                           output_dim=dim_model_out,
                                           pretrained_weights=encoder_path,
                                           freezed=bool(args.freeze_pretrain),
                                           z_reparam=bool(args.VAErepram))

    elif args.predictor == "GCN":
        if reduce_model == "VAE":
            gcn_encoder = VAEBase(input_dim=data.shape[1],
                                  latent_dim=dim_au_out,
                                  h_dims=encoder_hdims)
        else:
            gcn_encoder = AEBase(input_dim=data.shape[1],
                                 latent_dim=dim_au_out,
                                 h_dims=encoder_hdims)

        gcn_encoder.load_state_dict(torch.load(args.GCNreduce_path))
        gcn_encoder.to(device)

        train_embeddings = gcn_encoder.encode(X_trainTensor)
        zOut_tr = train_embeddings.cpu().detach().numpy()
        valid_embeddings = gcn_encoder.encode(X_validTensor)
        zOut_va = valid_embeddings.cpu().detach().numpy()
        test_embeddings = gcn_encoder.encode(X_testTensor)
        zOut_te = test_embeddings.cpu().detach().numpy()

        adj_tr, edgeList_tr = g.generateAdj(zOut_tr,
                                            graphType='KNNgraphStatsSingleThread',
                                            para='euclidean' + ':' + str('10'),
                                            adjTag=True)
        adj_va, edgeList_va = g.generateAdj(zOut_va,
                                            graphType='KNNgraphStatsSingleThread',
                                            para='euclidean' + ':' + str('10'),
                                            adjTag=True)
        adj_te, edgeList_te = g.generateAdj(zOut_te,
                                            graphType='KNNgraphStatsSingleThread',
                                            para='euclidean' + ':' + str('10'),
                                            adjTag=True)

        Adj_trainTensor = preprocess_graph(adj_tr)
        Adj_validTensor = preprocess_graph(adj_va)
        Adj_testTensor = preprocess_graph(adj_te)

        Z_trainTensor = torch.FloatTensor(zOut_tr).to(device)
        Z_validTensor = torch.FloatTensor(zOut_va).to(device)
        Z_testTensor = torch.FloatTensor(zOut_te).to(device)

        if args.binarizied == 0:
            zDiscret_tr = zOut_tr > np.mean(zOut_tr, axis=0)
            zDiscret_tr = 1.0 * zDiscret_tr
            zDiscret_va = zOut_va > np.mean(zOut_va, axis=0)
            zDiscret_va = 1.0 * zDiscret_va
            zDiscret_te = zOut_te > np.mean(zOut_te, axis=0)
            zDiscret_te = 1.0 * zDiscret_te

            Z_trainTensor = torch.FloatTensor(zDiscret_tr).to(device)
            Z_validTensor = torch.FloatTensor(zDiscret_va).to(device)
            Z_testTensor = torch.FloatTensor(zDiscret_te).to(device)

        ZTensors_train = {'train': Z_trainTensor, 'val': Z_validTensor}
        XTensors_train = {'train': X_trainTensor, 'val': X_validTensor}

        YTensors_train = {'train': Y_trainTensor, 'val': Y_validTensor}
        AdjTensors_train = {'train': Adj_trainTensor, 'val': Adj_validTensor}

        if args.GCNfeature == "x":
            dim_GCNin = X_allTensor.shape[1]
            GCN_trainTensors = XTensors_train
            GCN_testTensor = X_testTensor
        else:
            dim_GCNin = Z_testTensor.shape[1]
            GCN_trainTensors = ZTensors_train
            GCN_testTensor = Z_testTensor

        model = GCNPredictor(input_feat_dim=dim_GCNin,
                             hidden_dim1=encoder_hdims[0],
                             hidden_dim2=dim_au_out,
                             dropout=0.5,
                             hidden_dims_predictor=preditor_hdims,
                             output_dim=dim_model_out,
                             pretrained_weights=encoder_path,
                             freezed=bool(args.freeze_pretrain))

        # model2 = GAEBase(input_dim=X_train_all.shape[1], latent_dim=128, h_dims=[512])
        # model2.to(device)
        # test = model2((X_trainTensor, Adj_trainTensor))

    logging.info(model)
    if torch.cuda.is_available():
        model.cuda()
    model.to(device)

    # Define optimizer
    optimizer = optim.Adam(model.parameters(), lr=1e-2)

    if prediction == "regression":
        loss_function = nn.MSELoss()
    else:
        loss_function = nn.CrossEntropyLoss()

    exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer)

    if args.predictor == "GCN":
        model, report = t.train_GCNpreditor_model(model=model,
                                                  z=GCN_trainTensors,
                                                  y=YTensors_train,
                                                  adj=AdjTensors_train,
                                                  optimizer=optimizer,
                                                  loss_function=loss_function,
                                                  n_epochs=epochs,
                                                  scheduler=exp_lr_scheduler,
                                                  save_path=preditor_path)
    else:
        model, report = t.train_predictor_model(model,
                                                dataloaders_train,
                                                optimizer,
                                                loss_function,
                                                epochs,
                                                exp_lr_scheduler,
                                                load=load_model,
                                                save_path=preditor_path)

    if args.predictor != 'GCN':
        dl_result = model(X_testTensor).detach().cpu().numpy()
    else:
        dl_result = model(GCN_testTensor, Adj_testTensor).detach().cpu().numpy()

    # torch.save(model.feature_extractor.state_dict(), preditor_path + "encoder.pkl")

    logging.info('Performances: R/Pearson/Mse/')
    if prediction == "regression":
        logging.info(r2_score(dl_result, Y_test))
        logging.info(pearsonr(dl_result.flatten(), Y_test.flatten()))
        logging.info(mean_squared_error(dl_result, Y_test))
    else:
        lb_results = np.argmax(dl_result, axis=1)
        # pb_results = np.max(dl_result, axis=1)
        pb_results = dl_result[:, 1]

        report_dict = classification_report(Y_test, lb_results, output_dict=True)
        report_df = pd.DataFrame(report_dict).T
        ap_score = average_precision_score(Y_test, pb_results)
        auroc_score = roc_auc_score(Y_test, pb_results)

        report_df['auroc_score'] = auroc_score
        report_df['ap_score'] = ap_score

        report_df.to_csv("saved/logs/" + reduce_model + args.predictor +
                         prediction + select_drug + now + '_report.csv')

        logging.info(classification_report(Y_test, lb_results))
        logging.info(average_precision_score(Y_test, pb_results))
        logging.info(roc_auc_score(Y_test, pb_results))

        model = DummyClassifier(strategy='stratified')
        model.fit(X_train, Y_train)
        yhat = model.predict_proba(X_test)
        naive_probs = yhat[:, 1]

        ut.plot_roc_curve(Y_test,
                          naive_probs,
                          pb_results,
                          title=str(roc_auc_score(Y_test, pb_results)),
                          path="saved/figures/" + reduce_model + args.predictor +
                          prediction + select_drug + now + '_roc.pdf')
        ut.plot_pr_curve(Y_test,
                         pb_results,
                         title=average_precision_score(Y_test, pb_results),
                         path="saved/figures/" + reduce_model + args.predictor +
                         prediction + select_drug + now + '_prc.pdf')
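# run_main() above depends on project helpers such as ut.plot_roc_curve() and
# ut.plot_pr_curve() that are not shown in this section. The sketch below only
# illustrates what a helper with the call signature used above (labels, baseline
# probabilities, model probabilities, title, output path) could look like; the
# actual ut implementation may differ.
import matplotlib
matplotlib.use("Agg")  # render to file without a display
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve


def plot_roc_curve_sketch(y_true, naive_probs, model_probs, title="", path="roc.pdf"):
    # ROC of a no-skill baseline versus the trained predictor.
    fpr_naive, tpr_naive, _ = roc_curve(y_true, naive_probs)
    fpr_model, tpr_model, _ = roc_curve(y_true, model_probs)
    plt.figure()
    plt.plot(fpr_naive, tpr_naive, linestyle="--", label="no skill")
    plt.plot(fpr_model, tpr_model, label="model")
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.title(str(title))
    plt.legend()
    plt.savefig(path)
    plt.close()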
def gae_for(args):
    print("Using {} dataset".format(args.dataset_str))
    adj, features, y_test, tx, ty, test_maks, true_labels = load_data(args.dataset_str)
    n_nodes, feat_dim = features.shape

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
    adj = adj_train

    # Before proceeding further, make the structure for doing deepWalk
    if args.dw == 1:
        print('Using deepWalk regularization...')
        G = load_edgelist_from_csr_matrix(adj_orig, undirected=True)
        print("Number of nodes: {}".format(len(G.nodes())))
        num_walks = len(G.nodes()) * args.number_walks
        print("Number of walks: {}".format(num_walks))
        data_size = num_walks * args.walk_length
        print("Data size (walks*length): {}".format(data_size))

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    # adj_label = sparse_to_tuple(adj_label)
    adj_label = torch.FloatTensor(adj_label.toarray())

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    if args.model == 'gcn_vae':
        model = GCNModelVAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
    else:
        model = GCNModelAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    if args.dw == 1:
        sg = SkipGram(args.hidden2, adj.shape[0])
        optimizer_dw = optim.Adam(sg.parameters(), lr=args.lr_dw)

        # Construct the nodes for doing random walk. Doing it before since the seed is fixed
        nodes_in_G = list(G.nodes())
        chunks = len(nodes_in_G) // args.number_walks
        random.Random().shuffle(nodes_in_G)

    hidden_emb = None
    for epoch in tqdm(range(args.epochs)):
        t = time.time()
        model.train()
        optimizer.zero_grad()
        z, mu, logvar = model(features, adj_norm)

        # Before back-propagating the gae loss, do the deepWalk regularization
        if args.dw == 1:
            sg.train()
            if args.full_number_walks > 0:
                walks = build_deepwalk_corpus(G, num_paths=args.full_number_walks,
                                              path_length=args.walk_length, alpha=0,
                                              rand=random.Random(SEED))
            else:
                walks = build_deepwalk_corpus_iter(G, num_paths=args.number_walks,
                                                   path_length=args.walk_length, alpha=0,
                                                   rand=random.Random(SEED),
                                                   chunk=epoch % chunks,
                                                   nodes=nodes_in_G)
            for walk in walks:
                if args.context == 1:
                    # Construct the pair for predicting context nodes: every node
                    # in the walk is treated as a center word and contributes its
                    # window to the context list (the pair is keyed by the walk's
                    # starting node).
                    curr_pair = (int(walk[0]), [])
                    for center_node_pos in range(len(walk)):
                        # for each window position
                        for w in range(-args.window_size, args.window_size + 1):
                            context_node_pos = center_node_pos + w
                            # make sure we do not jump outside the sentence
                            if context_node_pos < 0 or context_node_pos >= len(walk) \
                                    or center_node_pos == context_node_pos:
                                continue
                            context_node_idx = walk[context_node_pos]
                            curr_pair[1].append(int(context_node_idx))
                else:
                    # first item in the walk is the starting node
                    curr_pair = (int(walk[0]),
                                 [int(context_node_idx) for context_node_idx in walk[1:]])

                if args.ns == 1:
                    neg_nodes = []
                    pos_nodes = set(walk)
                    while len(neg_nodes) < args.walk_length - 1:
                        rand_node = random.randint(0, n_nodes - 1)
                        if rand_node not in pos_nodes:
                            neg_nodes.append(rand_node)
                    neg_nodes = torch.from_numpy(np.array(neg_nodes)).long()

                # Do actual prediction
                src_node = torch.from_numpy(np.array([curr_pair[0]])).long()
                tgt_nodes = torch.from_numpy(np.array(curr_pair[1])).long()
                optimizer_dw.zero_grad()
                log_pos = sg(src_node, tgt_nodes, neg_sample=False)
                if args.ns == 1:
                    loss_neg = sg(src_node, neg_nodes, neg_sample=True)
                    loss_dw = log_pos + loss_neg
                else:
                    loss_dw = log_pos
                loss_dw.backward(retain_graph=True)
                cur_dw_loss = loss_dw.item()
                optimizer_dw.step()

        loss = loss_function(preds=model.dc(z), labels=adj_label,
                             mu=mu, logvar=logvar, n_nodes=n_nodes,
                             norm=norm, pos_weight=pos_weight)
        loss.backward()
        cur_loss = loss.item()
        optimizer.step()

        hidden_emb = mu.data.numpy()
        roc_curr, ap_curr = get_roc_score(hidden_emb, adj_orig, val_edges, val_edges_false)

        if args.dw == 1:
            tqdm.write(
                "Epoch: {}, train_loss_gae={:.5f}, train_loss_dw={:.5f}, val_ap={:.5f}, time={:.5f}"
                .format(epoch + 1, cur_loss, cur_dw_loss, ap_curr, time.time() - t))
        else:
            tqdm.write(
                "Epoch: {}, train_loss_gae={:.5f}, val_ap={:.5f}, time={:.5f}".format(
                    epoch + 1, cur_loss, ap_curr, time.time() - t))

        if (epoch + 1) % 10 == 0:
            tqdm.write("Evaluating intermediate results...")
            kmeans = KMeans(n_clusters=args.n_clusters, random_state=0).fit(hidden_emb)
            predict_labels = kmeans.predict(hidden_emb)
            cm = clustering_metrics(true_labels, predict_labels)
            cm.evaluationClusterModelFromLabel(tqdm)
            roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false)
            tqdm.write('ROC: {}, AP: {}'.format(roc_score, ap_score))
            np.save('logs/emb_epoch_{}.npy'.format(epoch + 1), hidden_emb)

    tqdm.write("Optimization Finished!")

    roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false)
    tqdm.write('Test ROC score: ' + str(roc_score))
    tqdm.write('Test AP score: ' + str(ap_score))
    kmeans = KMeans(n_clusters=args.n_clusters, random_state=0).fit(hidden_emb)
    predict_labels = kmeans.predict(hidden_emb)
    cm = clustering_metrics(true_labels, predict_labels)
    cm.evaluationClusterModelFromLabel(tqdm)
    if args.plot == 1:
        cm.plotClusters(tqdm, hidden_emb, true_labels)
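# The loops above score link prediction with get_roc_score(). A minimal sketch
# of the standard GAE evaluation: reconstruct edge probabilities from the inner
# product of the embeddings, then compute ROC-AUC and average precision over the
# held-out positive and negative edges. This assumes the usual gae-pytorch
# behaviour (the real helper also receives adj_orig); details may differ.
import numpy as np
from sklearn.metrics import average_precision_score, roc_auc_score


def get_roc_score_sketch(emb, edges_pos, edges_neg):
    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    adj_rec = np.dot(emb, emb.T)  # reconstructed pairwise scores
    preds_pos = [sigmoid(adj_rec[e[0], e[1]]) for e in edges_pos]
    preds_neg = [sigmoid(adj_rec[e[0], e[1]]) for e in edges_neg]

    preds_all = np.hstack([preds_pos, preds_neg])
    labels_all = np.hstack([np.ones(len(preds_pos)), np.zeros(len(preds_neg))])
    return roc_auc_score(labels_all, preds_all), average_precision_score(labels_all, preds_all)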
def gae_for(args, position):
    print("Using {} dataset".format(args.dataset_str))
    # qhashes, chashes = load_hashes()
    Q, X = load_data()
    prebuild = "/media/chundi/3b6b0f74-0ac7-42c7-b76b-00c65f5b3673/revisitop/cnnimageretrieval-pytorch/data/test/matlab_data/GEM_wDis_prebuild.bin"
    Q_features = "/media/chundi/3b6b0f74-0ac7-42c7-b76b-00c65f5b3673/revisitop/cnnimageretrieval-pytorch/data/test/matlab_data/roxford5k_GEM_lw_query_feats.npy"  # "/media/jason/cc0aeb62-0bc7-4f3e-99a0-3bba3dd9f8fc/landmarks/oxfordRe/evaluation/roxHD_query_fused.npy"
    X_features = "/media/chundi/3b6b0f74-0ac7-42c7-b76b-00c65f5b3673/revisitop/cnnimageretrieval-pytorch/data/test/matlab_data/roxford5k_GEM_index.npy"
    D_features = "/media/chundi/3b6b0f74-0ac7-42c7-b76b-00c65f5b3673/revisitop/cnnimageretrieval-pytorch/data/test/matlab_data/roxford5k_GEM_Dis.npy"
    adj, features, adj_Q, features_Q = load_from_prebuild(prebuild, Q_features, X_features, D_features, k=5)

    # ----> 1M
    # cut_size = 800000
    # adj = adj[:cut_size, :cut_size]
    # adj_Q = adj_Q[:, :cut_size]
    # features = features[:cut_size]
    # Q = np.load("/media/jason/cc0aeb62-0bc7-4f3e-99a0-3bba3dd9f8fc/landmarks/oxfordRe/evaluation/roxHD_query_fused.npy").T.astype(np.float32)
    # X = np.load("/media/jason/cc0aeb62-0bc7-4f3e-99a0-3bba3dd9f8fc/landmarks/oxfordRe/evaluation/roxHD_index_fused.npy").T.astype(np.float32)
    # D = np.load("/media/jason/cc0aeb62-0bc7-4f3e-99a0-3bba3dd9f8fc/landmarks/revisitop1m/revisitDistractors_fused_3s_cq.npy").T.astype(np.float32)
    # X = np.concatenate((X.T, D.T)).T  # load the distractors too, shape should be (2048, 1M)

    # adj, features = gen_graph_index(Q, X, k=5, k_qe=3, do_qe=False)  # -----> 5k
    # adj_Q, features_Q = gen_graph(Q, X, k=5, k_qe=3, do_qe=False)

    # Generate validation/revop evaluation the same way as training ----> 5k
    features_all = np.concatenate([features_Q, features])
    features_all = torch.from_numpy(features_all)
    # adj_Q = adj_Q.todense()
    # adj_all = np.concatenate([adj_Q, adj.todense()])
    # adj_all = np.pad(adj_all, [[0, 0], [Q.shape[1], 0]], "constant")
    adj_all = sp.vstack((adj_Q, adj))
    zeros = sp.csr_matrix((adj_all.shape[0], Q.shape[1]))
    adj_all = sp.hstack((zeros, adj_all))
    adj_all = sp.csr_matrix(adj_all)
    rows, columns = adj_all.nonzero()
    print("Making symmetric")
    for i in range(rows.shape[0]):
        if rows[i] < Q.shape[1]:
            adj_all[columns[i], rows[i]] = adj_all[rows[i], columns[i]]
        else:
            break
    # adj_all = sp.csr_matrix(adj_all)
    print("preprocessing adj_all")
    adj_all_norm = preprocess_graph(adj_all)
    # adj = add_neighbours_neighbour(adj)
    # adj1, features1 = load_data(args.dataset_str)
    features = torch.from_numpy(features)
    # features_all = torch.from_numpy(features_all)
    n_nodes, feat_dim = features.shape

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()
    adj = adj_orig
    # adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
    print("Sampling validation")
    adj_train, adj_val, features, features_valid = mask_test_rows(adj, features)
    adj = adj_train

    # Some preprocessing
    print("preprocessing adj")
    adj_norm = preprocess_graph(adj)
    # adj_norm_label = preprocess_graph_sp(adj)
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    # adj_label = adj_norm_label + sp.eye(adj_train.shape[0])
    # rows, columns = adj_label.nonzero()
    # adj_label[columns, rows] = adj_label[rows, columns]
    # adj_label = sparse_to_tuple(adj_label)
    # adj_label = torch.FloatTensor(adj_label.toarray())

    print("adj sum: " + str(adj.sum()))
    pos_weight = float(float(adj.shape[0]) * adj.shape[0] - adj.sum()) / adj.sum()
    print("top part: " + str(float(float(adj.shape[0]) * adj.shape[0] - adj.sum())))
    print("pos weight: " + str(pos_weight))
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    # For validation data processing:
    zero = sp.csr_matrix((adj_train.shape[0], adj_val.shape[0]))
    adj_train_ext = sp.hstack((zero, adj_train))
    adj_evaluate = sp.vstack((adj_val, adj_train_ext))
    adj_evaluate = sp.csr_matrix(adj_evaluate)
    rows, columns = adj_evaluate.nonzero()
    val_edges = []
    val_edges_false = []
    pos = {}
    print("getting positive edges")
    all_val = [i for i in range(len(rows)) if rows[i] < adj_val.shape[0]]
    for i in all_val:
        sys.stdout.write("\r sampling edges for validation: [" + str(i) + "]")
        val_edges.append((rows[i], columns[i]))
        if rows[i] not in pos:
            pos[rows[i]] = []
        pos[rows[i]].append(columns[i])
    # for i in range(rows.shape[0]):
    #     sys.stdout.write("\r sampling edges for validation: [" + str(i) + "/" + str(adj_val.shape[0]) + "]")
    #     sys.stdout.flush()
    #     if rows[i] < adj_val.shape[0]:
    #         val_edges.append((rows[i], columns[i]))
    #         if rows[i] not in pos:
    #             pos[rows[i]] = []
    #         pos[rows[i]].append(columns[i])
    #         adj_evaluate[columns[i], rows[i]] = adj_evaluate[rows[i], columns[i]]
    #     else:
    #         break
    step = 0
    neg_per_pos = 100
    # for r in pos:
    #     p = pos[r]
    # neg_edges = Parallel(n_jobs=40)(delayed(neg_sample)(pos[i], adj_val.shape[1], neg_per_pos, i) for i in pos)
    # val_edges_false = [(i, item) for i in range(len(neg_edges)) for item in neg_edges[i]]
    #     a = np.random.permutation(adj_val.shape[1])
    #     a = [i for i in a if i not in p]
    #     a = a[:100]
    #     for i in a:
    #         val_edges_false.append((r, i))
    #     # count = 0
    #     # i = 0
    #     sys.stdout.write("\r sampling neg edges for validation: [" + str(step) + "/" + str(len(pos)) + "]")
    #     sys.stdout.flush()
    #     step += 1
    #     # while count < 100:
    #     #     if a[i] not in p:
    #     #         val_edges_false.append((r, a[i]))
    #     #         count += 1
    #     #     i += 1
    print("preprocessing adj_evaluate")
    adj_evaluate_norm = preprocess_graph(adj_evaluate)
    # adj_evaluate_norm_label = preprocess_graph_sp(adj_evaluate)
    adj_label_evaluate = adj_evaluate + sp.eye(adj_evaluate.shape[0])
    # adj_label_evaluate = adj_evaluate_norm_label + sp.eye(adj_evaluate.shape[0])
    # adj_label_evaluate = torch.FloatTensor(adj_label_evaluate.toarray())  # sparse_mx_to_torch_sparse_tensor(adj_label_evaluate)
    features_evaluate = np.concatenate([features, features_valid])
    features_evaluate = torch.from_numpy(features_evaluate)
    # validation done

    if mode == "VAE":
        model = GCNModelVAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
        adj_label = torch.FloatTensor(adj_label.toarray())
        adj_label_evaluate = torch.FloatTensor(adj_label_evaluate.toarray())
    elif mode == "AE":
        model = GCNModelAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
        # adj_label = torch.FloatTensor(adj_label.toarray())
        # adj_label_evaluate = torch.FloatTensor(adj_label_evaluate.toarray())
    elif mode == "VAE_batch":
        model = GCNModelVAE_batch(feat_dim, args.hidden1, args.hidden2, args.dropout)
    elif mode == "AE_batch":
        model = GCNModelAE_batch(feat_dim, args.hidden1, args.hidden2, args.dropout).cuda()
    # model = torch.nn.DataParallel(model)
    # model = model.cuda()

    # train_dataset = GAEDataset(adj_norm, adj_label, features)
    # train_loader = DataLoader(dataset=train_dataset, batch_size=args.batch_size,
    #                           shuffle=True, num_workers=8, pin_memory=True)
    train_ids = torch.tensor(range(features.shape[0]), dtype=torch.long)
    train_dataset = TensorDataset(train_ids)
    train_loader = DataLoader(dataset=train_dataset, batch_size=args.batch_size,
                              shuffle=True, num_workers=8, pin_memory=True)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    # optimizer = pSGLD(model.parameters(), lr=args.lr)
    # optimizer = optim.RMSprop(model.parameters(), lr=args.lr)

    hidden_emb = None
    pos_weight = torch.from_numpy(np.array(0.0, dtype=np.float32))
    # ipdb.set_trace()
    t = time.time()
    best = 0
    best_epoch = 0
    best_val_cost = 99999
    best_val_epoch = 0
    best_val_epoch_revop = 0
    prev_loss = 0
    prev_val_loss = 99999
    best_val_roc = 0
    best_val_ap = 0
    best_val_roc_revop = 0
    best_val_ap_revop = 0
    torch.set_num_threads(20)
    print("NUM THREADS USED")
    print(torch.get_num_threads())

    for epoch in range(args.epochs):
        # t = time.time()
        model.train()
        lossVal = 0
        lossValNorm = 0
        backtime = time.time()
        for batchID, (inds) in enumerate(train_loader):
            z = model(features, adj_norm)
            inds = inds[0]
            adj = F.relu(torch.mm(z[inds], z[inds].t()))
            preds = adj
            label_batch = torch.FloatTensor(adj_label[inds, :][:, inds].toarray())
            cost = norm * F.binary_cross_entropy_with_logits(
                preds, label_batch, pos_weight=pos_weight)
            lossVal += cost.item()
            lossValNorm += 1
            optimizer.zero_grad()
            cost.backward(retain_graph=True)
            # if batchID == 0:
            #     cost.backward(retain_graph=True)
            # else:
            #     cost.backward()
            optimizer.step()
            if batchID >= 10:
                break
        backtime_done = time.time() - backtime
        sys.stdout.write("\r time taken to do epoch: " + str(backtime_done) +
                         " opt: " + str(time.time() - backtime) + "\n")
        sys.stdout.flush()
        # optimizer.step()

        # Sample rows only (non-square):
        # for i in range(0, d.shape[0], sample_size):
        #     selection = random_perm[i:i+sample_size]
        #     to_keep = selection
        #     sample_adj_norm = sparse_mx_to_torch_sparse_tensor(sp.coo_matrix(d[to_keep, :]))
        #     sample_adj = adj[to_keep, :]
        #     sample_features = features
        #     recovered, mu, logvar = model(sample_features, sample_adj_norm)
        #     loss = loss_function(preds=recovered, labels=sample_adj_label,
        #                          mu=mu, logvar=logvar, n_nodes=n_nodes,
        #                          norm=norm, pos_weight=pos_weight)
        #     loss.backward()
        #     cur_loss = loss.item()
        #     optimizer.step()
        #     sys.stdout.write("\r ")
        #     sys.stdout.write("\r" + "sampling [" + str(i) + "/" + str(d.shape[0]))
        #     sys.stdout.flush()

        # Sample rows plus their positives, and add them to the rows (make it square):
        # for i in range(0, d.shape[0], sample_size):
        #     selection = random_perm[i:i+sample_size]
        #     to_keep = np.nonzero(d[selection, :])
        #     # (array([0, 1, 2, 2]), array([0, 1, 0, 1]))
        #     the_set = set(list(to_keep[0]) + list(to_keep[1]))
        #     temp = set(list(to_keep[0]))
        #     to_keep = list(the_set - column_exclude)
        #     # column_exclude.union(temp)
        #     # these are the rows and columns that we need to select
        #     sample_adj_norm = sparse_mx_to_torch_sparse_tensor(sp.coo_matrix(d[to_keep, :][:, to_keep]))
        #     sample_features = features[to_keep, :]
        #     sample_adj_label = adj_label[to_keep, :][:, to_keep]
        #     # print(sample_adj_norm.shape)
        #     # print(sample_features.shape)
        #     # print(sample_adj_label.shape)
        #     # print(sample.shape)
        #     sample_adj = adj[to_keep, :][:, to_keep]
        #     pos_weight = float(sample_adj.shape[0] * sample_adj.shape[0] - sample_adj.sum()) / sample_adj.sum()
        #     pos_weight = torch.from_numpy(np.array(pos_weight))
        #     norm = sample_adj.shape[0] * sample_adj.shape[0] / float((sample_adj.shape[0] * sample_adj.shape[0] - sample_adj.sum()) * 2)
        #     n_nodes, feat_dim = sample_features.shape
        #     if mode == "VAE":
        #         recovered, mu, logvar = model(sample_features, sample_adj_norm)
        #         # recovered = recovered[i:i+500]
        #         # sample_adj_label = sample_adj_label[i:i+500]
        #         loss = loss_function(preds=recovered, labels=sample_adj_label,
        #                              mu=mu, logvar=logvar, n_nodes=n_nodes,
        #                              norm=norm, pos_weight=pos_weight)
        #     elif mode == "AE":
        #         recovered = model(sample_features, sample_adj_norm)
        #         loss = loss_function_ae(preds=recovered, labels=sample_adj_label,
        #                                 norm=norm, pos_weight=pos_weight)
        #     loss.backward()
        #     cur_loss = loss.item()
        #     optimizer.step()
        #     sys.stdout.write("\r ")
        #     sys.stdout.write("\r" + "sampling [" + str(i) + "/" + str(d.shape[0]) + "]....size of sample=" + str(len(sample_features)))
        #     sys.stdout.flush()

        sys.stdout.write("\r \r")

        if (epoch + 1) % 1 == 0:
            model.eval()
            # adj_dense = adj_train.todense()
            # adj_val_dense = adj_val.todense()
            # adj_train_ext = np.pad(adj_dense, [[0, 0], [adj_val_dense.shape[0], 0]], "constant")
            # adj_evaluate = np.concatenate([adj_val_dense, adj_train_ext])
            # zero = sp.csr_matrix((adj_train.shape[0], adj_val.shape[0]))
            # adj_train_ext = sp.hstack((zero, adj_train))
            # adj_evaluate = sp.vstack((adj_val, adj_train_ext))
            # # zeros = sp.csr_matrix((adj_evaluate.shape[0], adj_val.shape[1]))
            # # adj_evaluate = sp.hstack((zeros, adj_evaluate))
            # adj_evaluate = sp.csr_matrix(adj_evaluate)
            # rows, columns = adj_evaluate.nonzero()
            # for i in range(rows.shape[0]):
            #     if rows[i] < adj_val.shape[1]:
            #         adj_evaluate[columns[i], rows[i]] = adj_evaluate[rows[i], columns[i]]
            #     else:
            #         break
            # adj_evaluate_norm = preprocess_graph(adj_evaluate)
            # adj_label_evaluate = adj_evaluate + sp.eye(adj_evaluate.shape[0])
            # adj_label_evaluate = torch.FloatTensor(adj_label_evaluate.toarray())
            # # adj_label_evaluate = sparse_to_tuple(adj_label_evaluate)
            # features_evaluate = np.concatenate([features, features_valid])
            # features_evaluate = torch.from_numpy(features_evaluate)
            just_adj_evaluate = sparse_mx_to_torch_sparse_tensor(adj_evaluate)
            # recovered, mu, logvar = model(features_evaluate, just_adj_evaluate.coalesce().indices(), just_adj_evaluate.coalesce().values())
            # recovered, mu, logvar = model(features_evaluate, adj_evaluate_norm)
            # val_loss = loss_function(preds=recovered, labels=adj_label_evaluate,
            #                          mu=mu, logvar=logvar, n_nodes=n_nodes,
            #                          norm=norm, pos_weight=pos_weight)
            # if mode == "VAE":
            #     recovered, mu, logvar = model(features_evaluate, adj_evaluate_norm)
            #     val_loss = loss_function(preds=recovered, labels=adj_label_evaluate,
            #                              mu=mu, logvar=logvar, n_nodes=n_nodes,
            #                              norm=norm, pos_weight=pos_weight)
            # elif mode == "AE":
            #     mu = model(features_evaluate, adj_evaluate_norm)
            #     val_loss = loss_function_ae(preds=recovered, labels=adj_label_evaluate,
            #                                 norm=norm, pos_weight=pos_weight)
            # elif mode == "VAE_batch":
            #     recovered, mu, logvar, val_loss = model(features_evaluate, adj_evaluate_norm, labels=adj_label_evaluate, n_nodes=n_nodes, norm=norm, pos_weight=pos_weight, opt=None, training=False)
            # elif mode == "AE_batch":
            #     recovered, mu, val_loss = model(features_evaluate, adj_evaluate_norm, labels=adj_label_evaluate, n_nodes=n_nodes, norm=norm, pos_weight=pos_weight, opt=None, training=False)
            # val_emb = mu.data.numpy()
            # roc_curr, ap_curr = get_roc_score(val_emb, val_edges, val_edges_false)

            # Do one query at a time:
            # revop_map = eval_each_q(model, adj_all, features_all, Q.shape[1])

            # Hack by appending stuff on top of adj
            if mode == "VAE":
                _, mu, _ = model(features_all, adj_all_norm)
            elif mode == "AE":
                mu = model(features_all, adj_all_norm)
            elif mode == "VAE_batch":
                mu = model(features_all, adj_all_norm, None, None, None, None, None,
                           just_mu=True, training=False)
            elif mode == "AE_batch":
                mu = model(features_all, adj_all_norm, None, None, None, None, None,
                           just_mu=True, training=False)
            hidden_emb = mu.data.numpy()

            # Get validation loss
            # recovered, mu, logvar = model(features, adj_norm)
            # val_loss = loss_function(preds=recovered, labels=adj_label,
            #                          mu=mu, logvar=logvar, n_nodes=n_nodes,
            #                          norm=norm, pos_weight=pos_weight)
            revop_map = get_roc_score_matrix(hidden_emb, Q.shape[1])
            if best <= revop_map:
                emb = hidden_emb
                Q_end = Q.shape[1]
                best = revop_map
                best_epoch = epoch + 1
                # Write it into a file and do EGT on that
                # embQ = emb[:Q_end, :].T
                # embX = emb[Q_end:, :].T
                # np.save("/media/jason/28c9eee1-312e-47d0-88ce-572813ebd6f1/graph/gae-pytorch/best_embedding2.npy", hidden_emb)
                # concat = np.concatenate((embQ.T, embX.T))
                # revop_inner_prod = np.matmul(concat, concat.T)
                # revop_preds = np.argsort(-revop_inner_prod, axis=0)
                # if revop_map > 54:
                #     f = open("best_result.txt", "w")
                #     for i in range(revop_preds.shape[1]):
                #         if i < Q_end:
                #             f.write(qhashes[i] + ",")
                #         else:
                #             f.write(chashes[i - Q_end] + ",")
                #         for j in revop_preds[:, i]:
                #             if j < Q_end:
                #                 f.write(qhashes[j] + " " + str(int(revop_inner_prod[j, i] * 1000)) + " ")
                #             else:
                #                 f.write(chashes[j - Q_end] + " " + str(int(revop_inner_prod[j, i] * 1000)) + " ")
                #         f.write("\n")
                #         f.flush()
                #     # for j in range()
                #     f.close()
            if best_val_cost > -99.0:  # prev_val_loss - val_loss > 0 and prev_val_loss - val_loss > prev_loss - cur_loss and best_val_cost > val_loss:
                best_val_cost = -99.0
                best_val_epoch = epoch + 1
                best_val_epoch_revop = revop_map
            # if best_val_roc < roc_curr:
            #     best_val_roc = roc_curr
            #     best_val_roc_revop = revop_map
            # if best_val_ap < ap_curr:
            #     best_val_ap = ap_curr
            #     best_val_ap_revop = revop_map
            print("Epoch:", '%04d' % (epoch + 1),
                  "train_loss=", "{:.5f}".format(lossVal / lossValNorm),
                  "val_loss=", "{:.5f}".format(-99.0),
                  # "val_roc_curr=", "{:.5f}".format(roc_curr),
                  # "val_ap_curr=", "{:.5f}".format(ap_curr),
                  "revop=", "{:.5f}".format(revop_map),
                  "best_revop=", "{:.5f}".format(best),
                  "revop_at_best_val=", "{:.5f}".format(best_val_epoch_revop),
                  # "revop_at_best_val_roc=", "{:.5f}".format(best_val_roc_revop),
                  # "revop_at_best_ap_roc=", "{:.5f}".format(best_val_ap_revop),
                  "time=", "{:.5f}".format(time.time() - t))
            prev_val_loss = -99.0
            prev_loss = -99.0
            t = time.time()

    print("Optimization Finished!")
    # roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false)
    # print('Test ROC score: ' + str(roc_score))
    # print('Test AP score: ' + str(ap_score))
    return best, best_val_epoch_revop, best_val_roc_revop, best_val_ap_revop
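# The evaluation branch above converts a SciPy matrix with
# sparse_mx_to_torch_sparse_tensor(). A minimal sketch of that common conversion
# (COO indices/values into a torch sparse tensor), assuming a SciPy sparse input;
# shown for reference only, the project's helper may differ slightly.
import numpy as np
import scipy.sparse as sp
import torch


def sparse_mx_to_torch_sparse_tensor_sketch(sparse_mx):
    sparse_mx = sp.coo_matrix(sparse_mx).astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    return torch.sparse_coo_tensor(indices, values, torch.Size(sparse_mx.shape))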
def GAEembedding(z, adj, args):
    '''
    GAE embedding for clustering
    Param:
        z, adj
    Return:
        Embedding from graph
    '''
    # true_labels = np.asarray(true_labels)

    # args.model = 'gcn_vae'
    # args.dw = 0
    # args.epochs = 200
    # args.hidden1 = 32
    # args.hidden2 = 16
    # args.lr = 0.01
    # args.dropout = 0.
    # args.dataset_sr = 'cora'
    # args.walk_length = 5
    # args.window_size = 3
    # args.number_walks = 5
    # args.full_number_walks = 0
    # args.lr_dw = 0.001
    # args.context = 0
    # args.ns = 1
    # args.n_clusters = 11
    # args.plot = 0

    # Features from z
    # Louvain
    features = z
    # features = torch.DoubleTensor(features)
    features = torch.FloatTensor(features)

    # Old implementation
    # adj, features, y_test, tx, ty, test_maks, true_labels = load_data(args.dataset_str)

    n_nodes, feat_dim = features.shape

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
    adj = adj_train

    # Before proceeding further, make the structure for doing deepWalk
    # if args.dw == 1:
    #     print('Using deepWalk regularization...')
    #     G = load_edgelist_from_csr_matrix(adj_orig, undirected=True)
    #     print("Number of nodes: {}".format(len(G.nodes())))
    #     num_walks = len(G.nodes()) * args.number_walks
    #     print("Number of walks: {}".format(num_walks))
    #     data_size = num_walks * args.walk_length
    #     print("Data size (walks*length): {}".format(data_size))

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    # adj_label = sparse_to_tuple(adj_label)
    # adj_label = torch.DoubleTensor(adj_label.toarray())
    adj_label = torch.FloatTensor(adj_label.toarray())

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    if args.GAEmodel == 'gcn_vae':
        model = GCNModelVAE(feat_dim, args.GAEhidden1, args.GAEhidden2, args.GAEdropout)
    else:
        model = GCNModelAE(feat_dim, args.GAEhidden1, args.GAEhidden2, args.GAEdropout)
    if args.precisionModel == 'Double':
        model = model.double()
    optimizer = optim.Adam(model.parameters(), lr=args.GAElr)

    # if args.dw == 1:
    #     sg = SkipGram(args.hidden2, adj.shape[0])
    #     optimizer_dw = optim.Adam(sg.parameters(), lr=args.lr_dw)
    #     # Construct the nodes for doing random walk. Doing it before since the seed is fixed
    #     nodes_in_G = list(G.nodes())
    #     chunks = len(nodes_in_G) // args.number_walks
    #     random.Random().shuffle(nodes_in_G)

    hidden_emb = None
    for epoch in tqdm(range(args.GAEepochs)):
        t = time.time()
        # mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        # print('Mem consumption before training: ' + str(mem))
        model.train()
        optimizer.zero_grad()
        z, mu, logvar = model(features, adj_norm)

        # After back-propagating gae loss, now do the deepWalk regularization
        # if args.dw == 1:
        #     sg.train()
        #     if args.full_number_walks > 0:
        #         walks = build_deepwalk_corpus(G, num_paths=args.full_number_walks,
        #                                       path_length=args.walk_length, alpha=0,
        #                                       rand=random.Random(SEED))
        #     else:
        #         walks = build_deepwalk_corpus_iter(G, num_paths=args.number_walks,
        #                                            path_length=args.walk_length, alpha=0,
        #                                            rand=random.Random(SEED),
        #                                            chunk=epoch % chunks,
        #                                            nodes=nodes_in_G)
        #     for walk in walks:
        #         if args.context == 1:
        #             # Construct the pairs for predicting context node
        #             # for each node, treated as center word
        #             curr_pair = (int(walk[center_node_pos]), [])
        #             for center_node_pos in range(len(walk)):
        #                 # for each window position
        #                 for w in range(-args.window_size, args.window_size + 1):
        #                     context_node_pos = center_node_pos + w
        #                     # make sure not to jump out of the sentence
        #                     if context_node_pos < 0 or context_node_pos >= len(walk) or center_node_pos == context_node_pos:
        #                         continue
        #                     context_node_idx = walk[context_node_pos]
        #                     curr_pair[1].append(int(context_node_idx))
        #         else:
        #             # first item in the walk is the starting node
        #             curr_pair = (int(walk[0]), [int(context_node_idx) for context_node_idx in walk[1:]])
        #         if args.ns == 1:
        #             neg_nodes = []
        #             pos_nodes = set(walk)
        #             while len(neg_nodes) < args.walk_length - 1:
        #                 rand_node = random.randint(0, n_nodes - 1)
        #                 if rand_node not in pos_nodes:
        #                     neg_nodes.append(rand_node)
        #             neg_nodes = torch.from_numpy(np.array(neg_nodes)).long()
        #         # Do actual prediction
        #         src_node = torch.from_numpy(np.array([curr_pair[0]])).long()
        #         tgt_nodes = torch.from_numpy(np.array(curr_pair[1])).long()
        #         optimizer_dw.zero_grad()
        #         log_pos = sg(src_node, tgt_nodes, neg_sample=False)
        #         if args.ns == 1:
        #             loss_neg = sg(src_node, neg_nodes, neg_sample=True)
        #             loss_dw = log_pos + loss_neg
        #         else:
        #             loss_dw = log_pos
        #         loss_dw.backward(retain_graph=True)
        #         cur_dw_loss = loss_dw.item()
        #         optimizer_dw.step()

        loss = loss_function(preds=model.dc(z), labels=adj_label,
                             mu=mu, logvar=logvar, n_nodes=n_nodes,
                             norm=norm, pos_weight=pos_weight)
        loss.backward()
        cur_loss = loss.item()
        optimizer.step()

        hidden_emb = mu.data.numpy()
        # TODO: this is prediction
        # roc_curr, ap_curr = get_roc_score(hidden_emb, adj_orig, val_edges, val_edges_false)
        ap_curr = 0

        # if args.dw == 1:
        #     tqdm.write("Epoch: {}, train_loss_gae={:.5f}, train_loss_dw={:.5f}, val_ap={:.5f}, time={:.5f}".format(
        #         epoch + 1, cur_loss, cur_dw_loss, ap_curr, time.time() - t))
        # else:
        tqdm.write(
            "Epoch: {}, train_loss_gae={:.5f}, val_ap={:.5f}, time={:.5f}".format(
                epoch + 1, cur_loss, ap_curr, time.time() - t))

        # if (epoch + 1) % 10 == 0:
        #     tqdm.write("Evaluating intermediate results...")
        #     kmeans = KMeans(n_clusters=args.n_clusters, random_state=0).fit(hidden_emb)
        #     predict_labels = kmeans.predict(hidden_emb)
        #     cm = clustering_metrics(true_labels, predict_labels)
        #     cm.evaluationClusterModelFromLabel(tqdm)
        #     roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false)
        #     tqdm.write('ROC: {}, AP: {}'.format(roc_score, ap_score))
        #     np.save('logs/emb_epoch_{}.npy'.format(epoch + 1), hidden_emb)

    tqdm.write("Optimization Finished!")

    roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false)
    tqdm.write('Test ROC score: ' + str(roc_score))
    tqdm.write('Test AP score: ' + str(ap_score))

    # kmeans = KMeans(n_clusters=args.n_clusters, random_state=0).fit(hidden_emb)
    # predict_labels = kmeans.predict(hidden_emb)
    # cm = clustering_metrics(true_labels, predict_labels)
    # cm.evaluationClusterModelFromLabel(tqdm)
    # if args.GAEplot == 1:
    #     cm.plotClusters(tqdm, hidden_emb, true_labels)

    return hidden_emb
def gae_for(args):
    print("Using {} dataset".format(args.dataset_str))
    adj, features = load_data(args.dataset_str)
    n_nodes, feat_dim = features.shape

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
    adj = adj_train

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    # adj_label = sparse_to_tuple(adj_label)
    adj_label = torch.FloatTensor(adj_label.toarray())

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    lst_result = []
    for i in range(10):
        model = GCNModelVAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
        optimizer = optim.Adam(model.parameters(), lr=args.lr)

        hidden_emb = None
        max_roc_ap = 0
        for epoch in range(args.epochs):
            t = time.time()
            model.train()
            optimizer.zero_grad()
            recovered, mu, logvar = model(features, adj_norm)
            loss = loss_function(preds=recovered, labels=adj_label,
                                 mu=mu, logvar=logvar, n_nodes=n_nodes,
                                 norm=norm, pos_weight=pos_weight)
            loss.backward()
            cur_loss = loss.item()
            optimizer.step()

            hidden_emb = mu.data.numpy()
            roc_curr, ap_curr = get_roc_score(hidden_emb, adj_orig, val_edges, val_edges_false)

            # Keep the embedding of the epoch with the best validation ROC + AP
            roc_ap = roc_curr + ap_curr
            if max_roc_ap < roc_ap:
                max_roc_ap = roc_ap
                h_emb_best_model = hidden_emb

            print("Epoch:", '%04d' % (epoch + 1),
                  "train_loss=", "{:.5f}".format(cur_loss),
                  "val_ap=", "{:.5f}".format(ap_curr),
                  "time=", "{:.5f}".format(time.time() - t))

        roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false)
        print('Test ROC score: ' + str(roc_score))
        print('Test AP score: ' + str(ap_score))
        print("---------------------------------------")
        print("Optimization Finished!: ", i)

        roc_score, ap_score = get_roc_score(h_emb_best_model, adj_orig, test_edges, test_edges_false)
        # roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false)
        lst_result.append([i, roc_score, ap_score])
        print('Test ROC score: ' + str(roc_score))
        print('Test AP score: ' + str(ap_score))

    lst_result = np.array(lst_result)
    csv_info = np.append(
        lst_result,
        [["mean", np.mean(lst_result[:, 1]), np.mean(lst_result[:, 2])]],
        axis=0)
    csv_info = np.append(
        csv_info,
        [["std", np.std(lst_result[:, 1]), np.std(lst_result[:, 2])]],
        axis=0)

    t = int(time.time())
    folder = Path(os.path.join(os.getcwd(), "csv"))
    folder.mkdir(exist_ok=True)  # make sure the output folder exists
    csv_name = "{}_{}_{}_{}_{}.csv".format(args.dataset_str, args.epochs,
                                           args.hidden1, args.hidden2, t)
    df = pd.DataFrame(csv_info, columns=['run', 'ROC', "AP"])
    df.to_csv(os.path.join(folder, csv_name))
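# All of the gae_for/GAEembedding variants in this section rely on
# mask_test_edges() to hold out edges for validation and testing. A simplified
# sketch of that split, written under the assumption that it follows the usual
# gae-pytorch recipe (shuffle undirected edges, hold out a fraction for
# validation/testing, sample the same number of non-edges as negatives, rebuild
# a training adjacency); the project's real helper likely differs in details.
import numpy as np
import scipy.sparse as sp


def mask_test_edges_sketch(adj, val_frac=0.05, test_frac=0.10, seed=0):
    rng = np.random.default_rng(seed)
    adj = sp.triu(sp.csr_matrix(adj), k=1).tocoo()  # upper triangle, no self-loops
    edges = np.vstack((adj.row, adj.col)).T
    rng.shuffle(edges)
    n_val = int(len(edges) * val_frac)
    n_test = int(len(edges) * test_frac)
    val_edges, test_edges, train_edges = np.split(edges, [n_val, n_val + n_test])

    existing = set(map(tuple, edges))

    def sample_negatives(k):
        # Draw random node pairs until k non-edges have been collected.
        neg = []
        n = adj.shape[0]
        while len(neg) < k:
            i, j = rng.integers(0, n, size=2)
            if i == j or (i, j) in existing or (j, i) in existing:
                continue
            neg.append((i, j))
        return neg

    val_edges_false = sample_negatives(len(val_edges))
    test_edges_false = sample_negatives(len(test_edges))

    data = np.ones(len(train_edges))
    adj_train = sp.csr_matrix((data, (train_edges[:, 0], train_edges[:, 1])), shape=adj.shape)
    adj_train = adj_train + adj_train.T  # symmetrize
    return adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false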
def GAEembedding(z, adj, args):
    '''
    GAE embedding for clustering
    Param:
        z, adj
    Return:
        Embedding from graph
    '''
    # Features from z
    # Louvain
    features = z
    # features = torch.DoubleTensor(features)
    features = torch.FloatTensor(features)

    # Old implementation
    # adj, features, y_test, tx, ty, test_maks, true_labels = load_data(args.dataset_str)

    n_nodes, feat_dim = features.shape

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
    adj = adj_train

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    # adj_label = sparse_to_tuple(adj_label)
    # adj_label = torch.DoubleTensor(adj_label.toarray())
    adj_label = torch.FloatTensor(adj_label.toarray())

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    if args.GAEmodel == 'gcn_vae':
        model = GCNModelVAE(feat_dim, args.GAEhidden1, args.GAEhidden2, args.GAEdropout)
    else:
        model = GCNModelAE(feat_dim, args.GAEhidden1, args.GAEhidden2, args.GAEdropout)
    if args.precisionModel == 'Double':
        model = model.double()
    optimizer = optim.Adam(model.parameters(), lr=args.GAElr)

    hidden_emb = None
    for epoch in tqdm(range(args.GAEepochs)):
        t = time.time()
        # mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        # print('Mem consumption before training: ' + str(mem))
        model.train()
        optimizer.zero_grad()
        z, mu, logvar = model(features, adj_norm)

        loss = loss_function(preds=model.dc(z), labels=adj_label,
                             mu=mu, logvar=logvar, n_nodes=n_nodes,
                             norm=norm, pos_weight=pos_weight)
        loss.backward()
        cur_loss = loss.item()
        optimizer.step()

        hidden_emb = mu.data.numpy()
        # TODO: this is prediction
        # roc_curr, ap_curr = get_roc_score(hidden_emb, adj_orig, val_edges, val_edges_false)
        ap_curr = 0

        tqdm.write(
            "Epoch: {}, train_loss_gae={:.5f}, val_ap={:.5f}, time={:.5f}".format(
                epoch + 1, cur_loss, ap_curr, time.time() - t))

    tqdm.write("Optimization Finished!")

    roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false)
    tqdm.write('Test ROC score: ' + str(roc_score))
    tqdm.write('Test AP score: ' + str(ap_score))

    return hidden_emb
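# A small usage sketch for GAEembedding(): build a sparse k-NN adjacency over the
# input features and pass it in together with the features. The kneighbors_graph
# call and the args fields populated here (GAEmodel, GAEhidden1, GAEhidden2,
# GAEdropout, GAElr, GAEepochs, precisionModel) mirror the attributes read above,
# but the concrete values and the graph construction are illustrative
# assumptions, not the project's actual pipeline.
import argparse
import numpy as np
from sklearn.neighbors import kneighbors_graph

if __name__ == "__main__":
    z = np.random.rand(200, 32).astype(np.float32)  # e.g. an autoencoder latent space
    adj = kneighbors_graph(z, n_neighbors=10, mode="connectivity", include_self=False)
    adj = adj.maximum(adj.T)  # symmetrize the k-NN graph

    gae_args = argparse.Namespace(GAEmodel="gcn_vae", GAEhidden1=32, GAEhidden2=16,
                                  GAEdropout=0.0, GAElr=0.01, GAEepochs=200,
                                  precisionModel="Float")
    emb = GAEembedding(z, adj, gae_args)
    print(emb.shape)  # (200, GAEhidden2)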