Exemple #1
0
def gae_for(args):
    print("Using {} dataset".format(args.dataset_str))
    adj, features = load_data(args.dataset_str)
    n_nodes, feat_dim = features.shape

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(
        adj)
    adj = adj_train

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    # adj_label = sparse_to_tuple(adj_label)
    adj_label = torch.FloatTensor(adj_label.toarray())

    pos_weight = torch.Tensor(
        [float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()])
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    model = GCNModelVAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    hidden_emb = None
    for epoch in range(args.epochs):
        t = time.time()
        model.train()
        optimizer.zero_grad()
        recovered, mu, logvar = model(features, adj_norm)
        loss = loss_function(preds=recovered,
                             labels=adj_label,
                             mu=mu,
                             logvar=logvar,
                             n_nodes=n_nodes,
                             norm=norm,
                             pos_weight=pos_weight)
        loss.backward()
        cur_loss = loss.item()
        optimizer.step()

        hidden_emb = mu.data.numpy()
        roc_curr, ap_curr = get_roc_score(hidden_emb, adj_orig, val_edges,
                                          val_edges_false)

        print("Epoch:", '%04d' % (epoch + 1), "train_loss=",
              "{:.5f}".format(cur_loss), "val_ap=", "{:.5f}".format(ap_curr),
              "time=", "{:.5f}".format(time.time() - t))

    print("Optimization Finished!")

    roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges,
                                        test_edges_false)
    print('Test ROC score: ' + str(roc_score))
    print('Test AP score: ' + str(ap_score))
Exemple #2
0
def eval_each_q(model, adj_all, features_all, q_length):
    adj_q = adj_all[:q_length, q_length:]
    adj_i = adj_all[q_length:, q_length:]
    features_q = features_all[:q_length, :]
    features_i = features_all[q_length:, :]
    rankings = []
    for i in range(q_length):
        adj_all_evaluation = sp.vstack((adj_q[i, :], adj_i))
        zeros = sp.csr_matrix((adj_all_evaluation.shape[0], 1))
        adj_all_evaluation = sp.hstack((zeros, adj_all_evaluation))
        adj_all_evaluation = sp.csr_matrix(adj_all_evaluation)
        features_all_evaluation = np.concatenate([features_q[i:i+1, :], features_i])
        features_all_evaluation = torch.from_numpy(features_all_evaluation)

        rows, columns = adj_all_evaluation.nonzero()
        for j in range(rows.shape[0]):
            if rows[j] < 1:
                adj_all_evaluation[columns[j], rows[j]] = adj_all_evaluation[rows[j], columns[j]]
            else:
                break
        adj_all_evaluation = preprocess_graph(adj_all_evaluation)

        if mode == "VAE":
            _, mu, _ = model(features_all_evaluation, adj_all_evaluation)
        elif mode == "AE":
            _, mu = model(features_all_evaluation, adj_all_evaluation)
        elif mode == "VAE_batch":
            mu = model(features_all_evaluation, adj_all_evaluation, None, None, None, None, None, just_mu=True, training=False)
        elif mode == "AE_batch":
            mu = model(features_all_evaluation, adj_all_evaluation, None, None, None, None, None, just_mu=True, training=False)
        emb = mu.data.numpy()
        embX = emb[1:, :].T
        embQ = emb[:1, :].T
        revop_inner_prod = np.matmul(embX.T,embQ)
        revop_preds = np.argsort(-revop_inner_prod,axis=0)
        rankings.append(revop_preds)
        sys.stdout.write("\r" + "evaluating queries: [" + str(i) + "/" + str(q_length) + "]")
        sys.stdout.flush()
    rankings = np.array(rankings)
    rankings = np.reshape(rankings, (rankings.shape[0], rankings.shape[1]))
    rankings = rankings.T
    revop_map = eval_revop(rankings,silent=True)
    return revop_map
Exemple #3
0
def run_main(args):

    # Define parameters
    epochs = args.epochs
    dim_au_out = args.bottleneck  #8, 16, 32, 64, 128, 256,512
    dim_dnn_in = dim_au_out
    dim_dnn_out = 1
    select_drug = args.drug
    na = args.missing_value
    data_path = args.data_path
    label_path = args.label_path
    test_size = args.test_size
    valid_size = args.valid_size
    g_disperson = args.var_genes_disp
    model_path = args.source_model_path
    encoder_path = args.encoder_path
    log_path = args.logging_file
    batch_size = args.batch_size
    encoder_hdims = args.encoder_h_dims.split(",")
    preditor_hdims = args.predictor_h_dims.split(",")
    reduce_model = args.dimreduce
    prediction = args.predition
    sampling = args.sampling
    PCA_dim = args.PCA_dim

    encoder_hdims = list(map(int, encoder_hdims))
    preditor_hdims = list(map(int, preditor_hdims))
    load_model = bool(args.load_source_model)

    preditor_path = model_path + reduce_model + args.predictor + prediction + select_drug + '.pkl'

    # Read data
    data_r = pd.read_csv(data_path, index_col=0)
    label_r = pd.read_csv(label_path, index_col=0)
    label_r = label_r.fillna(na)

    now = time.strftime("%Y-%m-%d-%H-%M-%S")

    ut.save_arguments(args, now)

    # Initialize logging and std out
    out_path = log_path + now + ".err"
    log_path = log_path + now + ".log"

    out = open(out_path, "w")
    sys.stderr = out

    logging.basicConfig(
        level=logging.INFO,  #控制台打印的日志级别
        filename=log_path,
        filemode='a',  ##模式,有w和a,w就是写模式,每次都会重新写日志,覆盖之前的日志
        #a是追加模式,默认如果不写的话,就是追加模式
        format=
        '%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'
        #日志格式
    )
    logging.getLogger('matplotlib.font_manager').disabled = True

    logging.info(args)

    # data = data_r

    # Filter out na values
    selected_idx = label_r.loc[:, select_drug] != na

    if (g_disperson != None):
        hvg, adata = ut.highly_variable_genes(data_r, min_disp=g_disperson)
        # Rename columns if duplication exist
        data_r.columns = adata.var_names
        # Extract hvgs
        data = data_r.loc[selected_idx, hvg]
    else:
        data = data_r.loc[selected_idx, :]

    # Do PCA if PCA_dim!=0
    if PCA_dim != 0:
        data = PCA(n_components=PCA_dim).fit_transform(data)
    else:
        data = data

    # Extract labels
    label = label_r.loc[selected_idx, select_drug]

    # Scaling data
    mmscaler = preprocessing.MinMaxScaler()
    lbscaler = preprocessing.MinMaxScaler()

    data = mmscaler.fit_transform(data)
    label = label.values.reshape(-1, 1)

    if prediction == "regression":
        label = lbscaler.fit_transform(label)
        dim_model_out = 1
    else:
        le = LabelEncoder()
        label = le.fit_transform(label)
        dim_model_out = 2

    #label = label.values.reshape(-1,1)

    logging.info(np.std(data))
    logging.info(np.mean(data))

    # Split traning valid test set
    X_train_all, X_test, Y_train_all, Y_test = train_test_split(
        data, label, test_size=test_size, random_state=42)
    X_train, X_valid, Y_train, Y_valid = train_test_split(X_train_all,
                                                          Y_train_all,
                                                          test_size=valid_size,
                                                          random_state=42)
    # sampling method
    if sampling == None:
        X_train, Y_train = sam.nosampling(X_train, Y_train)
        logging.info("nosampling")
    elif sampling == "upsampling":
        X_train, Y_train = sam.upsampling(X_train, Y_train)
        logging.info("upsampling")
    elif sampling == "downsampling":
        X_train, Y_train = sam.downsampling(X_train, Y_train)
        logging.info("downsampling")
    elif sampling == "SMOTE":
        X_train, Y_train = sam.SMOTEsampling(X_train, Y_train)
        logging.info("SMOTE")
    else:
        logging.info("not a legal sampling method")

    logging.info(data.shape)
    logging.info(label.shape)
    #logging.info(X_train.shape, Y_train.shape)
    #logging.info(X_test.shape, Y_test.shape)
    logging.info(X_train.max())
    logging.info(X_train.min())

    # Select the Training device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Assuming that we are on a CUDA machine, this should print a CUDA device:
    logging.info(device)
    torch.cuda.set_device(device)

    # Construct datasets and data loaders
    X_trainTensor = torch.FloatTensor(X_train).to(device)
    X_validTensor = torch.FloatTensor(X_valid).to(device)
    X_testTensor = torch.FloatTensor(X_test).to(device)
    X_allTensor = torch.FloatTensor(data).to(device)

    if prediction == "regression":
        Y_trainTensor = torch.FloatTensor(Y_train).to(device)
        Y_trainallTensor = torch.FloatTensor(Y_train_all).to(device)
        Y_validTensor = torch.FloatTensor(Y_valid).to(device)
    else:
        Y_trainTensor = torch.LongTensor(Y_train).to(device)
        Y_trainallTensor = torch.LongTensor(Y_train_all).to(device)
        Y_validTensor = torch.LongTensor(Y_valid).to(device)

    train_dataset = TensorDataset(X_trainTensor, X_trainTensor)
    valid_dataset = TensorDataset(X_validTensor, X_validTensor)
    test_dataset = TensorDataset(X_testTensor, X_testTensor)
    all_dataset = TensorDataset(X_allTensor, X_allTensor)

    X_trainDataLoader = DataLoader(dataset=train_dataset,
                                   batch_size=batch_size,
                                   shuffle=True)
    X_validDataLoader = DataLoader(dataset=valid_dataset,
                                   batch_size=batch_size,
                                   shuffle=True)
    X_allDataLoader = DataLoader(dataset=all_dataset,
                                 batch_size=batch_size,
                                 shuffle=True)

    # construct TensorDataset
    trainreducedDataset = TensorDataset(X_trainTensor, Y_trainTensor)
    validreducedDataset = TensorDataset(X_validTensor, Y_validTensor)

    trainDataLoader_p = DataLoader(dataset=trainreducedDataset,
                                   batch_size=batch_size,
                                   shuffle=True)
    validDataLoader_p = DataLoader(dataset=validreducedDataset,
                                   batch_size=batch_size,
                                   shuffle=True)

    dataloaders_train = {'train': trainDataLoader_p, 'val': validDataLoader_p}

    if (bool(args.pretrain) != False):
        dataloaders_pretrain = {
            'train': X_trainDataLoader,
            'val': X_validDataLoader
        }
        if reduce_model == "VAE":
            encoder = VAEBase(input_dim=data.shape[1],
                              latent_dim=dim_au_out,
                              h_dims=encoder_hdims)
        else:
            encoder = AEBase(input_dim=data.shape[1],
                             latent_dim=dim_au_out,
                             h_dims=encoder_hdims)

        if torch.cuda.is_available():
            encoder.cuda()

        logging.info(encoder)
        encoder.to(device)

        optimizer_e = optim.Adam(encoder.parameters(), lr=1e-2)
        loss_function_e = nn.MSELoss()
        exp_lr_scheduler_e = lr_scheduler.ReduceLROnPlateau(optimizer_e)

        if reduce_model == "AE":
            encoder, loss_report_en = t.train_AE_model(
                net=encoder,
                data_loaders=dataloaders_pretrain,
                optimizer=optimizer_e,
                loss_function=loss_function_e,
                n_epochs=epochs,
                scheduler=exp_lr_scheduler_e,
                save_path=encoder_path)
        elif reduce_model == "VAE":
            encoder, loss_report_en = t.train_VAE_model(
                net=encoder,
                data_loaders=dataloaders_pretrain,
                optimizer=optimizer_e,
                n_epochs=epochs,
                scheduler=exp_lr_scheduler_e,
                save_path=encoder_path)

        logging.info("Pretrained finished")

    # Train model of predictor

    if args.predictor == "DNN":
        if reduce_model == "AE":
            model = PretrainedPredictor(input_dim=X_train.shape[1],
                                        latent_dim=dim_au_out,
                                        h_dims=encoder_hdims,
                                        hidden_dims_predictor=preditor_hdims,
                                        output_dim=dim_model_out,
                                        pretrained_weights=encoder_path,
                                        freezed=bool(args.freeze_pretrain))
        elif reduce_model == "VAE":
            model = PretrainedVAEPredictor(
                input_dim=X_train.shape[1],
                latent_dim=dim_au_out,
                h_dims=encoder_hdims,
                hidden_dims_predictor=preditor_hdims,
                output_dim=dim_model_out,
                pretrained_weights=encoder_path,
                freezed=bool(args.freeze_pretrain),
                z_reparam=bool(args.VAErepram))

    elif args.predictor == "GCN":

        if reduce_model == "VAE":
            gcn_encoder = VAEBase(input_dim=data.shape[1],
                                  latent_dim=dim_au_out,
                                  h_dims=encoder_hdims)
        else:
            gcn_encoder = AEBase(input_dim=data.shape[1],
                                 latent_dim=dim_au_out,
                                 h_dims=encoder_hdims)

        gcn_encoder.load_state_dict(torch.load(args.GCNreduce_path))
        gcn_encoder.to(device)

        train_embeddings = gcn_encoder.encode(X_trainTensor)
        zOut_tr = train_embeddings.cpu().detach().numpy()
        valid_embeddings = gcn_encoder.encode(X_validTensor)
        zOut_va = valid_embeddings.cpu().detach().numpy()
        test_embeddings = gcn_encoder.encode(X_testTensor)
        zOut_te = test_embeddings.cpu().detach().numpy()

        adj_tr, edgeList_tr = g.generateAdj(
            zOut_tr,
            graphType='KNNgraphStatsSingleThread',
            para='euclidean' + ':' + str('10'),
            adjTag=True)
        adj_va, edgeList_va = g.generateAdj(
            zOut_va,
            graphType='KNNgraphStatsSingleThread',
            para='euclidean' + ':' + str('10'),
            adjTag=True)
        adj_te, edgeList_te = g.generateAdj(
            zOut_te,
            graphType='KNNgraphStatsSingleThread',
            para='euclidean' + ':' + str('10'),
            adjTag=True)

        Adj_trainTensor = preprocess_graph(adj_tr)
        Adj_validTensor = preprocess_graph(adj_va)
        Adj_testTensor = preprocess_graph(adj_te)

        Z_trainTensor = torch.FloatTensor(zOut_tr).to(device)
        Z_validTensor = torch.FloatTensor(zOut_va).to(device)
        Z_testTensor = torch.FloatTensor(zOut_te).to(device)

        if (args.binarizied == 0):
            zDiscret_tr = zOut_tr > np.mean(zOut_tr, axis=0)
            zDiscret_tr = 1.0 * zDiscret_tr
            zDiscret_va = zOut_va > np.mean(zOut_va, axis=0)
            zDiscret_va = 1.0 * zDiscret_va
            zDiscret_te = zOut_te > np.mean(zOut_te, axis=0)
            zDiscret_te = 1.0 * zDiscret_te

            Z_trainTensor = torch.FloatTensor(zDiscret_tr).to(device)
            Z_validTensor = torch.FloatTensor(zDiscret_va).to(device)
            Z_testTensor = torch.FloatTensor(zDiscret_te).to(device)

        ZTensors_train = {'train': Z_trainTensor, 'val': Z_validTensor}
        XTensors_train = {'train': X_trainTensor, 'val': X_validTensor}

        YTensors_train = {'train': Y_trainTensor, 'val': Y_validTensor}
        AdjTensors_train = {'train': Adj_trainTensor, 'val': Adj_validTensor}

        if (args.GCNfeature == "x"):
            dim_GCNin = X_allTensor.shape[1]
            GCN_trainTensors = XTensors_train
            GCN_testTensor = X_testTensor
        else:
            dim_GCNin = Z_testTensor.shape[1]
            GCN_trainTensors = ZTensors_train
            GCN_testTensor = Z_testTensor

        model = GCNPredictor(input_feat_dim=dim_GCNin,
                             hidden_dim1=encoder_hdims[0],
                             hidden_dim2=dim_au_out,
                             dropout=0.5,
                             hidden_dims_predictor=preditor_hdims,
                             output_dim=dim_model_out,
                             pretrained_weights=encoder_path,
                             freezed=bool(args.freeze_pretrain))

        # model2 = GAEBase(input_dim=X_train_all.shape[1], latent_dim=128,h_dims=[512])
        # model2.to(device)
        # test = model2((X_trainTensor,Adj_trainTensor))

    logging.info(model)
    if torch.cuda.is_available():
        model.cuda()
    model.to(device)

    # Define optimizer
    optimizer = optim.Adam(model.parameters(), lr=1e-2)

    if prediction == "regression":
        loss_function = nn.MSELoss()
    else:
        loss_function = nn.CrossEntropyLoss()

    exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer)

    if args.predictor == "GCN":
        model, report = t.train_GCNpreditor_model(model=model,
                                                  z=GCN_trainTensors,
                                                  y=YTensors_train,
                                                  adj=AdjTensors_train,
                                                  optimizer=optimizer,
                                                  loss_function=loss_function,
                                                  n_epochs=epochs,
                                                  scheduler=exp_lr_scheduler,
                                                  save_path=preditor_path)

    else:
        model, report = t.train_predictor_model(model,
                                                dataloaders_train,
                                                optimizer,
                                                loss_function,
                                                epochs,
                                                exp_lr_scheduler,
                                                load=load_model,
                                                save_path=preditor_path)
    if args.predictor != 'GCN':
        dl_result = model(X_testTensor).detach().cpu().numpy()
    else:
        dl_result = model(GCN_testTensor,
                          Adj_testTensor).detach().cpu().numpy()

    #torch.save(model.feature_extractor.state_dict(), preditor_path+"encoder.pkl")

    logging.info('Performances: R/Pearson/Mse/')

    if prediction == "regression":
        logging.info(r2_score(dl_result, Y_test))
        logging.info(pearsonr(dl_result.flatten(), Y_test.flatten()))
        logging.info(mean_squared_error(dl_result, Y_test))
    else:
        lb_results = np.argmax(dl_result, axis=1)
        #pb_results = np.max(dl_result,axis=1)
        pb_results = dl_result[:, 1]

        report_dict = classification_report(Y_test,
                                            lb_results,
                                            output_dict=True)
        report_df = pd.DataFrame(report_dict).T
        ap_score = average_precision_score(Y_test, pb_results)
        auroc_score = roc_auc_score(Y_test, pb_results)

        report_df['auroc_score'] = auroc_score
        report_df['ap_score'] = ap_score

        report_df.to_csv("saved/logs/" + reduce_model + args.predictor +
                         prediction + select_drug + now + '_report.csv')

        logging.info(classification_report(Y_test, lb_results))
        logging.info(average_precision_score(Y_test, pb_results))
        logging.info(roc_auc_score(Y_test, pb_results))

        model = DummyClassifier(strategy='stratified')
        model.fit(X_train, Y_train)
        yhat = model.predict_proba(X_test)
        naive_probs = yhat[:, 1]

        ut.plot_roc_curve(Y_test,
                          naive_probs,
                          pb_results,
                          title=str(roc_auc_score(Y_test, pb_results)),
                          path="saved/figures/" + reduce_model +
                          args.predictor + prediction + select_drug + now +
                          '_roc.pdf')
        ut.plot_pr_curve(Y_test,
                         pb_results,
                         title=average_precision_score(Y_test, pb_results),
                         path="saved/figures/" + reduce_model +
                         args.predictor + prediction + select_drug + now +
                         '_prc.pdf')
Exemple #4
0
def gae_for(args):
    print("Using {} dataset".format(args.dataset_str))
    adj, features, y_test, tx, ty, test_maks, true_labels = load_data(
        args.dataset_str)
    n_nodes, feat_dim = features.shape

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(
        adj)
    adj = adj_train

    # Before proceeding further, make the structure for doing deepWalk
    if args.dw == 1:
        print('Using deepWalk regularization...')
        G = load_edgelist_from_csr_matrix(adj_orig, undirected=True)
        print("Number of nodes: {}".format(len(G.nodes())))
        num_walks = len(G.nodes()) * args.number_walks
        print("Number of walks: {}".format(num_walks))
        data_size = num_walks * args.walk_length
        print("Data size (walks*length): {}".format(data_size))

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    # adj_label = sparse_to_tuple(adj_label)
    adj_label = torch.FloatTensor(adj_label.toarray())

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    if args.model == 'gcn_vae':
        model = GCNModelVAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
    else:
        model = GCNModelAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    if args.dw == 1:
        sg = SkipGram(args.hidden2, adj.shape[0])
        optimizer_dw = optim.Adam(sg.parameters(), lr=args.lr_dw)

        # Construct the nodes for doing random walk. Doing it before since the seed is fixed
        nodes_in_G = list(G.nodes())
        chunks = len(nodes_in_G) // args.number_walks
        random.Random().shuffle(nodes_in_G)

    hidden_emb = None
    for epoch in tqdm(range(args.epochs)):
        t = time.time()
        model.train()
        optimizer.zero_grad()
        z, mu, logvar = model(features, adj_norm)

        # After back-propagating gae loss, now do the deepWalk regularization
        if args.dw == 1:
            sg.train()
            if args.full_number_walks > 0:
                walks = build_deepwalk_corpus(G,
                                              num_paths=args.full_number_walks,
                                              path_length=args.walk_length,
                                              alpha=0,
                                              rand=random.Random(SEED))
            else:
                walks = build_deepwalk_corpus_iter(
                    G,
                    num_paths=args.number_walks,
                    path_length=args.walk_length,
                    alpha=0,
                    rand=random.Random(SEED),
                    chunk=epoch % chunks,
                    nodes=nodes_in_G)
            for walk in walks:
                if args.context == 1:
                    # Construct the pairs for predicting context node
                    # for each node, treated as center word
                    curr_pair = (int(walk[center_node_pos]), [])
                    for center_node_pos in range(len(walk)):
                        # for each window position
                        for w in range(-args.window_size,
                                       args.window_size + 1):
                            context_node_pos = center_node_pos + w
                            # make soure not jump out sentence
                            if context_node_pos < 0 or context_node_pos >= len(
                                    walk
                            ) or center_node_pos == context_node_pos:
                                continue
                            context_node_idx = walk[context_node_pos]
                            curr_pair[1].append(int(context_node_idx))
                else:
                    # first item in the walk is the starting node
                    curr_pair = (int(walk[0]), [
                        int(context_node_idx) for context_node_idx in walk[1:]
                    ])

                if args.ns == 1:
                    neg_nodes = []
                    pos_nodes = set(walk)
                    while len(neg_nodes) < args.walk_length - 1:
                        rand_node = random.randint(0, n_nodes - 1)
                        if rand_node not in pos_nodes:
                            neg_nodes.append(rand_node)
                    neg_nodes = torch.from_numpy(np.array(neg_nodes)).long()

                # Do actual prediction
                src_node = torch.from_numpy(np.array([curr_pair[0]])).long()
                tgt_nodes = torch.from_numpy(np.array(curr_pair[1])).long()
                optimizer_dw.zero_grad()
                log_pos = sg(src_node, tgt_nodes, neg_sample=False)
                if args.ns == 1:
                    loss_neg = sg(src_node, neg_nodes, neg_sample=True)
                    loss_dw = log_pos + loss_neg
                else:
                    loss_dw = log_pos
                loss_dw.backward(retain_graph=True)
                cur_dw_loss = loss_dw.item()
                optimizer_dw.step()

        loss = loss_function(preds=model.dc(z),
                             labels=adj_label,
                             mu=mu,
                             logvar=logvar,
                             n_nodes=n_nodes,
                             norm=norm,
                             pos_weight=pos_weight)
        loss.backward()
        cur_loss = loss.item()
        optimizer.step()

        hidden_emb = mu.data.numpy()
        roc_curr, ap_curr = get_roc_score(hidden_emb, adj_orig, val_edges,
                                          val_edges_false)

        if args.dw == 1:
            tqdm.write(
                "Epoch: {}, train_loss_gae={:.5f}, train_loss_dw={:.5f}, val_ap={:.5f}, time={:.5f}"
                .format(epoch + 1, cur_loss, cur_dw_loss, ap_curr,
                        time.time() - t))
        else:
            tqdm.write(
                "Epoch: {}, train_loss_gae={:.5f}, val_ap={:.5f}, time={:.5f}".
                format(epoch + 1, cur_loss, ap_curr,
                       time.time() - t))

        if (epoch + 1) % 10 == 0:
            tqdm.write("Evaluating intermediate results...")
            kmeans = KMeans(n_clusters=args.n_clusters,
                            random_state=0).fit(hidden_emb)
            predict_labels = kmeans.predict(hidden_emb)
            cm = clustering_metrics(true_labels, predict_labels)
            cm.evaluationClusterModelFromLabel(tqdm)
            roc_score, ap_score = get_roc_score(hidden_emb, adj_orig,
                                                test_edges, test_edges_false)
            tqdm.write('ROC: {}, AP: {}'.format(roc_score, ap_score))
            np.save('logs/emb_epoch_{}.npy'.format(epoch + 1), hidden_emb)

    tqdm.write("Optimization Finished!")

    roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges,
                                        test_edges_false)
    tqdm.write('Test ROC score: ' + str(roc_score))
    tqdm.write('Test AP score: ' + str(ap_score))
    kmeans = KMeans(n_clusters=args.n_clusters, random_state=0).fit(hidden_emb)
    predict_labels = kmeans.predict(hidden_emb)
    cm = clustering_metrics(true_labels, predict_labels)
    cm.evaluationClusterModelFromLabel(tqdm)

    if args.plot == 1:
        cm.plotClusters(tqdm, hidden_emb, true_labels)
Exemple #5
0
def gae_for(args, position):
    print("Using {} dataset".format(args.dataset_str))
    #qhashes, chashes = load_hashes()
    Q, X = load_data()
    prebuild = "/media/chundi/3b6b0f74-0ac7-42c7-b76b-00c65f5b3673/revisitop/cnnimageretrieval-pytorch/data/test/matlab_data/GEM_wDis_prebuild.bin"
    Q_features = "/media/chundi/3b6b0f74-0ac7-42c7-b76b-00c65f5b3673/revisitop/cnnimageretrieval-pytorch/data/test/matlab_data/roxford5k_GEM_lw_query_feats.npy"  #"/media/jason/cc0aeb62-0bc7-4f3e-99a0-3bba3dd9f8fc/landmarks/oxfordRe/evaluation/roxHD_query_fused.npy"
    X_features = "/media/chundi/3b6b0f74-0ac7-42c7-b76b-00c65f5b3673/revisitop/cnnimageretrieval-pytorch/data/test/matlab_data/roxford5k_GEM_index.npy"
    D_features = "/media/chundi/3b6b0f74-0ac7-42c7-b76b-00c65f5b3673/revisitop/cnnimageretrieval-pytorch/data/test/matlab_data/roxford5k_GEM_Dis.npy"
    adj, features, adj_Q, features_Q = load_from_prebuild(prebuild,
                                                          Q_features,
                                                          X_features,
                                                          D_features,
                                                          k=5)  # ----> 1M
    #cut_size = 800000
    #adj = adj[:cut_size, :cut_size]
    #adj_Q = adj_Q[:, :cut_size]
    #features = features[:cut_size]
    #Q = np.load("/media/jason/cc0aeb62-0bc7-4f3e-99a0-3bba3dd9f8fc/landmarks/oxfordRe/evaluation/roxHD_query_fused.npy").T.astype(np.float32)
    #X = np.load("/media/jason/cc0aeb62-0bc7-4f3e-99a0-3bba3dd9f8fc/landmarks/oxfordRe/evaluation/roxHD_index_fused.npy").T.astype(np.float32)
    #D = np.load("/media/jason/cc0aeb62-0bc7-4f3e-99a0-3bba3dd9f8fc/landmarks/revisitop1m/revisitDistractors_fused_3s_cq.npy").T.astype(np.float32)
    #X = np.concatenate((X.T,D.T)).T
    # load the distractor too, shape should be (2048, 1M)
    #adj, features = gen_graph_index(Q, X, k=5, k_qe=3, do_qe=False) #-----> 5k

    #adj_Q, features_Q = gen_graph(Q, X, k=5, k_qe=3, do_qe=False) #generate validation/revop evaluation the same way as training ----> 5k
    features_all = np.concatenate([features_Q, features])
    features_all = torch.from_numpy(features_all)
    #adj_Q = adj_Q.todense()
    #adj_all = np.concatenate([adj_Q, adj.todense()])
    #adj_all = np.pad(adj_all, [[0,0], [Q.shape[1], 0]], "constant")
    adj_all = sp.vstack((adj_Q, adj))
    zeros = sp.csr_matrix((adj_all.shape[0], Q.shape[1]))
    adj_all = sp.hstack((zeros, adj_all))
    adj_all = sp.csr_matrix(adj_all)
    rows, columns = adj_all.nonzero()
    print("Making Symmetry")
    for i in range(rows.shape[0]):
        if rows[i] < Q.shape[1]:
            adj_all[columns[i], rows[i]] = adj_all[rows[i], columns[i]]
        else:
            break
    #adj_all = sp.csr_matrix(adj_all)
    print("preprocessing adj_all")
    adj_all_norm = preprocess_graph(adj_all)
    #adj = add_neighbours_neighbour(adj)
    #adj1, features1 = load_data(args.dataset_str)
    features = torch.from_numpy(features)
    #features_all = torch.from_numpy(features_all)
    n_nodes, feat_dim = features.shape

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()
    adj = adj_orig

    #adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
    print("Sampling validation")
    adj_train, adj_val, features, features_valid = mask_test_rows(
        adj, features)
    adj = adj_train

    # Some preprocessing
    print("preprocessing adj")
    adj_norm = preprocess_graph(adj)
    #adj_norm_label = preprocess_graph_sp(adj)

    adj_label = adj_train + sp.eye(
        adj_train.shape[0]
    )  #adj_norm_label + sp.eye(adj_train.shape[0]) #adj_train + sp.eye(adj_train.shape[0])
    #rows, columns = adj_label.nonzero()
    #adj_label[columns, rows] = adj_label[rows, columns]
    # adj_label = sparse_to_tuple(adj_label)
    #adj_label = torch.FloatTensor(adj_label.toarray())

    print("adj sum: " + str(adj.sum()))
    pos_weight = float(float(adj.shape[0]) * adj.shape[0] -
                       adj.sum()) / adj.sum()
    print("top part: " +
          str(float(float(adj.shape[0]) * adj.shape[0] - adj.sum())))
    print("pos wieght: " + str(pos_weight))
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    # for validation data processing:
    zero = sp.csr_matrix((adj_train.shape[0], adj_val.shape[0]))
    adj_train_ext = sp.hstack((zero, adj_train))
    adj_evaluate = sp.vstack((adj_val, adj_train_ext))
    adj_evaluate = sp.csr_matrix(adj_evaluate)
    rows, columns = adj_evaluate.nonzero()
    val_edges = []
    val_edges_false = []
    pos = {}
    print("getting positive edges")
    all_val = [i for i in range(len(rows)) if rows[i] < adj_val.shape[0]]
    for i in all_val:
        sys.stdout.write("\r sampling edges for validtion: [" + str(i) + "]")
        val_edges.append((rows[i], columns[i]))
        if rows[i] not in pos:
            pos[rows[i]] = []
        pos[rows[i]].append(columns[i])
    #for i in range(rows.shape[0]):
    #    sys.stdout.write("\r sampling edges for validtion: [" + str(i) + "/" + str(adj_val.shape[0]) + "]")
    #    sys.stdout.flush()
    #    if rows[i] < adj_val.shape[0]:
    #        val_edges.append((rows[i], columns[i]))
    #        if rows[i] not in pos:
    #            pos[rows[i]] = []
    #        pos[rows[i]].append(columns[i])
    #        adj_evaluate[columns[i], rows[i]] = adj_evaluate[rows[i], columns[i]]
    #    else:
    #        break
    step = 0
    neg_per_pos = 100
    #for r in pos:
    #    p = pos[r]
    #neg_edges = Parallel(n_jobs=40)(delayed(neg_sample)(pos[i], adj_val.shape[1], neg_per_pos, i) for i in pos)
    #val_edges_false = [(i, item) for i in range(len(neg_edges)) for item in neg_edges[i]]
    #a = np.random.permutation(adj_val.shape[1])
    #a = [i for i in a if i not in p]
    #a = a[:100]
    #for i in a:
    #    val_edges_false.append((r, i))
    ##count = 0
    ##i = 0
    #sys.stdout.write("\r sampling neg edges for validtion: [" + str(step) + "/" + str(len(pos)) + "]")
    #sys.stdout.flush()
    #step += 1
    #while count < 100:
    #    if a[i] not in p:
    #        val_edges_false.append((r, a[i]))
    #        count += 1
    #    i += 1

    print("preprocessing adj_evaluate")
    adj_evaluate_norm = preprocess_graph(adj_evaluate)
    #adj_evaluate_norm_label = preprocess_graph_sp(adj_evaluate)

    adj_label_evaluate = adj_evaluate + sp.eye(
        adj_evaluate.shape[0]
    )  #adj_evaluate_norm_label+ sp.eye(adj_evaluate.shape[0]) #adj_evaluate + sp.eye(adj_evaluate.shape[0])
    #adj_label_evaluate = torch.FloatTensor(adj_label_evaluate.toarray()) #sparse_mx_to_torch_sparse_tensor(adj_label_evaluate)
    features_evaluate = np.concatenate([features, features_valid])
    features_evaluate = torch.from_numpy(features_evaluate)
    # validation done

    if mode == "VAE":
        model = GCNModelVAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
        adj_label = torch.FloatTensor(adj_label.toarray())
        adj_label_evaluate = torch.FloatTensor(adj_label_evaluate.toarray())
    elif mode == "AE":
        model = GCNModelAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
        #adj_label = torch.FloatTensor(adj_label.toarray())
        #adj_label_evaluate = torch.FloatTensor(adj_label_evaluate.toarray())
    elif mode == "VAE_batch":
        model = GCNModelVAE_batch(feat_dim, args.hidden1, args.hidden2,
                                  args.dropout)
    elif mode == "AE_batch":
        model = GCNModelAE_batch(feat_dim, args.hidden1, args.hidden2,
                                 args.dropout).cuda()
    #model = torch.nn.DataParallel(model)
    #model = model.cuda()

    # train_dataset = GAEDataset(adj_norm, adj_label, features)
    # train_loader = DataLoader(dataset=train_dataset, batch_size=args.batch_size,
    #                           shuffle=True, num_workers=8, pin_memory=True)
    train_ids = torch.tensor(range(features.shape[0]), dtype=torch.long)
    train_dataset = TensorDataset(train_ids)
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=8,
                              pin_memory=True)

    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    #optimizer = pSGLD(model.parameters(), lr=args.lr)
    #optimizer = optim.RMSprop(model.parameters(), lr=args.lr)

    hidden_emb = None
    pos_weight = torch.from_numpy(np.array(0.0, dtype=np.float32))
    #ipdb.set_trace()
    t = time.time()
    best = 0
    best_epoch = 0
    best_val_cost = 99999
    best_val_epoch = 0
    best_val_epoch_revop = 0
    prev_loss = 0
    prev_val_loss = 99999
    best_val_roc = 0
    best_val_ap = 0
    best_val_roc_revop = 0
    best_val_ap_revop = 0
    torch.set_num_threads(20)
    print("NUM THREADS USED")
    print(torch.get_num_threads())
    for epoch in range(args.epochs):
        #t = time.time()
        model.train()

        lossVal = 0
        lossValNorm = 0
        backtime = time.time()
        for batchID, (inds) in enumerate(train_loader):
            z = model(features, adj_norm)
            inds = inds[0]
            adj = F.relu(torch.mm(z[inds], z[inds].t()))
            preds = adj
            label_batch = torch.FloatTensor(adj_label[inds, :][:,
                                                               inds].toarray())

            cost = norm * F.binary_cross_entropy_with_logits(
                preds, label_batch, pos_weight=pos_weight)
            lossVal += cost.item()
            lossValNorm += 1

            optimizer.zero_grad()

            cost.backward(retain_graph=True)

            # if batchID == 0:
            #     cost.backward(retain_graph=True)
            # else:
            #     cost.backward()

            optimizer.step()
            if batchID >= 10:
                break

        backtime_done = time.time() - backtime
        sys.stdout.write("\r time taken to do epoch: " + str(backtime_done) +
                         "   opt: " + str(time.time() - backtime) + "\n")
        sys.stdout.flush()

        #optimizer.step()
        # sample rows only: non-square
        #for i in range(0, d.shape[0], sample_size):
        #    selection = random_perm[i:i+sample_size]
        #    to_keep = selection
        #    sample_adj_norm = sparse_mx_to_torch_sparse_tensor(sp.coo_matrix(d[to_keep, :]))
        #    sample_adj = adj[to_keep, :]
        #    sample_features = features
        #    recovered, mu, logvar = model(sample_features, sample_adj_norm)
        #    loss = loss_function(preds=recovered, labels=sample_adj_label,
        #                         mu=mu, logvar=logvar, n_nodes=n_nodes,
        #                         norm=norm, pos_weight=pos_weight)
        #    loss.backward()
        #    cur_loss = loss.item()
        #    optimizer.step()
        #    sys.stdout.write("\r                                                                                                                                       ")
        #    sys.stdout.write("\r" + "sampling [" + str(i) + "/" + str(d.shape[0]))
        #    sys.stdout.flush()

        # sample rows + take their postiives and add to rows (make it square)
        #for i in range(0, d.shape[0], sample_size):
        #    selection = random_perm[i:i+sample_size]
        #    to_keep = np.nonzero(d[selection, :])
        #    # (array([0, 1, 2, 2]), array([0, 1, 0, 1]))
        #    the_set = set(list(to_keep[0]) + list(to_keep[1]))
        #    temp = set(list(to_keep[0]))
        #    to_keep = list(the_set - column_exclude)
        #    # column_exclude.union(temp)
        #    # these ar ethe rows and columns that we need ne select
        #    sample_adj_norm = sparse_mx_to_torch_sparse_tensor(sp.coo_matrix(d[to_keep, :][:,to_keep]))
        #    sample_features = features[to_keep, :]
        #    sample_adj_label = adj_label[to_keep, :][:,to_keep]
        #    #print(samplei_adj_norm.shape)
        #    #print(sample_features.shape)
        #    #print(sample_adj_label.shape)
        #    #print(sample.shape)
        #    sample_adj = adj[to_keep, :][:,to_keep]
        #    pos_weight = float(sample_adj.shape[0] * sample_adj.shape[0] - sample_adj.sum()) / sample_adj.sum()
        #    pos_weight = torch.from_numpy(np.array(pos_weight))
        #    norm = sample_adj.shape[0] * sample_adj.shape[0] / float((sample_adj.shape[0] * sample_adj.shape[0] - sample_adj.sum()) * 2)

        #    n_nodes, feat_dim = sample_features.shape

        #    if mode == "VAE":
        #        recovered, mu, logvar = model(sample_features, sample_adj_norm)
        #        #recovered = recovered[i:i+500]
        #        #sample_adj_label = sample_adj_label[i:i+500]
        #        loss = loss_function(preds=recovered, labels=sample_adj_label,
        #                             mu=mu, logvar=logvar, n_nodes=n_nodes,
        #                             norm=norm, pos_weight=pos_weight)
        #    elif mode == "AE":
        #        recovered = model(sample_features, sample_adj_norm)
        #        loss = loss_function_ae(preds=recovered, labels=sample_adj_label,
        #                                norm=norm, pos_weight=pos_weight)
        #    loss.backward()
        #    cur_loss = loss.item()
        #    optimizer.step()
        #    sys.stdout.write("\r                                                                                                                                       ")
        #    sys.stdout.write("\r" + "sampling [" + str(i) + "/" + str(d.shape[0]) + "]....size of sample=" + str(len(sample_features)))
        #    sys.stdout.flush()

        sys.stdout.write(
            "\r                                                                                                                                          \r"
        )
        if (epoch + 1) % 1 == 0:
            model.eval()
            #adj_dense = adj_train.todense()
            #adj_val_dense = adj_val.todense()
            #adj_train_ext = np.pad(adj_dense, [[0,0], [adj_val_dense.shape[0], 0]], "constant")
            #adj_evaluate = np.concatenate([adj_val_dense, adj_train_ext])
            #zero = sp.csr_matrix((adj_train.shape[0], adj_val.shape[0]))
            #adj_train_ext = sp.hstack((zero, adj_train))
            #adj_evaluate = sp.vstack((adj_val, adj_train_ext))
            ##zeros = sp.csr_matrix((adj_evaluate.shape[0], adj_val.shape[1]))
            ##adj_evaluate = sp.hstack((zeros, adj_evaluate))
            #adj_evaluate = sp.csr_matrix(adj_evaluate)
            #rows, columns = adj_evaluate.nonzero()
            #for i in range(rows.shape[0]):
            #    if rows[i] < adj_val.shape[1]:
            #        adj_evaluate[columns[i], rows[i]] = adj_evaluate[rows[i], columns[i]]
            #    else:
            #        break

            #adj_evaluate_norm = preprocess_graph(adj_evaluate)

            #adj_label_evaluate = adj_evaluate + sp.eye(adj_evaluate.shape[0])
            #adj_label_evaluate = torch.FloatTensor(adj_label_evaluate.toarray())
            ##adj_label_evaluate = sparse_to_tuple(adj_label_evaluate)

            #features_evaluate = np.concatenate([features, features_valid])
            #features_evaluate = torch.from_numpy(features_evaluate)
            just_adj_evaluate = sparse_mx_to_torch_sparse_tensor(adj_evaluate)
            #recovered, mu, logvar = model(features_evaluate, just_adj_evaluate.coalesce().indices(), just_adj_evaluate.coalesce().values())
            #recovered, mu, logvar = model(features_evaluate, adj_evaluate_norm)
            #val_loss = loss_function(preds=recovered, labels=adj_label_evaluate,
            #                         mu=mu, logvar=logvar, n_nodes=n_nodes,
            #                         norm=norm, pos_weight=pos_weight)
            # if mode == "VAE":
            #     recovered, mu, logvar = model(features_evaluate, adj_evaluate_norm)
            #     val_loss = loss_function(preds=recovered, labels=adj_label_evaluate,
            #                              mu=mu, logvar=logvar, n_nodes=n_nodes,
            #                              norm=norm, pos_weight=pos_weight)
            # elif mode == "AE":
            #     mu = model(features_evaluate, adj_evaluate_norm)
            #     val_loss = loss_function_ae(preds=recovered, labels=adj_label_evaluate,
            #                              norm=norm, pos_weight=pos_weight)
            # elif mode == "VAE_batch":
            #     recovered, mu, logvar, val_loss = model(features_evaluate, adj_evaluate_norm, labels=adj_label_evaluate, n_nodes=n_nodes, norm=norm, pos_weight=pos_weight, opt=None, training=False)
            # elif mode == "AE_batch":
            #     recovered, mu, val_loss = model(features_evaluate, adj_evaluate_norm, labels=adj_label_evaluate, n_nodes=n_nodes, norm=norm, pos_weight=pos_weight, opt=None, training=False)
            # val_emb = mu.data.numpy()
            #roc_curr, ap_curr = get_roc_score(val_emb, val_edges, val_edges_false)

            # do one q at a time
            #revop_map = eval_each_q(model, adj_all, features_all, Q.shape[1])

            # hack by appending stuff on top of adj
            if mode == "VAE":
                _, mu, _ = model(features_all, adj_all_norm)
            elif mode == "AE":
                mu = model(features_all, adj_all_norm)
            elif mode == "VAE_batch":
                mu = model(features_all,
                           adj_all_norm,
                           None,
                           None,
                           None,
                           None,
                           None,
                           just_mu=True,
                           training=False)
            elif mode == "AE_batch":
                mu = model(features_all,
                           adj_all_norm,
                           None,
                           None,
                           None,
                           None,
                           None,
                           just_mu=True,
                           training=False)
            hidden_emb = mu.data.numpy()
            ## get validation loss
            #recovered, mu, logvar = model(features, adj_norm)
            #val_loss = loss_function(preds=recovered, labels=adj_label,
            #                         mu=mu, logvar=logvar, n_nodes=n_nodes,
            #                         norm=norm, pos_weight=pos_weight)
            revop_map = get_roc_score_matrix(hidden_emb, Q.shape[1])

            if best <= revop_map:
                emb = hidden_emb
                Q_end = Q.shape[1]
                best = revop_map
                best_epoch = epoch + 1
                # write it into a file and do egt on that
                #embQ = emb[:Q_end,:].T
                #embX = emb[Q_end:,:].T
                #np.save("/media/jason/28c9eee1-312e-47d0-88ce-572813ebd6f1/graph/gae-pytorch/best_embedding2.npy",hidden_emb)
                #concat = np.concatenate((embQ.T,embX.T))
                #revop_inner_prod = np.matmul(concat, concat.T)
                #revop_preds = np.argsort(-revop_inner_prod,axis=0)
                #if revop_map > 54:
                #    f = open("best_result.txt", "w")
                #    for i in range(revop_preds.shape[1]):
                #        if i < Q_end:
                #            f.write(qhashes[i] + ",")
                #        else:
                #            f.write(chashes[i - Q_end] + ",")
                #        for j in revop_preds[:,i]:
                #            if j < Q_end:
                #                f.write(qhashes[j] + " " + str(int(revop_inner_prod[j,i] * 1000)) + " ")
                #            else:
                #                f.write(chashes[j - Q_end] + " " + str(int(revop_inner_prod[j,i] * 1000)) + " ")
                #        f.write("\n")
                #        f.flush()
                #        #for j in range()
                #    f.close()

            if best_val_cost > -99.0:  #prev_val_loss - val_loss > 0 and prev_val_loss - val_loss > prev_loss - cur_loss and best_val_cost > val_loss:
                best_val_cost = -99.0
                best_val_epoch = epoch + 1
                best_val_epoch_revop = revop_map

            #if best_val_roc < roc_curr:
            #    best_val_roc = roc_curr
            #    best_val_roc_revop = revop_map

            #if best_val_ap < ap_curr:
            #    best_val_ap = ap_curr
            #    best_val_ap_revop = revop_map
            print(
                "Epoch:",
                '%04d' % (epoch + 1),
                "train_loss=",
                "{:.5f}".format(lossVal / lossValNorm),
                "val_loss=",
                "{:.5f}".format(-99.0),
                #"val_roc_curr=", "{:.5f}".format(roc_curr),
                #"val_ap_curr=", "{:.5f}".format(ap_curr),
                "revop=",
                "{:.5f}".format(revop_map),
                "best_revop=",
                "{:.5f}".format(best),
                "revop_at_best_val=",
                "{:.5f}".format(best_val_epoch_revop),
                #"revop_at_best_val_roc=", "{:.5f}".format(best_val_roc_revop),
                #"revop_at_best_ap_roc=", "{:.5f}".format(best_val_ap_revop),
                "time=",
                "{:.5f}".format(time.time() - t))
            prev_val_loss = -99.0
            prev_loss = -99.0
            t = time.time()
    print("Optimization Finished!")

    #roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false)
    #print('Test ROC score: ' + str(roc_score))
    #print('Test AP score: ' + str(ap_score))
    return best, best_val_epoch_revop, best_val_roc_revop, best_val_ap_revop
Exemple #6
0
def GAEembedding(z, adj, args):
    '''
    GAE embedding for clustering
    Param:
        z,adj
    Return:
        Embedding from graph
    '''
    # true_labels = np.asarray(true_labels)

    # args.model = 'gcn_vae'
    # args.dw    = 0
    # args.epochs = 200
    # args.hidden1 = 32
    # args.hidden2 = 16
    # args.lr      = 0.01
    # args.dropout = 0.
    # args.dataset_sr = 'cora'
    # args.walk_length = 5
    # args.window_size = 3
    # args.number_walks = 5
    # args.full_number_walks =0
    # args.lr_dw   = 0.001
    # args.context = 0
    # args.ns = 1
    # args.n_clusters = 11
    # args.plot = 0

    # featrues from z
    # Louvain
    features = z
    # features = torch.DoubleTensor(features)
    features = torch.FloatTensor(features)

    # Old implementation
    # adj, features, y_test, tx, ty, test_maks, true_labels = load_data(args.dataset_str)

    n_nodes, feat_dim = features.shape

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(
        adj)
    adj = adj_train

    # Before proceeding further, make the structure for doing deepWalk
    # if args.dw == 1:
    #     print('Using deepWalk regularization...')
    #     G = load_edgelist_from_csr_matrix(adj_orig, undirected=True)
    #     print("Number of nodes: {}".format(len(G.nodes())))
    #     num_walks = len(G.nodes()) * args.number_walks
    #     print("Number of walks: {}".format(num_walks))
    #     data_size = num_walks * args.walk_length
    #     print("Data size (walks*length): {}".format(data_size))

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    # adj_label = sparse_to_tuple(adj_label)
    # adj_label = torch.DoubleTensor(adj_label.toarray())
    adj_label = torch.FloatTensor(adj_label.toarray())

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    if args.GAEmodel == 'gcn_vae':
        model = GCNModelVAE(feat_dim, args.GAEhidden1, args.GAEhidden2,
                            args.GAEdropout)
    else:
        model = GCNModelAE(feat_dim, args.GAEhidden1, args.GAEhidden2,
                           args.GAEdropout)
    if args.precisionModel == 'Double':
        model = model.double()
    optimizer = optim.Adam(model.parameters(), lr=args.GAElr)

    # if args.dw == 1:
    #     sg = SkipGram(args.hidden2, adj.shape[0])
    #     optimizer_dw = optim.Adam(sg.parameters(), lr=args.lr_dw)

    #     # Construct the nodes for doing random walk. Doing it before since the seed is fixed
    #     nodes_in_G = list(G.nodes())
    #     chunks = len(nodes_in_G) // args.number_walks
    #     random.Random().shuffle(nodes_in_G)

    hidden_emb = None
    for epoch in tqdm(range(args.GAEepochs)):
        t = time.time()
        # mem=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        # print('Mem consumption before training: '+str(mem))
        model.train()
        optimizer.zero_grad()
        z, mu, logvar = model(features, adj_norm)

        # After back-propagating gae loss, now do the deepWalk regularization
        # if args.dw == 1:
        #     sg.train()
        #     if args.full_number_walks > 0:
        #         walks = build_deepwalk_corpus(G, num_paths=args.full_number_walks,
        #                                       path_length=args.walk_length, alpha=0,
        #                                       rand=random.Random(SEED))
        #     else:
        #         walks = build_deepwalk_corpus_iter(G, num_paths=args.number_walks,
        #                                            path_length=args.walk_length, alpha=0,
        #                                            rand=random.Random(SEED),
        #                                            chunk=epoch % chunks,
        #                                            nodes=nodes_in_G)
        #     for walk in walks:
        #         if args.context == 1:
        #             # Construct the pairs for predicting context node
        #             # for each node, treated as center word
        #             curr_pair = (int(walk[center_node_pos]), [])
        #             for center_node_pos in range(len(walk)):
        #                 # for each window position
        #                 for w in range(-args.window_size, args.window_size + 1):
        #                     context_node_pos = center_node_pos + w
        #                     # make soure not jump out sentence
        #                     if context_node_pos < 0 or context_node_pos >= len(walk) or center_node_pos == context_node_pos:
        #                         continue
        #                     context_node_idx = walk[context_node_pos]
        #                     curr_pair[1].append(int(context_node_idx))
        #         else:
        #             # first item in the walk is the starting node
        #             curr_pair = (int(walk[0]), [int(context_node_idx) for context_node_idx in walk[1:]])

        #         if args.ns == 1:
        #             neg_nodes = []
        #             pos_nodes = set(walk)
        #             while len(neg_nodes) < args.walk_length - 1:
        #                 rand_node = random.randint(0, n_nodes - 1)
        #                 if rand_node not in pos_nodes:
        #                     neg_nodes.append(rand_node)
        #             neg_nodes = torch.from_numpy(np.array(neg_nodes)).long()

        #         # Do actual prediction
        #         src_node = torch.from_numpy(np.array([curr_pair[0]])).long()
        #         tgt_nodes = torch.from_numpy(np.array(curr_pair[1])).long()
        #         optimizer_dw.zero_grad()
        #         log_pos = sg(src_node, tgt_nodes, neg_sample=False)
        #         if args.ns == 1:
        #             loss_neg = sg(src_node, neg_nodes, neg_sample=True)
        #             loss_dw = log_pos + loss_neg
        #         else:
        #             loss_dw = log_pos
        #         loss_dw.backward(retain_graph=True)
        #         cur_dw_loss = loss_dw.item()
        #         optimizer_dw.step()

        loss = loss_function(preds=model.dc(z),
                             labels=adj_label,
                             mu=mu,
                             logvar=logvar,
                             n_nodes=n_nodes,
                             norm=norm,
                             pos_weight=pos_weight)
        loss.backward()
        cur_loss = loss.item()
        optimizer.step()

        hidden_emb = mu.data.numpy()
        # TODO, this is prediction
        # roc_curr, ap_curr = get_roc_score(hidden_emb, adj_orig, val_edges, val_edges_false)
        ap_curr = 0

        # if args.dw == 1:
        #     tqdm.write("Epoch: {}, train_loss_gae={:.5f}, train_loss_dw={:.5f}, val_ap={:.5f}, time={:.5f}".format(
        #         epoch + 1, cur_loss, cur_dw_loss,
        #         ap_curr, time.time() - t))
        # else:
        tqdm.write(
            "Epoch: {}, train_loss_gae={:.5f}, val_ap={:.5f}, time={:.5f}".
            format(epoch + 1, cur_loss, ap_curr,
                   time.time() - t))

        # if (epoch + 1) % 10 == 0:
        #     tqdm.write("Evaluating intermediate results...")
        #     kmeans = KMeans(n_clusters=args.n_clusters, random_state=0).fit(hidden_emb)
        #     predict_labels = kmeans.predict(hidden_emb)
        #     cm = clustering_metrics(true_labels, predict_labels)
        #     cm.evaluationClusterModelFromLabel(tqdm)
        #     roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false)
        #     tqdm.write('ROC: {}, AP: {}'.format(roc_score, ap_score))
        #     np.save('logs/emb_epoch_{}.npy'.format(epoch + 1), hidden_emb)

    tqdm.write("Optimization Finished!")

    roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges,
                                        test_edges_false)
    tqdm.write('Test ROC score: ' + str(roc_score))
    tqdm.write('Test AP score: ' + str(ap_score))
    # kmeans = KMeans(n_clusters=args.n_clusters, random_state=0).fit(hidden_emb)
    # predict_labels = kmeans.predict(hidden_emb)
    # cm = clustering_metrics(true_labels, predict_labels)
    # cm.evaluationClusterModelFromLabel(tqdm)

    # if args.GAEplot == 1:
    #     cm.plotClusters(tqdm, hidden_emb, true_labels)

    return hidden_emb
Exemple #7
0
def gae_for(args):
    print("Using {} dataset".format(args.dataset_str))
    adj, features = load_data(args.dataset_str)
    n_nodes, feat_dim = features.shape

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(
        adj)
    adj = adj_train

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    # adj_label = sparse_to_tuple(adj_label)
    adj_label = torch.FloatTensor(adj_label.toarray())

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    lst_result = []
    for i in range(10):
        model = GCNModelVAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
        optimizer = optim.Adam(model.parameters(), lr=args.lr)

        hidden_emb = None
        max_roc_ap = 0
        for epoch in range(args.epochs):
            t = time.time()
            model.train()
            optimizer.zero_grad()
            recovered, mu, logvar = model(features, adj_norm)
            loss = loss_function(preds=recovered,
                                 labels=adj_label,
                                 mu=mu,
                                 logvar=logvar,
                                 n_nodes=n_nodes,
                                 norm=norm,
                                 pos_weight=pos_weight)
            loss.backward()
            cur_loss = loss.item()
            optimizer.step()

            hidden_emb = mu.data.numpy()

            roc_curr, ap_curr = get_roc_score(hidden_emb, adj_orig, val_edges,
                                              val_edges_false)
            roc_ap = roc_curr + ap_curr
            if max_roc_ap < roc_ap:
                max_roc_ap = roc_ap
                h_emb_best_model = hidden_emb

            print("Epoch:", '%04d' % (epoch + 1), "train_loss=",
                  "{:.5f}".format(cur_loss), "val_ap=",
                  "{:.5f}".format(ap_curr), "time=",
                  "{:.5f}".format(time.time() - t))
            roc_score, ap_score = get_roc_score(hidden_emb, adj_orig,
                                                test_edges, test_edges_false)
            print('Test ROC score: ' + str(roc_score))
            print('Test AP score: ' + str(ap_score))
            print("---------------------------------------")
        print("Optimization Finished!: ", i)
        roc_score, ap_score = get_roc_score(h_emb_best_model, adj_orig,
                                            test_edges, test_edges_false)

        # roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false)
        lst_result.append([i, roc_score, ap_score])

        print('Test ROC score: ' + str(roc_score))
        print('Test AP score: ' + str(ap_score))
    lst_result = np.array(lst_result)
    csv_info = np.append(
        lst_result,
        [["mean", np.mean(lst_result[:, 1]),
          np.mean(lst_result[:, 2])]],
        axis=0)
    csv_info = np.append(
        csv_info,
        [["std", np.std(lst_result[:, 1]),
          np.std(lst_result[:, 2])]],
        axis=0)

    t = int(time.time())
    folder = Path(os.path.join(os.getcwd(), "csv"))
    csv_name = "{}_{}_{}_{}_{}.csv".format(args.dataset_str, args.epochs,
                                           args.hidden1, args.hidden2, t)

    df = pd.DataFrame(csv_info, columns=['run', 'ROC', "AP"])
    df.to_csv(os.path.join(folder, csv_name))
Exemple #8
0
def GAEembedding(z, adj, args):
    '''
    GAE embedding for clustering
    Param:
        z,adj
    Return:
        Embedding from graph
    '''
    # featrues from z
    # Louvain
    features = z
    # features = torch.DoubleTensor(features)
    features = torch.FloatTensor(features)

    # Old implementation
    # adj, features, y_test, tx, ty, test_maks, true_labels = load_data(args.dataset_str)

    n_nodes, feat_dim = features.shape

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(
        adj)
    adj = adj_train

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    # adj_label = sparse_to_tuple(adj_label)
    # adj_label = torch.DoubleTensor(adj_label.toarray())
    adj_label = torch.FloatTensor(adj_label.toarray())

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    if args.GAEmodel == 'gcn_vae':
        model = GCNModelVAE(feat_dim, args.GAEhidden1, args.GAEhidden2,
                            args.GAEdropout)
    else:
        model = GCNModelAE(feat_dim, args.GAEhidden1, args.GAEhidden2,
                           args.GAEdropout)
    if args.precisionModel == 'Double':
        model = model.double()
    optimizer = optim.Adam(model.parameters(), lr=args.GAElr)

    hidden_emb = None
    for epoch in tqdm(range(args.GAEepochs)):
        t = time.time()
        # mem=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        # print('Mem consumption before training: '+str(mem))
        model.train()
        optimizer.zero_grad()
        z, mu, logvar = model(features, adj_norm)

        loss = loss_function(preds=model.dc(z),
                             labels=adj_label,
                             mu=mu,
                             logvar=logvar,
                             n_nodes=n_nodes,
                             norm=norm,
                             pos_weight=pos_weight)
        loss.backward()
        cur_loss = loss.item()
        optimizer.step()

        hidden_emb = mu.data.numpy()
        # TODO, this is prediction
        # roc_curr, ap_curr = get_roc_score(hidden_emb, adj_orig, val_edges, val_edges_false)
        ap_curr = 0

        tqdm.write(
            "Epoch: {}, train_loss_gae={:.5f}, val_ap={:.5f}, time={:.5f}".
            format(epoch + 1, cur_loss, ap_curr,
                   time.time() - t))

    tqdm.write("Optimization Finished!")

    roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges,
                                        test_edges_false)
    tqdm.write('Test ROC score: ' + str(roc_score))
    tqdm.write('Test AP score: ' + str(ap_score))

    return hidden_emb