Example 1
def test_gae():
    model = GAE(encoder=lambda x: x)
    model.reset_parameters()

    x = torch.Tensor([[1, -1], [1, 2], [2, 1]])
    z = model.encode(x)
    assert z.tolist() == x.tolist()

    adj = model.decode(z)
    assert adj.tolist() == torch.sigmoid(
        torch.Tensor([[+2, -1, +1], [-1, +5, +4], [+1, +4, +5]])).tolist()

    edge_index = torch.tensor([[0, 1], [1, 2]])
    value = model.decode_indices(z, edge_index)
    assert value.tolist() == torch.sigmoid(torch.Tensor([-1, 4])).tolist()

    edge_index = torch.tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                               [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
    data = Data(edge_index=edge_index)
    data = model.split_edges(data, val_ratio=0.2, test_ratio=0.3)

    assert data.val_pos_edge_index.size() == (2, 2)
    assert data.val_neg_edge_index.size() == (2, 2)
    assert data.test_pos_edge_index.size() == (2, 3)
    assert data.test_neg_edge_index.size() == (2, 3)
    assert data.train_pos_edge_index.size() == (2, 5)
    assert data.train_neg_adj_mask.size() == (11, 11)
    assert data.train_neg_adj_mask.sum().item() == (11**2 - 11) / 2 - 4 - 6 - 5

    z = torch.randn(11, 16)
    loss = model.recon_loss(z, data.train_pos_edge_index)
    assert loss.item() > 0

    auc, ap = model.test(z, data.val_pos_edge_index, data.val_neg_edge_index)
    assert auc >= 0 and auc <= 1 and ap >= 0 and ap <= 1
Example 2
def test_gae():
    model = GAE(encoder=lambda x: x)
    model.reset_parameters()

    x = torch.Tensor([[1, -1], [1, 2], [2, 1]])
    z = model.encode(x)
    assert z.tolist() == x.tolist()

    adj = model.decoder.forward_all(z)
    assert adj.tolist() == torch.sigmoid(
        torch.Tensor([[+2, -1, +1], [-1, +5, +4], [+1, +4, +5]])).tolist()

    edge_index = torch.tensor([[0, 1], [1, 2]])
    value = model.decode(z, edge_index)
    assert value.tolist() == torch.sigmoid(torch.Tensor([-1, 4])).tolist()

    edge_index = torch.tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                               [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
    data = Data(edge_index=edge_index)
    data.num_nodes = edge_index.max().item() + 1
    data = train_test_split_edges(data, val_ratio=0.2, test_ratio=0.3)

    z = torch.randn(11, 16)
    loss = model.recon_loss(z, data.train_pos_edge_index)
    assert loss.item() > 0

    auc, ap = model.test(z, data.val_pos_edge_index, data.val_neg_edge_index)
    assert auc >= 0 and auc <= 1 and ap >= 0 and ap <= 1
Example 3
def test_gae():
    model = GAE(encoder=lambda x: x)
    model.reset_parameters()

    x = torch.Tensor([[1, -1], [1, 2], [2, 1]])
    z = model.encode(x)
    assert z.tolist() == x.tolist()

    adj = model.decoder.forward_all(z)
    assert adj.tolist() == torch.sigmoid(
        torch.Tensor([[+2, -1, +1], [-1, +5, +4], [+1, +4, +5]])).tolist()

    edge_index = torch.tensor([[0, 1], [1, 2]])
    value = model.decode(z, edge_index)
    assert value.tolist() == torch.sigmoid(torch.Tensor([-1, 4])).tolist()

    edge_index = torch.tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                               [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
    data = Data(edge_index=edge_index, num_nodes=11)
    transform = RandomLinkSplit(split_labels=True,
                                add_negative_train_samples=False)
    train_data, val_data, test_data = transform(data)

    z = torch.randn(11, 16)
    loss = model.recon_loss(z, train_data.pos_edge_label_index)
    assert loss.item() > 0

    auc, ap = model.test(z, val_data.pos_edge_label_index,
                         val_data.neg_edge_label_index)
    assert auc >= 0 and auc <= 1 and ap >= 0 and ap <= 1
Example 4
    def fit_model_once(self):

        #GCN
        if self.model_type == "GCN":
            encoder = EncoderGCN(in_channels=self.dataset.num_features,
                                 out_channels=32)

        #SAGE
        if self.model_type == "SAGE":
            encoder = EncoderSAGE(in_channels=self.dataset.num_features,
                                  out_channels=32)

        #GIN
        if self.model_type == "GIN":
            encoder = EncoderGIN(in_channels=self.dataset.num_features,
                                 out_channels=32)

        #GAT
        if self.model_type == "GAT":
            encoder = EncoderGAT(in_channels=self.dataset.num_features,
                                 out_channels=16,
                                 heads=8)

        #AGNN
        if self.model_type == "AGNN":
            encoder = EncoderAGNN(in_channels=self.dataset.num_features,
                                  out_channels=16)

        #GraphUNet
        if self.model_type == "GraphUNet":
            encoder = EncoderGraphUNet(in_channels=self.dataset.num_features,
                                       hidden_channels=32,
                                       out_channels=16)

        model = GAE(encoder=encoder, decoder=InnerProductDecoder())

        optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

        trainer = TrainerGae(model,
                             self.device,
                             self.data,
                             writer_path='runs/{}/{}/'.format(
                                 self.model_type, self.text_encoding) +
                             self.time_mark())
        model = trainer.fit(optimizer,
                            patience=self.patience,
                            num_epochs=self.num_epochs)

        auc, ap = trainer.evaluate(validation=False, test=True)
        return model, auc, ap
Example 5
def load_data(dataset_name):
    if dataset_name in ['cora', 'citeseer', 'pubmed']:
        path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '.',
                            'data', dataset_name)
        data = Planetoid(path, dataset_name)[0]
    else:
        data = load_wiki.load_data()

    data.edge_index = gutils.to_undirected(data.edge_index)
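    # Note: split_edges is called as an unbound method here, with the GAE class
    # itself passed in place of an instance (the old split_edges API appears not
    # to use `self`, so this works in older PyG versions).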
    data = GAE.split_edges(GAE, data)

    features = data.x.numpy()
    train_pos_edges = data.train_pos_edge_index.numpy()
    train_neg_edges = sample_negative(count=train_pos_edges.shape[1],
                                      avoid=train_pos_edges,
                                      nodes=features.shape[0])

    x_tr, y_tr = combine_node_pair_features(features, train_pos_edges,
                                            train_neg_edges)
    x_val, y_val = combine_node_pair_features(features,
                                              data.val_pos_edge_index.numpy(),
                                              data.val_neg_edge_index.numpy())
    x_test, y_test = combine_node_pair_features(
        features, data.test_pos_edge_index.numpy(),
        data.test_neg_edge_index.numpy())
    return x_tr, y_tr, x_val, y_val, x_test, y_test
Example 6
def test_init():
    encoder = torch.nn.Linear(16, 32)
    decoder = torch.nn.Linear(32, 16)
    discriminator = torch.nn.Linear(32, 1)

    GAE(encoder, decoder)
    VGAE(encoder, decoder)
    ARGA(encoder, discriminator, decoder)
    ARGVA(encoder, discriminator, decoder)
Example 7
    def fit_model_once(self):
        gcn_model = GAE(encoder=EncoderGCN(
            in_channels=self.dataset.num_features, out_channels=32),
                        decoder=InnerProductDecoder())

        optimizer = torch.optim.Adam(gcn_model.parameters(), lr=0.01)

        trainer_gcn = TrainerGae(
            gcn_model,
            self.device,
            self.data,
            writer_path='runs/gae_gcn/{}/'.format(self.feature) +
            self.time_mark())
        gcn_model = trainer_gcn.fit(optimizer,
                                    patience=self.patience,
                                    num_epochs=self.num_epochs)

        auc, ap = trainer_gcn.evaluate(validation=False, test=True)
        return auc, ap
Example 8
def get_model_and_optimizer(training_method, dataset_name, features_dimension, device):
    training_method_signature = 'BP' if training_method == 'bp' else 'ALT'

    if training_method_signature == 'BP':
        model = GAE(GraphEncoder(features_dimension, 16))
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    else:
        model = GAE(DFAGraphEncoder(features_dimension, 16, training_method=training_method))
        if dataset_name == 'cora':
            optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
        elif dataset_name == 'citeseer':
            optimizer = torch.optim.Adam(model.parameters(), lr=0.02)
        elif dataset_name == 'pubmed':
            optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    return model.to(device), optimizer
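
# Hypothetical usage of the helper above; the feature dimension and device
# below are illustrative values, not taken from the original code.
model, optimizer = get_model_and_optimizer(
    training_method='bp', dataset_name='cora',
    features_dimension=1433, device=torch.device('cpu'))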
Example 9
    def __init__(self, data, embed_dim, **kwargs):
        super(UnsGAE, self).__init__()
        self.data = data
        self.input_dim = self.data.dim
        self.embed_dim = embed_dim

        # for now, we only work with 2-layer encoders
        self.hidden_dim = kwargs.get('hidden_dim', 2*embed_dim)
        self.encoder = kwargs.get('encoder', batched_SAGEEncoder)
        self.encoder = self.encoder(self.input_dim,
                                    self.hidden_dim,
                                    self.embed_dim)
        self.model = GAE(self.encoder)

        # preparing the device 
        device = kwargs.get('device', 'cuda')
        if device == 'cuda' and not torch.cuda.is_available():
            print('CUDA is not available in PyTorch; the model '
                  'will be initialized on CPU.')
            device = 'cpu'
        self.device = torch.device(device)
Example 10
def train(dataset, args, writer = None):
    task = args.task
    test_loader = loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)

    if task == 'link':
        model = GAE(models.GNNStack(dataset.num_node_features, args.hidden_dim, int(dataset.num_classes), 
                            args))
    elif task == 'node':
        model = models.GNNStack(dataset.num_node_features, args.hidden_dim, int(dataset.num_classes), 
                            args)
    else:
        raise RuntimeError("Unknown task.")
    metrics_for_labels = args.metrics_for_labels == 'True'
    scheduler, opt = build_optimizer(args, model.parameters())
    print("Training \nModel: {}, Data representation: {}. Dataset: {}, Task type: {}". format(args.model_name, args.graph_type, args.dataset, args.task))
    metric_text = 'test accuracy' if task == 'node' else 'test precision'
    for epoch in range(args.epochs):
        total_loss = 0
        model.train()
        for batch in loader:
            opt.zero_grad()
            if task == 'node':
                pred = model(batch)
                label = batch.y
                pred = pred[batch.train_mask]
                label = label[batch.train_mask]
                loss = model.loss(pred, label)
            else:
                train_pos_edge_index = batch.train_pos_edge_index
                z = model.encode(batch)
                loss = model.recon_loss(z, train_pos_edge_index)
            loss.backward()
            opt.step()
            total_loss += loss.item() * batch.num_graphs
        total_loss /= len(loader.dataset)
        if writer is None:
            print(total_loss)
        else:
            writer.add_scalar("loss", total_loss, epoch)

        if epoch % 10 == 0:
            test_metric, _ = test(loader, model, task = task)
            if writer is None:
                print(test_metric, metric_text)
            else:
                writer.add_scalar(metric_text, test_metric, epoch)
        if metrics_for_labels and epoch == args.epochs - 1:
            _, labels_metrics = test(loader, model, task = task, metrics_for_labels=metrics_for_labels)
            print('{} for labels:\n {}'.format(metric_text, labels_metrics))
Example 11
def load_data(dataset_name):
    path = osp.join(osp.dirname(osp.realpath(__file__)), '.', 'data',
                    dataset_name)

    dataset = Planetoid(path, dataset_name, T.TargetIndegree())
    num_features = dataset.num_features
    data = GAE.split_edges(GAE, dataset[0])

    data.train_pos_edge_index = gutils.to_undirected(data.train_pos_edge_index)
    data.val_pos_edge_index = gutils.to_undirected(data.val_pos_edge_index)
    data.test_pos_edge_index = gutils.to_undirected(data.test_pos_edge_index)

    data.edge_index = torch.cat([
        data.train_pos_edge_index, data.val_pos_edge_index,
        data.test_pos_edge_index
    ],
                                dim=1)

    data.edge_train_mask = torch.cat([
        torch.ones((data.train_pos_edge_index.size(-1))),
        torch.zeros((data.val_pos_edge_index.size(-1))),
        torch.zeros((data.test_pos_edge_index.size(-1)))
    ],
                                     dim=0).byte()
    data.edge_val_mask = torch.cat([
        torch.zeros((data.train_pos_edge_index.size(-1))),
        torch.ones((data.val_pos_edge_index.size(-1))),
        torch.zeros((data.test_pos_edge_index.size(-1)))
    ],
                                   dim=0).byte()
    data.edge_test_mask = torch.cat([
        torch.zeros((data.train_pos_edge_index.size(-1))),
        torch.zeros((data.val_pos_edge_index.size(-1))),
        torch.ones((data.test_pos_edge_index.size(-1)))
    ],
                                    dim=0).byte()

    data.edge_type = torch.zeros(((data.edge_index.size(-1)), )).long()

    data.batch = torch.zeros((1, data.num_nodes), dtype=torch.int64).view(-1)
    data.num_graphs = 1
    return data, num_features
Example 12
data = dataset[0]


class Encoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(Encoder, self).__init__()
        self.conv1 = GCNConv(in_channels, 2 * out_channels)
        self.conv2 = GCNConv(2 * out_channels, out_channels)

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        return self.conv2(x, edge_index)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GAE(Encoder(dataset.num_features, out_channels=16)).to(device)
data.train_mask = data.val_mask = data.test_mask = data.y = None
data = model.split_edges(data)
x, edge_index = data.x.to(device), data.edge_index.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


def train():
    model.train()
    optimizer.zero_grad()
    z = model.encode(x, edge_index)
    loss = model.loss(z, data.train_pos_edge_index, data.train_neg_adj_mask)
    loss.backward()
    optimizer.step()
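
# A minimal evaluation sketch to pair with train() above (not part of the
# original snippet); it reuses `model`, `x`, `edge_index` and the split
# `data`, and relies on GAE.test returning (AUC, AP).
def test(pos_edge_index, neg_edge_index):
    model.eval()
    with torch.no_grad():
        z = model.encode(x, edge_index)
    return model.test(z, pos_edge_index.to(device), neg_edge_index.to(device))


for epoch in range(1, 201):
    train()
    auc, ap = test(data.val_pos_edge_index, data.val_neg_edge_index)
    print('Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, auc, ap))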

Example 13
    link_probs = link_logits.sigmoid()
    link_labels = get_link_labels(pos_edge_index, neg_edge_index)
    perfs.append(roc_auc_score(link_labels, link_probs))
    return perfs


# --------------------------------------------------------------------------------------------------------

# --------------------------------------------------------------------------------------------------------
train_loader = NeighborSampler(data.train_pos_edge_index,
                               batch_size=8,
                               shuffle=True,
                               sizes=[5, 5])
# subgraph_loader = NeighborSampler(data.edge_index, node_idx=None, sizes=[-1], batch_size=128, shuffle=False)

enc = Encoder()
decod = DEDICOMDecoder()
model = GAE(enc, decod)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

# ----------------------------------------------------------------------------------------------------------

# ----------------------------------------------------------------------------------------------------------
best_val_perf = test_perf = 0
for epoch in range(1, 3):
    train_loss = train()
    print(epoch, train_loss)
    # val_perf = test()
    # log = 'Epoch: {:03d}, Loss: {:.4f}, Val: {:.4f}'
    # print(log.format(epoch, train_loss, val_perf[0]))
Example 14
def run_experiment(args):
    """
    Performing experiment for the given arguments
    """
    dataset, data = load_data(args.dataset)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Define Model
    encoder = create_encoder(args.model, dataset.num_features,
                             args.latent_dim).to(device)
    decoder = create_decoder(args.decoder).to(device)

    if args.model == 'GAE':
        model = GAE(encoder=encoder, decoder=decoder).to(device)
    else:
        model = VGAE(encoder=encoder, decoder=decoder).to(device)

    # Split edges of a torch_geometric.data.Data object into positive/negative train/val/test edges
    # default ratios of positive edges: val_ratio=0.05, test_ratio=0.1
    print("Data.edge_index.size", data.edge_index.size(1))
    data = model.split_edges(data)
    node_features, train_pos_edge_index = data.x.to(
        device), data.train_pos_edge_index.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    def train_epoch():
        """
        Performing training over a single epoch and optimize over loss
        :return: log - loss of training loss
        """
        # Todo: Add logging of results

        model.train()
        optimizer.zero_grad()
        # Compute latent embedding Z
        latent_embeddings = model.encode(node_features, train_pos_edge_index)

        # Calculate reconstruction loss (plus a KL term for VGAE below)
        loss = model.recon_loss(latent_embeddings, train_pos_edge_index)
        if args.model in ['VGAE']:
            loss = loss + (1 / data.num_nodes) * model.kl_loss()

        # Compute gradients
        loss.backward()
        # Perform optimization step
        optimizer.step()

        # print("Train-Epoch: {} Loss: {}".format(epoch, loss))

        # ToDo: Add logging via Tensorboard
        log = {'loss': loss}

        return log

    def test(pos_edge_index, neg_edge_index):
        model.eval()
        with torch.no_grad():
            # compute latent var
            z = model.encode(node_features, train_pos_edge_index)

        # model.test returns (AUC, AP)
        return model.test(z, pos_edge_index, neg_edge_index)

    def test_naive_graph(z, sample_size=1000):

        if args.sample_dense_evaluation:
            graph_type = "sampled"
            z_sample, index_mapping = sample_graph(z, sample_size)
            t = time.time()
            adjacency = model.decoder.forward_all(
                z_sample, sigmoid=(args.decoder == 'dot'))
        else:
            graph_type = "full"
            t = time.time()
            adjacency = model.decoder.forward_all(
                z, sigmoid=(args.decoder == 'dot'))

        print(f"Computing {graph_type} graph took {time.time() - t} seconds.")
        print(
            f"Adjacency matrix takes {adjacency.element_size() * adjacency.nelement() / 10 ** 6} MB of memory."
        )

        if args.min_sim_absolute_value is None:
            args.min_sim_absolute_value, _ = sample_percentile(
                args.min_sim,
                adjacency,
                dist_measure=args.decoder,
                sample_size=sample_size)

        if args.sample_dense_evaluation:
            precision, recall = sampled_dense_precision_recall(
                data, adjacency, index_mapping, args.min_sim_absolute_value)
        else:
            precision, recall = dense_precision_recall(
                data, adjacency, args.min_sim_absolute_value)

        print("Predicted {} adjacency matrix has precision {} and recall {}!".
              format(graph_type, precision, recall))

        return precision, recall

    def sample_graph(z, sample_size):
        N, D = z.shape

        sample_size = min(sample_size, N)
        sample_ix = np.random.choice(np.arange(N),
                                     size=sample_size,
                                     replace=False)

        # Returns the sampled embeddings, and a mapping from their indices to the originals
        return z[sample_ix], {i: sample_ix[i] for i in np.arange(sample_size)}

    def test_compare_lsh_naive_graphs(z, assure_correctness=True):
        """

        :param z:
        :param assure_correctness:
        :return:
        """
        # Naive Adjacency-Matrix (Non-LSH-Version)
        t = time.time()
        # Don't use sigmoid in order to directly compare thresholds with LSH
        naive_adjacency = model.decoder.forward_all(
            z, sigmoid=(args.decoder == 'dot'))
        naive_time = time.time() - t
        naive_size = naive_adjacency.element_size() * naive_adjacency.nelement(
        ) / 10**6

        if args.min_sim_absolute_value is None:
            args.min_sim_absolute_value, _ = sample_percentile(
                args.min_sim, z, dist_measure=args.decoder)

        print(
            "______________________________Naive Graph Computation KPI____________________________________________"
        )
        print(f"Computing naive graph took {naive_time} seconds.")
        print(f"Naive adjacency matrix takes {naive_size} MB of memory.")

        # LSH-Adjacency-Matrix:
        t = time.time()
        lsh_adjacency = LSHDecoder(bands=args.lsh_bands,
                                   rows=args.lsh_rows,
                                   verbose=True,
                                   assure_correctness=assure_correctness,
                                   sim_thresh=args.min_sim_absolute_value)(z)
        lsh_time = time.time() - t
        lsh_size = lsh_adjacency.element_size() * lsh_adjacency._nnz() / 10**6

        print(
            "__________________________________LSH Graph Computation KPI__________________________________________"
        )
        print(f"Computing LSH graph took {lsh_time} seconds.")
        print(f"Sparse adjacency matrix takes {lsh_size} MB of memory.")

        print(
            "________________________________________Precision-Recall_____________________________________________"
        )
        # 1) Evaluation: Both Adjacency matrices against ground truth graph
        naive_precision, naive_recall = dense_precision_recall(
            data, naive_adjacency, args.min_sim_absolute_value)

        lsh_precision, lsh_recall = sparse_precision_recall(
            data, lsh_adjacency)

        print(
            f"Naive-Precision {naive_precision}; Naive-Recall {naive_recall}")
        print(f"LSH-Precision {lsh_precision}; LSH-Recall {lsh_recall}")

        print(
            "_____________________________Comparison Sparse vs Dense______________________________________________"
        )
        # 2) Evaluation: Compare both adjacency matrices against each other
        compare_precision, compare_recall = sparse_v_dense_precision_recall(
            naive_adjacency, lsh_adjacency, args.min_sim_absolute_value)
        print(
            f"LSH sparse matrix has {compare_precision} precision and {compare_recall} recall w.r.t. the naively generated dense matrix!"
        )

        return naive_precision, naive_recall, naive_time, naive_size, lsh_precision, lsh_recall, lsh_time, lsh_size, compare_precision, compare_recall

    # Training routine
    early_stopping = EarlyStopping(args.use_early_stopping,
                                   patience=args.early_stopping_patience,
                                   verbose=True)

    logs = []

    if args.load_model and os.path.isfile("checkpoint.pt"):
        print("Loading model from savefile...")
        model.load_state_dict(torch.load("checkpoint.pt"))

    if not (args.load_model and args.early_stopping_patience == 0):
        for epoch in range(1, args.epochs):
            log = train_epoch()
            logs.append(log)

            # Validation metrics
            val_auc, val_ap = test(data.val_pos_edge_index,
                                   data.val_neg_edge_index)
            print('Validation-Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(
                epoch, val_auc, val_ap))

            # Stop training if validation scores have not improved
            early_stopping(val_ap, model)
            if early_stopping.early_stop:
                print("Applying early-stopping")
                break
    else:
        epoch = 0

    # Load best encoder
    print("Load best model for evaluation.")
    model.load_state_dict(torch.load('checkpoint.pt'))
    print(
        "__________________________________________________________________________"
    )
    # Training is finished, calculate test metrics
    test_auc, test_ap = test(data.test_pos_edge_index,
                             data.test_neg_edge_index)
    print('Test Results: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(
        epoch, test_auc, test_ap))

    # Check if early stopping was applied or not - if not: model might not be done with training
    if args.epochs == epoch + 1:
        print("Model might need more epochs - Increase number of Epochs!")

    # Evaluate full graph
    latent_embeddings = model.encode(node_features, train_pos_edge_index)

    # Save embeddings to embeddings folder if flag is set
    if args.save_embeddings:
        embeddings_folder = osp.join(osp.dirname(osp.abspath(__file__)),
                                     'embeddings')
        if not osp.isdir(embeddings_folder):
            os.makedirs(embeddings_folder)

        torch.save(
            latent_embeddings,
            osp.join(embeddings_folder,
                     args.dataset + "_" + args.decoder + ".pt"))

    if not args.lsh:
        # Compute precision recall w.r.t the ground truth graph
        graph_precision, graph_recall = test_naive_graph(latent_embeddings)
        del model
        del encoder
        del decoder
        torch.cuda.empty_cache()
    else:
        # Precision w.r.t. the generated graph
        naive_precision, naive_recall, naive_time, naive_size, lsh_precision, \
        lsh_recall, lsh_time, lsh_size, \
        compare_precision, compare_recall = test_compare_lsh_naive_graphs(
            latent_embeddings)

        del model
        del encoder
        del decoder
        torch.cuda.empty_cache()

        return {
            'args': args,
            'test_auc': test_auc,
            'test_ap': test_ap,
            'naive_precision': naive_precision,
            'naive_recall': naive_recall,
            'naive_time': naive_time,
            'naive_size': naive_size,
            'lsh_precision': lsh_precision,
            'lsh_recall': lsh_recall,
            'lsh_time': lsh_time,
            'lsh_size': lsh_size,
            'compare_precision': compare_precision,
            'compare_recall': compare_recall
        }
Example 15
def run_model(dataset, conf):
    # ## 1) Build Table graph
    # ### Tables tokenization
    tokenized_tables, vocabulary, cell_dict, reversed_dictionary = corpus_tuple = create_corpus(
        dataset, include_attr=conf["add_attr"])
    if conf["shuffle_vocab"]:
        shuffled_vocab = shuffle_vocabulary(vocabulary)
    else:
        shuffled_vocab = None

    nodes = build_node_features(vocabulary)
    row_edges_index, row_edges_weights = build_graph_edges(
        tokenized_tables,
        s_vocab=shuffled_vocab,
        sample_frac=conf["row_edges_sample"],
        columns=False)
    col_edges_index, col_edges_weights = build_graph_edges(
        tokenized_tables,
        s_vocab=shuffled_vocab,
        sample_frac=conf["column_edges_sample"],
        columns=True)

    edges = torch.cat((row_edges_index, col_edges_index), dim=1)
    weights = torch.cat((row_edges_weights, col_edges_weights), dim=0)
    graph_data = Data(x=nodes, edge_index=edges, edge_attr=weights)

    # ## 2 ) Run Table Auto-Encoder Model:
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    loader = DataLoader(torch.arange(graph_data.num_nodes),
                        batch_size=128,
                        shuffle=True)
    graph_data = graph_data.to(device)

    x, train_pos_edge_index = graph_data.x, graph_data.edge_index  # use the tensors already moved to `device`

    class Encoder(torch.nn.Module):
        def __init__(self, in_channels, out_channels):
            super(Encoder, self).__init__()
            self.conv1 = GCNConv(in_channels, 2 * out_channels, cached=True)
            self.conv_mu = GCNConv(2 * out_channels, out_channels, cached=True)
            self.conv_logvar = GCNConv(2 * out_channels,
                                       out_channels,
                                       cached=True)

        def forward(self, x, edge_index):
            x = F.relu(self.conv1(x, edge_index))
            return self.conv_mu(x, edge_index), self.conv_logvar(x, edge_index)

    channels = conf["vector_size"]
    enc = Encoder(graph_data.num_features, channels)
    model = GAE(enc)
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    def train(model, optimizer, x, train_pos_edge_index):
        model.train()
        optimizer.zero_grad()
        z = model.encode(x, train_pos_edge_index)
        loss = model.recon_loss(z, train_pos_edge_index)
        #loss = model.kl_loss()

        loss.backward()
        optimizer.step()
        return loss

    losses = []
    for epoch in range(conf["epoch_num"]):
        loss = train(model, optimizer, x, train_pos_edge_index)
        losses.append(loss.item())
        print(epoch, loss)
    # ### 3) Extract the latent cell vectors, generate table vectors:
    def get_cell_vectors(model, x, train_pos_edge_index):
        model.eval()
        with torch.no_grad():
            z = model.encode(x, train_pos_edge_index)
            cell_vectors = z.cpu().numpy()
        return z, cell_vectors

    z, cell_vectors = get_cell_vectors(model, x, train_pos_edge_index)

    vec_list = generate_table_vectors(cell_vectors,
                                      tokenized_tables,
                                      s_vocab=shuffled_vocab)

    # ## 3) Evaluate the model
    result_score = evaluate_model(dataset, vec_list, k=5)
    return cell_vectors, vec_list, losses, result_score
Example 16
        self.conv2 = GCNConv(2 * out_channels, out_channels, cached=False)
        # Map into 2*out_channels dimensions

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        return self.conv2(x, edge_index)


#This is the size of the latent embedding
channels = 32
# We have 75 original features
num_features = 75
dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#dev = torch.device('cpu')

model = GAE(Encoder(num_features, channels).to(dev))
#data.train_mask = data.val_mask = data.test_mask = data.y = None
#data = model.split_edges(data)
#x, train_edge_index = data.x.to(dev), data.edge_index.to(dev)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)


def train(loader):
    model.train()
    loss_all = 0
    for data in loader:
        x, train_edge_index = data.x.to(dev), data.edge_index.to(dev)
        optimizer.zero_grad()
        z = model.encode(x, train_edge_index)
        loss = model.recon_loss(z, train_edge_index)
        loss.backward()
Example 17
act = F.relu
sum_res = True
variational = False

path = Path(__file__).parent / "../../test/data/BBA-subset-100.h5"
node_feature_path = (
    Path(__file__).parent / "../../test/data/onehot_bba_amino_acid_labels.npy"
)
dataset = ContactMapDataset(
    path, "contact_map", ["rmsd"], node_feature_path=node_feature_path
)
loader = DataLoader(dataset, batch_size=1, shuffle=True)

# Select node AE
if args.linear:
    node_ae = GAE(LinearEncoder(num_features, node_out_channels))
else:
    node_ae = GAE(GCNEncoder(num_features, node_out_channels))

# Select graph AE
encoder = VariationalGraphEncoder(
    node_out_channels,
    hidden_channels,
    graph_out_channels,
    depth,
    pool_ratios,
    act,
    variational,
)
decoder = VariationalGraphDecoder(
    graph_out_channels,
Example 18
    Path(__file__).parent / "../../test/data/onehot_bba_amino_acid_labels.npy"
)
dataset = ContactMapDataset(
    path, "contact_map", ["rmsd"], node_feature_path=node_feature_path
)
data = dataset[0]["X"]
loader = DataLoader(dataset, batch_size=1, shuffle=True)

# Parameters
out_channels = 10
num_features = 13

# Model
if not args.variational:
    if not args.linear:
        model = GAE(GCNEncoder(num_features, out_channels))
    else:
        model = GAE(LinearEncoder(num_features, out_channels))
else:
    if args.linear:
        model = VGAE(VariationalLinearEncoder(num_features, out_channels))
    else:
        model = VGAE(VariationalGCNEncoder(num_features, out_channels))

# Hardware
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model, data = model.to(device), data.to(device)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
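
# A minimal training-step sketch for the setup above (not from the original
# code); it assumes `data` is a torch_geometric Data object with `x` and
# `edge_index`, and adds the KL term only for the variational models.
def train():
    model.train()
    optimizer.zero_grad()
    z = model.encode(data.x, data.edge_index)
    loss = model.recon_loss(z, data.edge_index)
    if args.variational:
        loss = loss + (1 / data.num_nodes) * model.kl_loss()
    loss.backward()
    optimizer.step()
    return float(loss)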
Example 19
    dataset = Planetoid(root='tmp', name='PubMed')
    print("use dataset: PubMed")
data = dataset[0]

enhanced_data = train_test_split_edges(data.clone(),
                                       val_ratio=0.1,
                                       test_ratio=0.2)

train_data = Data(x=enhanced_data.x,
                  edge_index=enhanced_data['train_pos_edge_index']).to(DEVICE)
target_data = data.to(DEVICE)

if args.model == 'VGAE':
    model = VGAE(encoder=VEncoder(data['x'].shape[1])).to(DEVICE)
else:
    model = GAE(encoder=Encoder(data['x'].shape[1])).to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(),
                             lr=args.learning_rate,
                             weight_decay=5e-4)


def model_train():
    print("========Start training========")
    for epoch in range(args.num_epoch):
        model.train()
        optimizer.zero_grad()
        z = model.encode(train_data)
        recon_loss = model.recon_loss(z, target_data['edge_index'])
        if args.model == 'VGAE':
            recon_loss += model.kl_loss() / data['x'].shape[0]
Example 20
    parser.add_argument('--dataset')
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--val-freq', type=int, default=20)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--test', action='store_true', default=False)
    args = parser.parse_args()

    if args.dataset in ['cora', 'citeseer', 'pubmed']:
        path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '.',
                            'data', args.dataset)
        data = Planetoid(path, args.dataset)[0]
    else:
        data = load_wiki.load_data()

    data.edge_index = gutils.to_undirected(data.edge_index)
    data = GAE.split_edges(GAE, data)

    num_features = data.x.shape[1]
    aucs = []
    aps = []
    for run in range(args.runs):
        model = VGAE(VGAE_Encoder(num_features))
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

        # Training loop
        for epoch in range(args.epochs):
            model.train()
            optimizer.zero_grad()
            z = model.encode(data.x, data.train_pos_edge_index)
            loss = model.recon_loss(
                z, data.train_pos_edge_index)  #0.01*model.kl_loss()
Example 21
                    help='Residual connection')
args = parser.parse_args()

#download datasets
path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'data', args.dataset)
dataset = Planetoid(path, args.dataset)

dev = torch.device(args.dev)

if args.model == 'VGAE':
    model = VGAE(
        Encoder_VGAE(dataset.num_features, args.hidden1, args.hidden2,
                     args.depth, args.res)).to(dev)
else:
    model = GAE(
        Encoder_GAE(dataset.num_features, args.hidden1, args.hidden2,
                    args.depth, args.res)).to(dev)

auc_score_list = []
ap_score_list = []

print("Dataset: ", args.dataset, " Model: ", args.model, ", Residual :",
      args.res, ", Layer depth:", args.depth, " ")

for i in range(1, args.runs + 1):
    data = dataset[0]
    data.train_mask = data.val_mask = data.test_mask = data.y = None
    data = train_test_split_edges(data)

    x, train_pos_edge_index = data.x.to(dev), data.train_pos_edge_index.to(dev)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
Example 22
def run_GAE(input_data, output_dir, epochs=1000, lr=0.01, weight_decay=0.0005):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('Device: '.ljust(32), device)
    print('Model Name: '.ljust(32), 'GAE')
    print('Model params:{:19} lr: {}   weight_decay: {}'.format(
        '', lr, weight_decay))
    print('Total number of epochs to run: '.ljust(32), epochs)
    print('*' * 70)

    data = input_data.clone().to(device)
    in_channels = data.num_features
    out_channels = data.num_classes.item()
    model = GAE(GAEncoder(in_channels, out_channels)).to(device)
    data = input_data.clone().to(device)
    split_data = model.split_edges(data)
    x, train_pos_edge_index, edge_attr = split_data.x.to(
        device), split_data.train_pos_edge_index.to(device), data.edge_attr.to(
            device)
    split_data.train_idx = split_data.test_idx = data.y = None
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=lr,
                                 weight_decay=weight_decay)
    train_losses, test_losses = [], []
    aucs = []
    aps = []
    for epoch in range(1, epochs + 1):
        model.train()  # re-enable training mode (model.eval() is called below for testing)
        train_loss = 0
        test_loss = 0
        optimizer.zero_grad()
        z = model.encode(x, train_pos_edge_index)
        train_loss = model.recon_loss(z, train_pos_edge_index)
        train_losses.append(train_loss.item())
        train_loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            z = model.encode(x, train_pos_edge_index)
        auc, ap = model.test(z, split_data.test_pos_edge_index,
                             split_data.test_neg_edge_index)
        test_loss = model.recon_loss(z, data.test_pos_edge_index)
        test_losses.append(test_loss.item())
        aucs.append(auc)
        aps.append(ap)

        figname = os.path.join(
            output_dir, "_".join((GAE.__name__, str(lr), str(weight_decay))))
        makepath(output_dir)

        if (epoch % int(epochs / 10) == 0):
            print(
                'Epoch: {}       Train loss: {}    Test loss: {}     AUC: {}    AP: {}'
                .format(epoch, train_loss, test_loss, auc, ap))
        if (epoch == epochs):
            print(
                '-' * 65,
                '\nFinal epoch: {}    Train loss: {}    Test loss: {}    AUC: {}    AP: {}'
                .format(epoch, train_loss, test_loss, auc, ap))
        log = 'Final epoch: {}    Train loss: {}    Test loss: {}    AUC: {}    AP: {}'.format(
            epoch, train_loss, test_loss, auc, ap)
        write_log(log, figname)
    print('-' * 65)

    plot_linkpred(train_losses, test_losses, aucs, aps, output_dir, epochs,
                  figname)
    return
Example 23
        super().__init__()
        self.rel_emb = Parameter(torch.Tensor(num_relations, hidden_channels))
        self.reset_parameters()

    def reset_parameters(self):
        torch.nn.init.xavier_uniform_(self.rel_emb)

    def forward(self, z, edge_index, edge_type):
        z_src, z_dst = z[edge_index[0]], z[edge_index[1]]
        rel = self.rel_emb[edge_type]
        return torch.sum(z_src * rel * z_dst, dim=1)


model = GAE(
    RGCNEncoder(data.num_nodes, hidden_channels=500,
                num_relations=dataset.num_relations),
    DistMultDecoder(dataset.num_relations // 2, hidden_channels=500),
)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


def negative_sampling(edge_index, num_nodes):
    # Sample edges by corrupting either the subject or the object of each edge.
    mask_1 = torch.rand(edge_index.size(1)) < 0.5
    mask_2 = ~mask_1

    neg_edge_index = edge_index.clone()
    neg_edge_index[0, mask_1] = torch.randint(num_nodes, (mask_1.sum(), ))
    neg_edge_index[1, mask_2] = torch.randint(num_nodes, (mask_2.sum(), ))
    return neg_edge_index
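
# A hedged training-step sketch tying the pieces above together, patterned
# after PyG's RGCN link-prediction example; it assumes the encoder takes
# (edge_index, edge_type) and that `data.train_edge_index` and
# `data.train_edge_type` exist -- both are assumptions, not shown above.
import torch.nn.functional as F

def train():
    model.train()
    optimizer.zero_grad()
    z = model.encode(data.edge_index, data.edge_type)
    pos_out = model.decode(z, data.train_edge_index, data.train_edge_type)
    neg_edge_index = negative_sampling(data.train_edge_index, data.num_nodes)
    neg_out = model.decode(z, neg_edge_index, data.train_edge_type)
    out = torch.cat([pos_out, neg_out])
    gt = torch.cat([torch.ones_like(pos_out), torch.zeros_like(neg_out)])
    loss = F.binary_cross_entropy_with_logits(out, gt)
    loss.backward()
    optimizer.step()
    return float(loss)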
Example 24
def test(pos_edge_index, neg_edge_index):
    model.eval()
    with torch.no_grad():
        for _, _, adjs in train_loader:
            adjs = [adj.to(dev) for adj in adjs]
            z = model.encode(x, adjs)
    return model.test(z, pos_edge_index, neg_edge_index)


if __name__ == "__main__":

    dataset = RetweetDataset(root='./', transform=T.NormalizeFeatures())
    latent_dim = 32
    dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = GAE(Encoder(dataset.num_features, latent_dim)).to(dev)
    data = dataset[0]
    data.train_mask = data.val_mask = data.test_mask = data.y = None
    data = train_test_split_edges(data)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    train_loader = NeighborSampler(data.train_pos_edge_index,
                                   node_idx=None,
                                   sizes=[25, 10],
                                   num_nodes=data.num_nodes,
                                   batch_size=data.x.shape[0],
                                   shuffle=True,
                                   num_workers=1)

    x, train_pos_edge_index = data.x.to(dev), data.train_pos_edge_index.to(dev)
Example 25
class UnsGAE(object):

    def __init__(self, data, embed_dim, **kwargs):
        super(UnsGAE, self).__init__()
        self.data = data
        self.input_dim = self.data.dim
        self.embed_dim = embed_dim

        # for now, we only work with 2-layer encoders
        self.hidden_dim = kwargs.get('hidden_dim', 2*embed_dim)
        self.encoder = kwargs.get('encoder', batched_SAGEEncoder)
        self.encoder = self.encoder(self.input_dim,
                                    self.hidden_dim,
                                    self.embed_dim)
        self.model = GAE(self.encoder)

        # preparing the device 
        device = kwargs.get('device', 'cuda')
        if device == 'cuda' and not torch.cuda.is_available():
            print('CUDA is not available in PyTorch; the model '
                  'will be initialized on CPU.')
            device = 'cpu'
        self.device = torch.device(device)

        
    def init_model(self, sizes, weights_path=None):
        self.model = self.model.to(self.device)
        
        # sizes are directly used for initializing the model
        # but it will be used for every feed-forward as the
        # sampling size of the neighbors
        assert len(sizes)==self.model.encoder.num_layers, \
            'Number of sizes should be equal to the number of layers in the encoder.'
        self.sizes = sizes
        if not(hasattr(self.data, 'loader')):
            self.data.get_neighbor_sampler(self.sizes)

        if weights_path is not None:
            self.model.load_state_dict(torch.load(weights_path, map_location=self.device))
            
        
    def init_training(self, neg_num, optim='Adam', lr=1e-5, smooth_par=0.75, **kwargs):
        if optim=='Adam':
            self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)
        elif optim=='SGD':
            self.optimizer = torch.optim.SGD(self.model.parameters(), lr=lr)
        self.train_one_epoch = self._train_edge_batching
        self.neg_num = neg_num

        if not(hasattr(self.data, 'pos_pairs')):
            assert 'pos_samples_path' in kwargs, 'The provided data does ' +\
                'not come with positive pairs, and we need a path to the ' +\
                'already selected positive samples. You can provide it through ' +\
                'input pos_samples_path .'
            include_nodes = kwargs.get('include_nodes', None)
            self.data.load_positive_pairs(kwargs['pos_samples_path'], include_nodes)
            
        if not(hasattr(self.data, 'neg_sampler')):
            #smooth_par = kwargs.get('smooth_par', 0.75)
            self.data.get_negative_sampler(smooth_par)

        if not(hasattr(self.data, 'x_all')):
            self.data._fetch_node_features()

        
    def init_validation(self):
        if not(hasattr(self.data, 'x_all')):
            self.data._fetch_node_features()
        

    def embed_some(self, sample_inds, b=100):
        """This will be used in the training, when the
        embedding of a batch of samples are needed
        """

        quot, rem = np.divmod(len(sample_inds), b)

        Z = []
        # only run the final partial batch when there is a remainder
        for i in range(quot + (1 if rem > 0 else 0)):
            if i < quot:
                b_ids = sample_inds[i*b:(i+1)*b]
            else:
                b_ids = sample_inds[i*b:]
                
            # neighbor-sampling for each sample
            _, n_id, adjs = self.data.train_loader.sample(b_ids)
            adjs = [adj.to(self.device) for adj in adjs]

            # get feature vectors through the neighbors sampled above
            batch_X = torch.from_numpy(self.data.get_node_features(n_id))
            batch_X = batch_X.to(torch.float).to(self.device)

            # the encoder's output as the embedding
            try:
                batch_Z = self.model.encoder(batch_X, adjs)
            except:
                pdb.set_trace()
            Z += [batch_Z]

        Z = torch.cat(Z, dim=0)
        return Z

    def embed_all(self):

        L = self.model.encoder.num_layers
        pbar = tqdm(total=self.data.n_x * L, position=0, leave=True)
        pbar.set_description('Evaluating')
        
        self.model.encoder.eval()
        # Inference is used in the evaluation stage (not in training), when
        # the embeddings for "all" nodes are computed. It is written in a way
        # that is faster than the forward-passing function, which is mostly
        # used for single batches during training.
        with torch.no_grad():
            for i in range(L):
                xs = []
                for batch_size, n_id, adj in self.data.test_loader:
                    edge_index, _, size = adj.to(self.device)
                    if i==0:
                        x = torch.from_numpy(self.data.get_node_features(n_id))
                        x = x.to(torch.float).to(self.device)
                    else:
                        x = x_all[n_id,:].to(self.device)

                    x_target = x[:size[1]]
                    x = self.model.encoder.convs[i]((x,x_target), edge_index)
                    if i != L-1:
                        x = F.relu(x)

                    xs.append(x[:batch_size,:].cpu())
                    
                    pbar.update(batch_size)

                x_all = torch.cat(xs, dim=0)
                
        pbar.close()
        
        return x_all
    
            
    def _train_edge_batching(self, ep, batch_size=5000):

        assert hasattr(self.data, 'pos_pairs'), 'Positive and negative ' + \
            'samples must be generated before starting the training'
        
        self.model.train()
        neg_num = self.neg_num

        torch.multiprocessing.set_sharing_strategy('file_system')
        pbar = tqdm(total=self.data.pos_pairs.shape[1], position=0, leave=True)
        pbar.set_description(f'Epoch {ep:02d}')
        
        total_loss = 0
        np.random.shuffle(self.data.pos_pairs.T)
        quot, rem = np.divmod(self.data.pos_pairs.shape[1], batch_size)

        for i in range(quot+1):

            # positive mini-batch
            # (#: batch size)
            if i<quot:
                batch_pos_pairs = self.data.pos_pairs[:,i*batch_size:(i+1)*batch_size]
            else:
                batch_pos_pairs = self.data.pos_pairs[:,i*batch_size:]
            batch_pos_samples, pos_edge_index = np.unique(batch_pos_pairs,
                                                          return_inverse=True)
            pos_edge_index = pos_edge_index.reshape(batch_pos_pairs.shape)

            # negative mini-batch
            # (#: batch_size * neg_num)
            batch_neg_samples = self.data.neg_sampler.sample(
                torch.Size([neg_num*batch_pos_pairs.shape[1]]))
            neg_edge_index = np.array([np.repeat(pos_edge_index[0,:],neg_num),
                                       np.arange(pos_edge_index.max()+1,
                                                 pos_edge_index.max()+len(batch_neg_samples)+1)])

            # embeddings of the nodes involved in + and - edges
            self.optimizer.zero_grad()
            unodes = batch_pos_samples.tolist() + batch_neg_samples.tolist()
            Z = self.embed_some(unodes)

            # reconstruction loss
            pos_edge_index = torch.from_numpy(pos_edge_index).to(self.device)
            neg_edge_index = torch.from_numpy(neg_edge_index).to(self.device)
            loss = self.model.recon_loss(Z, pos_edge_index, neg_edge_index)
            loss.backward()
            self.optimizer.step()

            total_loss += float(loss)

            pbar.update(batch_size)

        pbar.close()

        loss = total_loss / (quot+1)
        return loss


    def validate(self):

        self.model.eval()

        Z = self.embed_all()
        ents_Z = Z[:-1,:][self.data.selected_inds[:-1]>=self.data.nA,:].detach().numpy()
        prop_Z = Z[self.data.tags=='prop',:].detach().numpy()
        scores = np.dot(ents_Z, prop_Z.T).squeeze()
        
        sorted_ents = self.data.selected_ents[np.argsort(-scores)]
        unstudied_sorted_ents = np.array([x for x in sorted_ents
                                           if x not in self.data.studied_ents])
        preds = unstudied_sorted_ents[:50]

        prec = np.isin(preds,self.data.GT).sum() / len(preds)

        return prec
Example 26
def perturb_edges(data,
                  name,
                  remove_pct,
                  add_pct,
                  hidden_channels=16,
                  epochs=400):
    if remove_pct == 0 and add_pct == 0:
        return
    try:
        cached = pickle.load(
            open(f'{ROOT}/cache/edge/{name}_{remove_pct}_{add_pct}.pt', 'rb'))
        print(f'Use cached edge augmentation for dataset {name}')

        if data.setting == 'inductive':
            data.train_edge_index = cached
        else:
            data.edge_index = cached
        return
    except FileNotFoundError:
        try:
            A_pred, adj_orig = pickle.load(
                open(f'{ROOT}/cache/edge/{name}.pt', 'rb'))
            A = sample_graph_det(adj_orig, A_pred, remove_pct, add_pct)
            data.edge_index, _ = from_scipy_sparse_matrix(A)
            pickle.dump(
                data.edge_index,
                open(f'{ROOT}/cache/edge/{name}_{remove_pct}_{add_pct}.pt',
                     'wb'))
            return
        except FileNotFoundError:
            print(
                f'cache/edge/{name}_{remove_pct}_{add_pct}.pt not found! Regenerating it now'
            )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    if data.setting == 'inductive':
        train_data = Data(x=data.train_x,
                          ori_x=data.ori_x,
                          edge_index=data.train_edge_index,
                          y=data.train_y)
    else:
        train_data = deepcopy(data)

    edge_index = deepcopy(train_data.edge_index)
    train_data = train_test_split_edges(train_data,
                                        val_ratio=0.1,
                                        test_ratio=0)
    num_features = train_data.ori_x.shape[1]
    model = GAE(GCNEncoder(num_features, hidden_channels))
    model = model.to(device)
    x = train_data.ori_x.to(device)
    train_pos_edge_index = train_data.train_pos_edge_index.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    best_val_auc = 0
    best_z = None
    for epoch in range(1, epochs + 1):
        model.train()
        optimizer.zero_grad()
        z = model.encode(x, train_pos_edge_index)
        loss = model.recon_loss(z, train_pos_edge_index)
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            z = model.encode(x, train_pos_edge_index)

        auc, ap = model.test(z, train_data.val_pos_edge_index,
                             train_data.val_neg_edge_index)
        print('Val | Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(
            epoch, auc, ap))
        if auc > best_val_auc:
            best_val_auc = auc
            best_z = deepcopy(z)

    # Use the embeddings from the best validation epoch rather than the last one
    A_pred = torch.sigmoid(torch.mm(best_z, best_z.T)).cpu().numpy()

    adj_orig = to_scipy_sparse_matrix(edge_index).asformat('csr')
    adj_pred = sample_graph_det(adj_orig, A_pred, remove_pct, add_pct)

    if data.setting == 'inductive':
        data.train_edge_index, _ = from_scipy_sparse_matrix(adj_pred)
    else:
        data.edge_index, _ = from_scipy_sparse_matrix(adj_pred)

    pickle.dump((A_pred, adj_orig), open(f'{ROOT}/cache/edge/{name}.pt', 'wb'))

    if data.setting == 'inductive':
        pickle.dump(
            data.train_edge_index,
            open(f'{ROOT}/cache/edge/{name}_{remove_pct}_{add_pct}.pt', 'wb'))
    else:
        pickle.dump(
            data.edge_index,
            open(f'{ROOT}/cache/edge/{name}_{remove_pct}_{add_pct}.pt', 'wb'))

Example 27
class VariationalLinearEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv_mu = GCNConv(in_channels, out_channels)
        self.conv_logstd = GCNConv(in_channels, out_channels)

    def forward(self, x, edge_index):
        return self.conv_mu(x, edge_index), self.conv_logstd(x, edge_index)


in_channels, out_channels = dataset.num_features, 16

if not args.variational and not args.linear:
    model = GAE(GCNEncoder(in_channels, out_channels))
elif not args.variational and args.linear:
    model = GAE(LinearEncoder(in_channels, out_channels))
elif args.variational and not args.linear:
    model = VGAE(VariationalGCNEncoder(in_channels, out_channels))
elif args.variational and args.linear:
    model = VGAE(VariationalLinearEncoder(in_channels, out_channels))

model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


def train():
    model.train()
    optimizer.zero_grad()
    z = model.encode(train_data.x, train_data.edge_index)
Example 28
        x = self.conv1(x, edge_index).relu()
        return self.conv_mu(x, edge_index), self.conv_logstd(x, edge_index)


if __name__ == "__main__":
    filePath = sys.argv[1] if len(sys.argv) > 1 else '../wholeYear/'  # fall back to the default path when no argument is given
    dataset = WholeYearDataset(filePath)
    d = dataset[0]

    train_test_split_edges(d)

    #parameters
    out_channels = 2
    num_features = d.num_features

    model_gae1 = GAE(GCNEncoder(num_features, out_channels))
    areasUnderCurve_gae_weekday, precisions_gae_weekday, losses_gae_weekday = runAutoencoder(
        model_gae1, d, 1000, torch.optim.Adam, 0.001)
    plotAUC_AP_Loss(areasUnderCurve_gae_weekday, precisions_gae_weekday,
                    losses_gae_weekday, 1000, "GAE 1: 2 Convolutions")

    model2 = GAE(GCNEncoder2(num_features, out_channels))
    areasUnderCurve_gae_weekday_model2, precisions_gae_weekday_model2, losses_gae_weekday_model2 = runAutoencoder(
        model2, d, 1000, torch.optim.Adam, 0.001)
    plotAUC_AP_Loss(areasUnderCurve_gae_weekday_model2,
                    precisions_gae_weekday_model2, losses_gae_weekday_model2,
                    1000, "GAE 2: 2 Convolutions 1 Linear")

    modelVgae = VGAE(VariationalGCNEncoder(num_features, out_channels))
    runVariational1 = runVariational(modelVgae, d, 1000, torch.optim.Adam,
                                     0.001)