Example #1
0
def train_model_and_save_embeddings(dataset, data, epochs, learning_rate,
                                    device, embeddings_path="large_emb.pt"):
    """Train a VGAE on sampled sub-graphs and snapshot full-graph embeddings.

    Each epoch iterates over the neighbour sampler, optimising the
    reconstruction loss plus a KL term scaled by ``1 / len(block.n_id)``.
    After every epoch the embeddings of all nodes are recomputed on the full
    edge index and written to ``embeddings_path``, overwriting the previous
    snapshot.

    Args:
        dataset: Object exposing ``num_nodes``.
        data: Graph object exposing ``x`` and ``edge_index``; it is also
            handed to ``NeighborSampler``.
        epochs: Number of passes over the neighbour sampler.
        learning_rate: Learning rate passed to Adam.
        device: Device the model and tensors are moved to.
        embeddings_path: File the per-epoch embedding snapshot is saved to.
            Defaults to the previously hard-coded ``"large_emb.pt"``.

    Returns:
        The trained ``VGAE`` model, left in training mode.

    Raises:
        AssertionError: If ``data.edge_index`` references a node id that is
            not smaller than ``dataset.num_nodes``.
    """
    # Define Model
    encoder = EmbeddingEncoder(emb_dim=200,
                               out_channels=64,
                               n_nodes=dataset.num_nodes).to(device)
    decoder = CosineSimDecoder().to(device)
    model = VGAE(encoder=encoder, decoder=decoder).to(device)

    node_features = data.x.to(device)
    train_pos_edge_index = data.edge_index.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    assert data.edge_index.max().item() < dataset.num_nodes, (
        "edge_index references a node id >= dataset.num_nodes")

    data_loader = NeighborSampler(data,
                                  size=[25, 10],
                                  num_hops=2,
                                  batch_size=10000,
                                  shuffle=False,
                                  add_self_loops=False)

    model.train()

    for epoch in tqdm(range(epochs)):
        epoch_loss = 0.0
        for data_flow in tqdm(data_loader()):
            optimizer.zero_grad()

            data_flow = data_flow.to(device)
            # Only the first block of the sampled data flow is used.
            block = data_flow[0]
            embeddings = model.encode(
                node_features[block.n_id], block.edge_index
            )  # TODO Avoid computation of all node features!

            loss = model.recon_loss(embeddings, block.edge_index)
            loss = loss + (1 / len(block.n_id)) * model.kl_loss()

            epoch_loss += loss.item()

            # Compute gradients
            loss.backward()
            # Perform optimization step
            optimizer.step()

        # Snapshot the embeddings in eval mode and without autograd: the
        # snapshot is never back-propagated through, and (with
        # torch_geometric's VGAE) training mode would save a random sample
        # around the mean instead of the mean itself.
        model.eval()
        with torch.no_grad():
            z = model.encode(node_features, train_pos_edge_index)
        model.train()

        torch.save(z.cpu(), embeddings_path)

        print(f"Loss after epoch {epoch} / {epochs}: {epoch_loss}")

    return model
Example #2
0
    if args.dataset in ['cora', 'citeseer', 'pubmed']:
        path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '.',
                            'data', args.dataset)
        data = Planetoid(path, args.dataset)[0]
    else:
        data = load_wiki.load_data()

    data.edge_index = gutils.to_undirected(data.edge_index)
    data = GAE.split_edges(GAE, data)

    num_features = data.x.shape[1]
    aucs = []
    aps = []
    for run in range(args.runs):
        model = VGAE(VGAE_Encoder(num_features))
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

        # Training loop
        for epoch in range(args.epochs):
            model.train()
            optimizer.zero_grad()
            z = model.encode(data.x, data.train_pos_edge_index)
            loss = model.recon_loss(
                z, data.train_pos_edge_index)  #0.01*model.kl_loss()
            loss.backward()
            optimizer.step()

            # Log validation metrics
            if epoch % args.val_freq == 0:
                model.eval()
                with torch.no_grad():
Example #3
0
def main():
    """Train a VGAE per cross-validation fold and write gene rankings.

    For every fold a graph is built from the fold's ``train`` disease-gene
    pairs, the gene-gene network and the disease-disease network (all edges
    get weight 1). A VGAE with one-hot node features is trained for 4000
    epochs, and for every disease that appears in a ``test`` line of the fold
    the 150 highest-scoring genes are written to the fold's prediction file
    as ``disease<TAB>gene<TAB>score`` lines.
    """
    import os  # local import: only needed to create the output directory

    model_name = 'VGAE'
    disease_gene_files = [
        'data/OMIM/3-fold-1.txt', 'data/OMIM/3-fold-2.txt',
        'data/OMIM/3-fold-3.txt'
    ]
    disease_disease_file = 'data/MimMiner/MimMiner.txt'
    gene_gene_file = 'data/HumanNetV2/HumanNet_V2.txt'
    prediction_files = [
        f'data/prediction/{model_name}/prediction-3-fold-1.txt',
        f'data/prediction/{model_name}/prediction-3-fold-2.txt',
        f'data/prediction/{model_name}/prediction-3-fold-3.txt'
    ]
    top_k = 150

    # The loop used to run over `[3]`, which is out of range for the three
    # fold files (valid indices 0..2) and raised IndexError on first use.
    for fold_file, prediction_file in zip(disease_gene_files,
                                          prediction_files):
        g_nx = nx.Graph()
        test_diseases = set()
        # `add_edge` inserts missing endpoints itself (first, then second),
        # so no separate `add_node` calls are needed.
        with open(fold_file, 'r') as f:
            for line in f:
                disease, gene, tag = line.strip().split('\t')
                if tag == 'train':
                    g_nx.add_edge(disease, gene, weight=1)
                elif tag == 'test':
                    test_diseases.add(disease)
        with open(gene_gene_file, 'r') as f:
            for line in f:
                node1, node2 = line.strip().split('\t')
                g_nx.add_edge(node1, node2, weight=1)
        with open(disease_disease_file, 'r') as f:
            for line in f:
                # The third column is parsed but deliberately not used as
                # the edge weight; every edge gets weight 1.
                node1, node2, _weight = line.strip().split('\t')
                g_nx.add_edge(node1, node2, weight=1)
        print('read data success')

        # Map node names to consecutive integer ids in insertion order.
        name_id = {name: idx for idx, name in enumerate(g_nx.nodes())}
        g_nx = nx.relabel_nodes(g_nx, name_id)

        # transform from networkx to pyg data
        g_nx = g_nx.to_directed() if not nx.is_directed(g_nx) else g_nx
        edge_index = torch.tensor(list(g_nx.edges)).t().contiguous()
        data = torch_geometric.data.Data.from_dict(
            {'edge_index': edge_index.view(2, -1)})
        data.num_nodes = g_nx.number_of_nodes()
        # One-hot node features, built directly as float32.
        data.x = torch.eye(data.num_nodes)
        data.train_mask = data.val_mask = data.test_mask = data.y = None
        print(
            f'Graph information:\nNode:{data.num_nodes}\nEdge:{data.num_edges}\nFeature:{data.num_node_features}'
        )

        channels = 128
        dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model = VGAE(Encoder(data.num_node_features, channels)).to(dev)
        x, train_pos_edge_index = data.x.to(dev), data.edge_index.to(dev)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

        for epoch in range(4000):
            model.train()
            optimizer.zero_grad()
            z = model.encode(x, train_pos_edge_index)
            loss = model.recon_loss(
                z,
                train_pos_edge_index) + (1 / data.num_nodes) * model.kl_loss()
            loss.backward()
            optimizer.step()
            nowTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print(f'{nowTime}\tepoch:{epoch}\tloss:{loss}')

        # Score all node pairs in eval mode and without autograd so the
        # ranking is based on deterministic embeddings rather than on a
        # random sample drawn in training mode.
        model.eval()
        with torch.no_grad():
            z = model.encode(x, train_pos_edge_index)
            pred = model.decoder.forward_all(z).cpu().numpy()

        genes = {name for name in name_id if name.startswith('g_')}
        diseases = {name for name in name_id if name.startswith('d_')}

        os.makedirs(os.path.dirname(prediction_file), exist_ok=True)
        with open(prediction_file, 'w') as f:
            for disease in test_diseases:
                if disease not in diseases:
                    # Disease never appeared in the training graph.
                    sims = dict.fromkeys(genes, 0)
                else:
                    scores = pred[name_id[disease]]
                    sims = {
                        gene: float(scores[name_id[gene]])
                        for gene in genes
                    }
                ranked = sorted(sims.items(),
                                key=lambda item: item[1],
                                reverse=True)
                for gene, sim in ranked[:top_k]:
                    f.write(f'{disease}\t{gene}\t{sim}\n')
        Encoder_GAE(dataset.num_features, args.hidden1, args.hidden2,
                    args.depth, args.res)).to(dev)

# Score accumulators; nothing in the visible part of the script appends to
# them (presumably filled further down in the run loop — confirm).
auc_score_list = []
ap_score_list = []

print("Dataset: ", args.dataset, " Model: ", args.model, ", Residual :",
      args.res, ", Layer depth:", args.depth, " ")

# NOTE(review): only the optimizer is re-created per run in the visible code;
# `model` is not rebuilt inside this loop, so later runs would appear to
# continue from the previous run's weights — confirm against the full file.
for i in range(1, args.runs + 1):
    data = dataset[0]
    # Drop masks and labels before the edge split.
    data.train_mask = data.val_mask = data.test_mask = data.y = None
    data = train_test_split_edges(data)

    x, train_pos_edge_index = data.x.to(dev), data.train_pos_edge_index.to(dev)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    # Dense adjacency of the training edges; `[0]` takes the first entry of
    # the `to_dense_adj` result.
    adj_train = train_pos_edge_index
    adj_train_dense = to_dense_adj(adj_train)[0]
    adj_train_dense = adj_train_dense  # no-op self-assignment

    # norm = N*N / (2 * (N*N - sum of adjacency entries)), with N the number
    # of rows of the dense training adjacency.
    norm = adj_train_dense.shape[0] * adj_train_dense.shape[0] / float(
        (adj_train_dense.shape[0] * adj_train_dense.shape[0] -
         adj_train_dense.sum()) * 2)

    z_final = None
    # range(1, 100) yields epochs 1..99, i.e. 99 iterations.
    for epoch in range(1, 100):

        model.train()
        optimizer.zero_grad()
Example #5
0
def run_VGAE(input_data,
             output_dir,
             epochs=1000,
             lr=0.01,
             weight_decay=0.0005):
    """Train a VGAE for link prediction, logging and plotting per-epoch metrics.

    Every epoch performs one optimisation step on the training edges and then
    evaluates reconstruction loss, AUC and AP on the held-out test edges. A
    log line is written via ``write_log`` for every epoch, and the loss/metric
    curves are passed to ``plot_linkpred`` at the end.

    Args:
        input_data: Graph data object; it is cloned, so the caller's object
            is not modified.
        output_dir: Directory handed to ``makepath`` and used as the prefix
            for the log/figure name.
        epochs: Number of training epochs.
        lr: Learning rate for Adam.
        weight_decay: Weight decay for Adam.

    Returns:
        None.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('Device: '.ljust(32), device)
    print('Model Name: '.ljust(32), 'VGAE')
    print('Model params:{:19} lr: {}     weight_decay: {}'.format(
        '', lr, weight_decay))
    print('Total number of epochs to run: '.ljust(32), epochs)
    print('*' * 70)

    data = input_data.clone().to(device)
    model = VGAE(VGAEncoder(data.num_features,
                            data.num_classes.item())).to(device)
    data = model.split_edges(data)
    x = data.x.to(device)
    train_pos_edge_index = data.train_pos_edge_index.to(device)
    data.train_idx = data.test_idx = data.y = None
    # Use the hyper-parameters that are advertised above and encoded in the
    # output name; previously lr was hard-coded and weight_decay was ignored.
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=lr,
                                 weight_decay=weight_decay)
    train_losses = []
    test_losses = []
    aucs = []
    aps = []

    # Output location is loop-invariant; computing it up front also keeps
    # `figname` defined when `epochs` is 0.
    makepath(output_dir)
    figname = os.path.join(
        output_dir, "_".join(
            (VGAE.__name__, str(lr), str(weight_decay), str(epochs))))
    # Report roughly ten times per run; never 0, which would make the modulo
    # below raise ZeroDivisionError for epochs < 10.
    report_every = max(1, epochs // 10)

    for epoch in range(1, epochs + 1):
        # Re-enter training mode every epoch: the evaluation below switches
        # the model to eval mode, which used to stick for all later epochs.
        model.train()
        optimizer.zero_grad()
        z = model.encode(x, train_pos_edge_index)
        train_loss = model.recon_loss(
            z, train_pos_edge_index) + (1 / data.num_nodes) * model.kl_loss()
        train_losses.append(train_loss.item())
        train_loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            z = model.encode(x, train_pos_edge_index)
            auc, ap = model.test(z, data.test_pos_edge_index,
                                 data.test_neg_edge_index)
            test_loss = model.recon_loss(
                z, data.test_pos_edge_index
            ) + (1 / data.num_nodes) * model.kl_loss()
        test_losses.append(test_loss.item())
        aucs.append(auc)
        aps.append(ap)

        if epoch % report_every == 0:
            print(
                'Epoch: {}        Train loss: {}    Test loss: {}    AUC: {}    AP: {:.4f}'
                .format(epoch, train_loss, test_loss, auc, ap))
        if epoch == epochs:
            print(
                '-' * 65,
                '\nFinal epoch: {}  Train loss: {}    Test loss: {}    AUC: {}    AP: {}'
                .format(epoch, train_loss, test_loss, auc, ap))
        # NOTE: this line is written for every epoch even though it is
        # labelled "Final epoch"; the text is kept for log compatibility.
        log = 'Final epoch: {}    Train loss: {}    Test loss: {}    AUC: {}    AP: {}'.format(
            epoch, train_loss, test_loss, auc, ap)
        write_log(log, figname)
    print('-' * 65)

    plot_linkpred(train_losses, test_losses, aucs, aps, output_dir, epochs,
                  figname)
Example #6
0
def run_model(dataset, conf):
    """Embed table cells with a VGAE over a cell graph and evaluate the result.

    Steps: tokenise the tables, build a graph whose edges come from row and
    column relations, train a VGAE on it, turn the latent node vectors into
    table vectors, and score them with ``evaluate_model``.

    Args:
        dataset: Table corpus handed to ``create_corpus`` and
            ``evaluate_model``.
        conf: Mapping with the keys ``add_attr``, ``shuffle_vocab``,
            ``row_edges_sample``, ``column_edges_sample``, ``vector_size``
            and ``epoch_num``.

    Returns:
        Tuple ``(cell_vectors, vec_list, losses, result_score)`` where
        ``losses`` holds one ``(recon_loss, kl_loss, total_loss)`` tuple of
        detached tensors per epoch.
    """
    # ## 1) Build Table graph
    # ### Tables tokenization
    tokenized_tables, vocabulary, cell_dict, reversed_dictionary = create_corpus(
        dataset, include_attr=conf["add_attr"])
    # Strict comparison kept on purpose so non-bool config values behave
    # exactly as before.
    if conf["shuffle_vocab"] == True:  # noqa: E712
        shuffled_vocab = shuffle_vocabulary(vocabulary)
    else:
        shuffled_vocab = None

    nodes = build_node_features(vocabulary)
    row_edges_index, row_edges_weights = build_graph_edges(
        tokenized_tables,
        s_vocab=shuffled_vocab,
        sample_frac=conf["row_edges_sample"],
        columns=False)
    col_edges_index, col_edges_weights = build_graph_edges(
        tokenized_tables,
        s_vocab=shuffled_vocab,
        sample_frac=conf["column_edges_sample"],
        columns=True)

    edges = torch.cat((row_edges_index, col_edges_index), dim=1)
    weights = torch.cat((row_edges_weights, col_edges_weights), dim=0)
    # The edge weights are stored on the graph but are not passed to the
    # encoder below.
    graph_data = Data(x=nodes, edge_index=edges, edge_attr=weights)

    # ## 2 ) Run Table Auto-Encoder Model:
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    graph_data = graph_data.to(device)

    # Use the tensors that were moved to `device`; the original `nodes` and
    # `edges` stay on the CPU and would mismatch a model living on the GPU.
    x, train_pos_edge_index = graph_data.x, graph_data.edge_index

    class Encoder(torch.nn.Module):
        """Two-layer GCN encoder producing the VGAE mean and log-variance."""

        def __init__(self, in_channels, out_channels):
            super().__init__()
            self.conv1 = GCNConv(in_channels, 2 * out_channels, cached=True)
            self.conv_mu = GCNConv(2 * out_channels, out_channels, cached=True)
            self.conv_logvar = GCNConv(2 * out_channels,
                                       out_channels,
                                       cached=True)

        def forward(self, x, edge_index):
            x = F.relu(self.conv1(x, edge_index))
            return self.conv_mu(x, edge_index), self.conv_logvar(x, edge_index)

    channels = conf["vector_size"]
    enc = Encoder(graph_data.num_features, channels)
    model = VGAE(enc)
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    def train(model, optimizer, x, train_pos_edge_index):
        """Run one optimisation step and return (recon, kl, total) losses."""
        model.train()
        optimizer.zero_grad()
        z = model.encode(x, train_pos_edge_index)
        rl = model.recon_loss(z, train_pos_edge_index)
        kl = model.kl_loss()

        loss = rl + kl

        loss.backward()
        optimizer.step()
        # Detach so the stored history does not keep autograd nodes alive.
        return (rl.detach(), kl.detach(), loss.detach())

    losses = []
    for epoch in range(conf["epoch_num"]):
        loss = train(model, optimizer, x, train_pos_edge_index)
        # Record each epoch exactly once (it used to be appended twice).
        losses.append(loss)
        print(epoch, loss)

    # ### 3) Extract the latent cell vectors, generate table vectors:
    def get_cell_vectors(model, x, train_pos_edge_index):
        """Return the latent tensor and its NumPy copy in eval mode."""
        model.eval()
        with torch.no_grad():
            z = model.encode(x, train_pos_edge_index)
            # `.cpu()` is required before `.numpy()` when running on CUDA.
            cell_vectors = z.cpu().numpy()
        return z, cell_vectors

    z, cell_vectors = get_cell_vectors(model, x, train_pos_edge_index)

    vec_list = generate_table_vectors(cell_vectors,
                                      tokenized_tables,
                                      s_vocab=shuffled_vocab)

    # ## 4) Evaluate the model
    result_score = evaluate_model(dataset, vec_list, k=5)
    return cell_vectors, vec_list, losses, result_score