Example #1
    def _process(self):

        # Reload train, val and test .pt files
        self.train = PPI(root=self.raw_dir, split='train')
        self.validation = PPI(root=self.raw_dir, split='val')
        self.test = PPI(root=self.raw_dir, split='test')

        # dynamically set maximum num nodes (useful if using dense batching, e.g. diffpool)
        max_num_nodes = max([g.x.shape[0] for data in [self.train, self.validation, self.test] for g in data])
        setattr(self, 'max_num_nodes', max_num_nodes)

        # The 11th feature (index 10) is constant, so remove it
        idx_to_remove = 10
        mask = torch.LongTensor(list(range(idx_to_remove)) + list(range(idx_to_remove+1, 50)) )

        # Convert PyG Data object into our augmented Data object
        dataset = [Data(x=g.x.index_select(1, mask), y=g.y, edge_index=g.edge_index) for data in [self.train, self.validation, self.test] for g in data]

        '''
        Used to debug feature filtering
        for g in dataset:
            print(g.x.shape)
            print(g.x.index_select(1, mask).shape)
        '''

        torch.save(dataset, self.processed_dir / f"{self.name}.pt")
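For reference, the processed file written above can be read back with torch.load; the sketch below is an illustration only, with placeholder paths and names, assuming processed_dir is a pathlib.Path as implied by the save call.

import torch
from pathlib import Path

processed_dir = Path("data/PPI/processed")  # placeholder; use the dataset's actual processed_dir
name = "PPI"                                # placeholder; use the dataset's actual name

# the saved object is a plain Python list of Data objects (20 train + 2 val + 2 test graphs)
graphs = torch.load(processed_dir / f"{name}.pt")  # recent PyTorch may require weights_only=False
print(len(graphs), graphs[0].x.shape)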
Example #2
def main(args):
    print('-----------dense gsdnef ppi alpha %s-----------' % (args.alpha))
    ## loading data
    dataset = args.input
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'PPI')
    train_dataset = PPI(path, split='train')
    val_dataset = PPI(path, split='val')
    test_dataset = PPI(path, split='test')
    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = Net(in_channels=train_dataset.num_features,
                out_channels=train_dataset.num_classes,
                hidden_num=args.hidden_num,
                alpha=args.alpha,
                K=args.K).to(device)
    loss_op = torch.nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    t1 = time.time()

    for epoch in range(1, args.epochs + 1):
        loss = train(model, optimizer, train_loader, loss_op, device)
        val_f1 = test(model, val_loader, device)
        test_f1 = test(model, test_loader, device)
        print('Epoch: {:02d}, Loss: {:.4f}, Val: {:.4f}, Test: {:.4f}'.format(
            epoch, loss, val_f1, test_f1))
    print('{:.4f}'.format(test_f1))
    t2 = time.time()
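The train and test helpers called by main are not shown in this example; the sketch below is an assumption of their shape, following the multi-label BCEWithLogitsLoss / micro-F1 pattern used in Example #22 and assuming the model's forward takes (x, edge_index).

import torch
from sklearn.metrics import f1_score


def train(model, optimizer, train_loader, loss_op, device):
    model.train()
    total_loss = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        loss = loss_op(model(data.x, data.edge_index), data.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * data.num_graphs
    return total_loss / len(train_loader.dataset)


@torch.no_grad()
def test(model, loader, device):
    model.eval()
    ys, preds = [], []
    for data in loader:
        ys.append(data.y)
        out = model(data.x.to(device), data.edge_index.to(device))
        preds.append((out > 0).float().cpu())
    y, pred = torch.cat(ys, dim=0).numpy(), torch.cat(preds, dim=0).numpy()
    return f1_score(y, pred, average='micro') if pred.sum() > 0 else 0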
Example #3
def load_data():
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'PPI')
    train_dataset = PPI(path, split='train')
    val_dataset = PPI(path, split='val')
    test_dataset = PPI(path, split='test')
    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)
    return train_loader, val_loader, test_loader
Example #4
def load_ppi_data():
    path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                        "data", "PPI")
    train_dataset = PPI(path, split="train", transform=T.NormalizeFeatures())
    val_dataset = PPI(path, split="val", transform=T.NormalizeFeatures())
    test_dataset = PPI(path, split="test", transform=T.NormalizeFeatures())
    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=False)
    val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)
    return [train_loader, val_loader, test_loader]
Example #5
 def __init__(self, name):
     path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data',
                     name)
     self.path = path
     self.train_dataset = PPI(self.path,
                              split='train',
                              transform=T.NormalizeFeatures())
     self.test_dataset = PPI(self.path,
                             split='test',
                             transform=T.NormalizeFeatures())
     self.num_features = self.train_dataset.num_features
     self.reconstruction_loss = None
Example #6
def get_ppi_dataset(name):
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'datasets',
                    'node_datasets', name)
    train_dataset = PPI(path, split='train')
    val_dataset = PPI(path, split='val')
    test_dataset = PPI(path, split='test')

    dataset = {
        'train': train_dataset,
        'val': val_dataset,
        'test': test_dataset
    }
    return dataset
Example #7
def loadPPI():
    from torch_geometric.datasets import PPI
    ppi = PPI(root="data/ppi/")
    ppitrain = ppi
    ppi += PPI(root="data/ppi/", split="val")
    ppi += PPI(root="data/ppi/", split="test")
    Y = []
    for ppii in ppi:
        Y.append(ppii.y)
    Y = torch.cat(Y, dim=0)
    labels = Y
    goodclass = selectclass(labels)
    # print(labels.size())
    labels = labels[:, goodclass]
    print(labels[:10, :])
    print("=" * 20)
    selectclass(labels)  ## check
    # print(ppi.data.x[:10,:])[]
    # for i in range()

    for i in range(len(ppi)):
        print("ppi{}".format(i))
        ppii = ppi[i]
        N = ppii.x.size(0)
        edgelist = ppii.edge_index.transpose(0, 1).numpy().tolist()
        with open("data/ppi/ppi" + str(i) + ".edgelist", 'w') as file:
            file.write(str(N) + "\n")
            for edge in edgelist:
                file.write(str(edge[0]) + " " + str(edge[1]) + "\n")
        labeli = ppii.y[:, goodclass].numpy()
        with open("data/ppi/ppi" + str(i) + ".y.pkl", 'wb') as file:
            pkl.dump(labeli, file)
        with open("data/ppi/ppi" + str(i) + ".x.pkl", 'wb') as file:
            xi = ppii.x.numpy()
            pkl.dump(xi, file)

    i = 24
    print("ppitrain {}".format(i))
    ppii = ppitrain.data
    N = ppii.x.size(0)
    edgelist = ppii.edge_index.transpose(0, 1).numpy().tolist()
    with open("data/ppi/ppi" + str(i) + ".edgelist", 'w') as file:
        file.write(str(N) + "\n")
        for edge in edgelist:
            file.write(str(edge[0]) + " " + str(edge[1]) + "\n")
    labeli = ppii.y[:, goodclass].numpy()
    with open("data/ppi/ppi" + str(i) + ".y.pkl", 'wb') as file:
        pkl.dump(labeli, file)
    with open("data/ppi/ppi" + str(i) + ".x.pkl", 'wb') as file:
        xi = ppii.x.numpy()
        pkl.dump(xi, file)
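A small read-back sketch for the files written above; it only assumes what loadPPI itself writes (node count on the first line of the edgelist, one edge per line, pickled NumPy arrays for features and labels).

import pickle as pkl

i = 0  # any graph index written by loadPPI
with open("data/ppi/ppi" + str(i) + ".edgelist") as file:
    num_nodes = int(file.readline())
    edges = [tuple(map(int, line.split())) for line in file]
with open("data/ppi/ppi" + str(i) + ".x.pkl", 'rb') as file:
    x = pkl.load(file)  # node features (50-dimensional for PPI)
with open("data/ppi/ppi" + str(i) + ".y.pkl", 'rb') as file:
    y = pkl.load(file)  # labels restricted to the classes kept by selectclass
print(num_nodes, len(edges), x.shape, y.shape)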
Example #8
def load_PPI(dataset_folder):
    """
    Load the PPI dataset, process it into Data format and split it into training, validation and test sets.
    :param dataset_folder: dataset storage path
    :return: train, val and test datasets and their DataLoaders
    """
    path = os.path.join(os.path.dirname(dataset_folder), 'PPI')
    train_dataset = PPI(path, split='train')
    val_dataset = PPI(path, split='val')
    test_dataset = PPI(path, split='test')
    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)
    return train_dataset, val_dataset, test_dataset, train_loader, val_loader, test_loader
Example #9
def load_ppi(dataset):
    data_name = ['PPI']
    assert dataset in data_name
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'Datasets',
                    'NodeData', 'PPI')
    dataset = PPI(path)
    return dataset
Example #10
def main():
    if not os.path.exists("plots/cluster"):
        os.makedirs("plots/cluster")

    parser = argparse.ArgumentParser(description='Decoder arguments')
    parse_encoder(parser)
    parse_decoder(parser)
    args = parser.parse_args()
    args.dataset = "enzymes"

    print("Using dataset {}".format(args.dataset))
    if args.dataset == 'enzymes':
        dataset = TUDataset(root='/tmp/ENZYMES', name='ENZYMES')
        task = 'graph'
    elif args.dataset == 'cox2':
        dataset = TUDataset(root='/tmp/cox2', name='COX2')
        task = 'graph'
    elif args.dataset == 'reddit-binary':
        dataset = TUDataset(root='/tmp/REDDIT-BINARY', name='REDDIT-BINARY')
        task = 'graph'
    elif args.dataset == 'dblp':
        dataset = TUDataset(root='/tmp/dblp', name='DBLP_v1')
        task = 'graph-truncate'
    elif args.dataset == 'coil':
        dataset = TUDataset(root='/tmp/coil', name='COIL-DEL')
        task = 'graph'
    elif args.dataset.startswith('roadnet-'):
        graph = nx.Graph()
        with open("data/{}.txt".format(args.dataset), "r") as f:
            for row in f:
                if not row.startswith("#"):
                    a, b = row.split("\t")
                    graph.add_edge(int(a), int(b))
        dataset = [graph]
        task = 'graph'
    elif args.dataset == "ppi":
        dataset = PPI(root="/tmp/PPI")
        task = 'graph'
    elif args.dataset in ['diseasome', 'usroads', 'mn-roads', 'infect']:
        fn = {
            "diseasome": "bio-diseasome.mtx",
            "usroads": "road-usroads.mtx",
            "mn-roads": "mn-roads.mtx",
            "infect": "infect-dublin.edges"
        }
        graph = nx.Graph()
        with open("data/{}".format(fn[args.dataset]), "r") as f:
            for line in f:
                if not line.strip(): continue
                a, b = line.strip().split(" ")
                graph.add_edge(int(a), int(b))
        dataset = [graph]
        task = 'graph'
    elif args.dataset.startswith('plant-'):
        size = int(args.dataset.split("-")[-1])
        dataset = make_plant_dataset(size)
        task = 'graph'

    pattern_growth(dataset, task, args)
Example #11
def load_data(path):
    train_dataset = PPI(path, split='train')
    val_dataset = PPI(path, split='val')
    test_dataset = PPI(path, split='test')
    train_loader = DataLoader(train_dataset,
                              batch_size=1,
                              shuffle=True,
                              num_workers=0)
    val_loader = DataLoader(val_dataset,
                            batch_size=2,
                            shuffle=False,
                            num_workers=0)
    test_loader = DataLoader(test_dataset,
                             batch_size=2,
                             shuffle=False,
                             num_workers=0)

    return train_dataset, val_dataset, test_dataset, train_loader, val_loader, test_loader
Example #12
def test_PPI(dataset_folder):
    """
        导入PPI数据集,处理为Data格式
        :param dataset_folder: 数据集存储路径
        :return: 测试集
        """
    path = os.path.join(os.path.dirname(dataset_folder), 'PPI')
    test_dataset = PPI(path, split='test')
    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)
    return test_dataset, test_loader
Example #13
    def _process(self):

        # Reload train, val and test .pt files
        self.train = PPI(root=self.raw_dir, split='train')
        self.validation = PPI(root=self.raw_dir, split='val')
        self.test = PPI(root=self.raw_dir, split='test')

        # dynamically set maximum num nodes (useful if using dense batching, e.g. diffpool)
        max_num_nodes = max([
            g.x.shape[0] for data in [self.train, self.validation, self.test]
            for g in data
        ])
        setattr(self, 'max_num_nodes', max_num_nodes)

        # The 11th feature (index 10) is constant, so remove it
        idx_to_remove = 10
        mask = torch.LongTensor(
            list(range(idx_to_remove)) + list(range(idx_to_remove + 1, 50)))

        # Convert PyG Data object into our augmented Data object
        dataset = [
            Data(x=g.x.index_select(1, mask), y=g.y, edge_index=g.edge_index)
            for data in [self.train, self.validation, self.test] for g in data
        ]
        '''
        Used to debug feature filtering
        for g in dataset:
            print(g.x.shape)
            print(g.x.index_select(1, mask).shape)
        '''

        # TODO how do we do this on PPI when we already have the Data objects? (MINOR)
        #    if self.precompute_kron_indices:
        #    laplacians, v_plus_list = self._precompute_kron_indices(G)
        #       G.laplacians = laplacians
        #       G.v_plus = v_plus_list
        #
        #   if G.number_of_nodes() > 1 and G.number_of_edges() > 0:
        #       data = self._to_data(G) # TODO this already exists!!
        #       dataset.append(data)

        torch.save(dataset, self.processed_dir / f"{self.name}.pt")
Example #14
 def __init__(self, path: str):
     train_dataset = PPI(os.path.join(path, '_pyg'), 'train')
     if hasattr(train_dataset, "__data_list__"):
         delattr(train_dataset, "__data_list__")
     if hasattr(train_dataset, "_data_list"):
         delattr(train_dataset, "_data_list")
     val_dataset = PPI(os.path.join(path, '_pyg'), 'val')
     if hasattr(val_dataset, "__data_list__"):
         delattr(val_dataset, "__data_list__")
     if hasattr(val_dataset, "_data_list"):
         delattr(val_dataset, "_data_list")
     test_dataset = PPI(os.path.join(path, '_pyg'), 'test')
     if hasattr(test_dataset, "__data_list__"):
         delattr(test_dataset, "__data_list__")
     if hasattr(test_dataset, "_data_list"):
         delattr(test_dataset, "_data_list")
     train_index = range(len(train_dataset))
     val_index = range(len(train_dataset),
                       len(train_dataset) + len(val_dataset))
     test_index = range(
         len(train_dataset) + len(val_dataset),
         len(train_dataset) + len(val_dataset) + len(test_dataset))
     super(PPIDataset, self).__init__([
         GeneralStaticGraphGenerator.create_homogeneous_static_graph(
             {
                 'x': data.x,
                 'y': data.y
             }, data.edge_index) for data in train_dataset
     ] + [
         GeneralStaticGraphGenerator.create_homogeneous_static_graph(
             {
                 'x': data.x,
                 'y': data.y
             }, data.edge_index) for data in val_dataset
     ] + [
         GeneralStaticGraphGenerator.create_homogeneous_static_graph(
             {
                 'x': data.x,
                 'y': data.y
             }, data.edge_index) for data in test_dataset
     ], train_index, val_index, test_index)
Example #15
def load_dataset(name):
    name = name.lower()

    if name in ['cora', 'citeseer', 'pubmed']:
        return Planetoid(root=name, name=name, pre_transform=normalize_features)
    elif name == 'ppi':
        datasets = []
        for split in ['train', 'val', 'test']:
            dataset = PPI(root='PPI', split=split, pre_transform=normalize_features)

            datasets.append(dataset)
            
        return datasets
Example #16
def get_dataset(dataset_name):
    """
    Retrieves the dataset corresponding to the given name.
    """
    print("Getting dataset...")
    path = join('dataset', dataset_name)
    if dataset_name == 'reddit':
        dataset = Reddit(path)
    elif dataset_name == 'ppi':
        dataset = PPI(path)
    elif dataset_name == 'github':
        dataset = GitHub(path)
        data = dataset.data
        idx_train, idx_test = train_test_split(list(range(data.x.shape[0])),
                                               test_size=0.4,
                                               random_state=42)
        idx_val, idx_test = train_test_split(idx_test,
                                             test_size=0.5,
                                             random_state=42)
        data.train_mask = torch.tensor(idx_train)
        data.val_mask = torch.tensor(idx_val)
        data.test_mask = torch.tensor(idx_test)
        dataset.data = data
    elif dataset_name in ['amazon_comp', 'amazon_photo']:
        dataset = Amazon(path, "Computers", T.NormalizeFeatures()
                         ) if dataset_name == 'amazon_comp' else Amazon(
                             path, "Photo", T.NormalizeFeatures())
        data = dataset.data
        idx_train, idx_test = train_test_split(list(range(data.x.shape[0])),
                                               test_size=0.4,
                                               random_state=42)
        idx_val, idx_test = train_test_split(idx_test,
                                             test_size=0.5,
                                             random_state=42)
        data.train_mask = torch.tensor(idx_train)
        data.val_mask = torch.tensor(idx_val)
        data.test_mask = torch.tensor(idx_test)
        dataset.data = data
    elif dataset_name in ["Cora", "CiteSeer", "PubMed"]:
        dataset = Planetoid(path,
                            name=dataset_name,
                            split="full",
                            transform=T.NormalizeFeatures())
    else:
        raise NotImplementedError

    print("Dataset ready!")
    return dataset
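Note that the train_mask / val_mask / test_mask attributes set in the github and amazon branches hold index tensors rather than boolean masks. If downstream code expects boolean masks (an assumption, not stated in this example), a minimal conversion sketch:

import torch


def index_to_mask(index, num_nodes):
    # turn a 1-D tensor of node indices into a boolean mask of length num_nodes
    mask = torch.zeros(num_nodes, dtype=torch.bool)
    mask[index] = True
    return mask

# e.g. data.train_mask = index_to_mask(data.train_mask, data.x.shape[0])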
Example #17
def load_pyg(name, dataset_dir):
    """
    Load PyG dataset objects. (More PyG datasets will be supported)

    Args:
        name (string): dataset name
        dataset_dir (string): data directory

    Returns: PyG dataset object

    """
    dataset_dir = '{}/{}'.format(dataset_dir, name)
    if name in ['Cora', 'CiteSeer', 'PubMed']:
        dataset = Planetoid(dataset_dir, name)
    elif name[:3] == 'TU_':
        # TU_IMDB doesn't have node features
        if name[3:] == 'IMDB':
            name = 'IMDB-MULTI'
            dataset = TUDataset(dataset_dir, name, transform=T.Constant())
        else:
            dataset = TUDataset(dataset_dir, name[3:])
    elif name == 'Karate':
        dataset = KarateClub()
    elif 'Coauthor' in name:
        if 'CS' in name:
            dataset = Coauthor(dataset_dir, name='CS')
        else:
            dataset = Coauthor(dataset_dir, name='Physics')
    elif 'Amazon' in name:
        if 'Computers' in name:
            dataset = Amazon(dataset_dir, name='Computers')
        else:
            dataset = Amazon(dataset_dir, name='Photo')
    elif name == 'MNIST':
        dataset = MNISTSuperpixels(dataset_dir)
    elif name == 'PPI':
        dataset = PPI(dataset_dir)
    elif name == 'QM7b':
        dataset = QM7b(dataset_dir)
    else:
        raise ValueError('{} not supported'.format(name))

    return dataset
Example #18
def load_dataset(name):
    """ Load real-world datasets, available in PyTorch Geometric.

    Used as a helper for DiskDataSource.
    """
    task = "graph"
    if name == "enzymes":
        dataset = TUDataset(root="/tmp/ENZYMES", name="ENZYMES")
    elif name == "proteins":
        dataset = TUDataset(root="/tmp/PROTEINS", name="PROTEINS")
    elif name == "cox2":
        dataset = TUDataset(root="/tmp/cox2", name="COX2")
    elif name == "aids":
        dataset = TUDataset(root="/tmp/AIDS", name="AIDS")
    elif name == "reddit-binary":
        dataset = TUDataset(root="/tmp/REDDIT-BINARY", name="REDDIT-BINARY")
    elif name == "imdb-binary":
        dataset = TUDataset(root="/tmp/IMDB-BINARY", name="IMDB-BINARY")
    elif name == "firstmm_db":
        dataset = TUDataset(root="/tmp/FIRSTMM_DB", name="FIRSTMM_DB")
    elif name == "dblp":
        dataset = TUDataset(root="/tmp/DBLP_v1", name="DBLP_v1")
    elif name == "ppi":
        dataset = PPI(root="/tmp/PPI")
    elif name == "qm9":
        dataset = QM9(root="/tmp/QM9")
    elif name == "atlas":
        dataset = [g for g in nx.graph_atlas_g()[1:] if nx.is_connected(g)]
    if task == "graph":
        train_len = int(0.8 * len(dataset))
        train, test = [], []
        dataset = list(dataset)
        random.shuffle(dataset)
        has_name = hasattr(dataset[0], "name")
        for i, graph in tqdm(enumerate(dataset)):
            if not type(graph) == nx.Graph:
                if has_name: del graph.name
                graph = pyg_utils.to_networkx(graph).to_undirected()
            if i < train_len:
                train.append(graph)
            else:
                test.append(graph)
    return train, test, task
Example #19
def ppi_prepoc(dirname, seed):
    # 20 protein graphs - some set as validation, some as train, some as test.
    # Need to create the relevant masks for each graph
    data = SimpleNamespace()
    data.graphs = []
    for split in ['train', 'val', 'test']:
        split_data = PPI(root=dirname,
                         split=split,
                         pre_transform=T.NormalizeFeatures())
        x_idxs = split_data.slices['x'].numpy()
        edge_idxs = split_data.slices['edge_index'].numpy()
        split_data = split_data.data
        for x_start, x_end, e_start, e_end in zip(x_idxs, x_idxs[1:],
                                                  edge_idxs, edge_idxs[1:]):
            graph = Data(split_data.x[x_start:x_end],
                         split_data.edge_index[:, e_start:e_end],
                         y=split_data.y[x_start:x_end])
            graph.num_nodes = int(x_end - x_start)
            graph.split = split
            all_true = torch.ones(graph.num_nodes).bool()
            all_false = torch.zeros(graph.num_nodes).bool()
            graph.train_mask = all_true if split == 'train' else all_false
            graph.val_mask = all_true if split == 'val' else all_false
            graph.test_mask = all_true if split == 'test' else all_false
            data.graphs.append(graph)
    if seed != 0:
        temp_random = random.Random(seed)
        val_graphs = temp_random.sample(range(len(data.graphs)), 2)
        test_candidates = [
            graph_idx for graph_idx in range(len(data.graphs))
            if graph_idx not in val_graphs
        ]
        test_graphs = temp_random.sample(test_candidates, 2)
        for graph_idx, graph in enumerate(data.graphs):
            all_true = torch.ones(graph.num_nodes).bool()
            all_false = torch.zeros(graph.num_nodes).bool()
            graph.split = 'test' if graph_idx in test_graphs else 'val' if graph_idx in val_graphs else 'train'
            graph.train_mask = all_true if graph.split == 'train' else all_false
            graph.val_mask = all_true if graph.split == 'val' else all_false
            graph.test_mask = all_true if graph.split == 'test' else all_false

    return data
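A small usage sketch for the returned object, relying only on the fields set in ppi_prepoc (each graph carries full-graph boolean masks for exactly one split); the root directory is a placeholder.

data = ppi_prepoc('data/PPI', seed=0)  # placeholder root directory
train_graphs = [g for g in data.graphs if bool(g.train_mask.all())]
val_graphs = [g for g in data.graphs if bool(g.val_mask.all())]
test_graphs = [g for g in data.graphs if bool(g.test_mask.all())]
print(len(train_graphs), len(val_graphs), len(test_graphs))  # 20, 2, 2 for the default PPI split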
Example #20
import os.path as osp
import time
import torch
import torch.nn.functional as F
from torch_geometric.datasets import PPI
from torch_geometric.data import DataLoader
from torch_geometric.nn import GATConv
from sklearn.metrics import f1_score
import pandas as pd

path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'PPI')
train_dataset = PPI(path, split='train')
val_dataset = PPI(path, split='val')
test_dataset = PPI(path, split='test')
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)


class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = GATConv(train_dataset.num_features, 256, heads=4)
        self.lin1 = torch.nn.Linear(train_dataset.num_features, 4 * 256)
        self.conv2 = GATConv(4 * 256, 256, heads=4)
        self.lin2 = torch.nn.Linear(4 * 256, 4 * 256)
        self.conv3 = GATConv(
            4 * 256, train_dataset.num_classes, heads=6, concat=False)
        self.lin3 = torch.nn.Linear(4 * 256, train_dataset.num_classes)
        #self.lin3 = torch.nn.Linear(4 * 256, 200)
Example #21
import numpy as np
import torch.nn as nn
import os.path as osp
import torch.nn.functional as F
from sklearn.metrics import f1_score 
import torch_geometric.transforms as T
from torch_geometric.data import Batch
from torch_geometric.nn import ChebConv
from torch_geometric.datasets import PPI
from torch_geometric.datasets import Planetoid
from torch_geometric.data import ClusterData, ClusterLoader
from torch_geometric.data import DataLoader, Data

# Data
path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'PPI')
train_dataset = PPI(path, split='train')    # 20 graphs
val_dataset = PPI(path, split='val')        # 2 graphs
test_dataset = PPI(path, split='test')      # 2 graphs
dataset = PPI(path)


# Data to Loader
train_data_list = [data for data in train_dataset]
for data in train_data_list:
    data.train_mask = torch.ones(data.num_nodes, dtype=torch.bool)
    data.val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    data.test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    
val_data_list = [data for data in val_dataset]
for data in val_data_list:
    data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
Example #22
def main_ppi(type):
    """
	:arg type: 'GCN' or 'GAT'
	"""
    # Import PPI dataset
    path = osp.join(
        os.path.dirname(os.path.dirname(os.path.realpath(__file__))), 'data',
        'PPI')
    train_dataset = PPI(path, split='train')
    val_dataset = PPI(path, split='val')
    test_dataset = PPI(path, split='test')
    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

    # Define training function
    def train():
        model.train()
        total_loss = 0
        for data in train_loader:
            num_graphs = data.num_graphs
            data.batch = None
            data = data.to(device)
            optimizer.zero_grad()
            loss = loss_op(model(data.x, data.edge_index), data.y)
            total_loss += loss.item() * num_graphs
            loss.backward()
            optimizer.step()
        return total_loss / len(train_loader.dataset)

    # Define testing pipeline
    def test(loader):
        model.eval()
        ys, preds = [], []
        for data in loader:
            ys.append(data.y)
            with torch.no_grad():
                out = model(data.x.to(device), data.edge_index.to(device))
            preds.append((out > 0).float().cpu())

        y, pred = torch.cat(ys, dim=0).numpy(), torch.cat(preds, dim=0).numpy()
        return f1_score(y, pred, average='micro') if pred.sum() > 0 else 0

    # Train GAT or GCN model on PPI dataset
    device = 'cpu'  #torch.device('cuda' if torch.cuda.is_available() else
    if type == 'GAT':
        model = Net(train_dataset).to(device)
        epochs = 20
    else:
        model = Net_GCN(train_dataset).to(device)
        epochs = 100
    loss_op = torch.nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

    # Use above functions to train and test the model over n epochs
    for epoch in range(1, epochs + 1):
        loss = train()
        val_f1 = test(val_loader)
        test_f1 = test(test_loader)
        print('Epoch: {:02d}, Loss: {:.4f}, Val: {:.4f}, Test: {:.4f}'.format(
            epoch, loss, val_f1, test_f1))

    #torch.save(model.state_dict(), model_path)
    #model = Net()
    #model.load_state_dict(torch.load(model_path))

    return model
Example #23
 def _download(self):
     # Downloads and stores in raw_dir
     PPI(root=self.raw_dir, split='train')
     PPI(root=self.raw_dir, split='val')
     PPI(root=self.raw_dir, split='test')
Example #24
def TrainingNet(dataset, modelName, params, num_pre_epochs, num_epochs,
                NumCutoff, optimizerName, MonteSize, savepath):
    Batch_size = int(params[0])
    root = '/git/data/GraphData/' + dataset
    TestAccs = []
    for Monte_iter in range(MonteSize):
        # Data
        start_epoch = 0  # start from epoch 0 or last checkpoint epoch
        NewNetworkSizeAdjust = []
        WeightsDynamicsEvolution = []
        # model
        if dataset == 'Cora' or dataset == 'Citeseer' or dataset == 'Pubmed':
            datasetroot = Planetoid(root=root,
                                    name=dataset,
                                    transform=T.NormalizeFeatures()).shuffle()
            trainloader = DataListLoader(datasetroot,
                                         batch_size=Batch_size,
                                         shuffle=True)
            [net,
             model_to_save] = ModelAndSave(dataset, modelName, datasetroot,
                                           params, num_epochs)
            criterion = nn.CrossEntropyLoss()

        elif dataset == "CoraFull":
            datasetroot = CoraFull(root=root,
                                   transform=T.NormalizeFeatures()).shuffle()
            trainloader = DataListLoader(datasetroot,
                                         batch_size=Batch_size,
                                         shuffle=True)
            [net,
             model_to_save] = ModelAndSave(dataset, modelName, datasetroot,
                                           params, num_epochs)

        elif dataset == "Amazon":
            datasetroot = Amazon(root,
                                 "Photo",
                                 transform=None,
                                 pre_transform=None)
            trainloader = DataListLoader(datasetroot,
                                         batch_size=Batch_size,
                                         shuffle=True)
            testloader = DataListLoader(datasetroot,
                                        batch_size=100,
                                        shuffle=False)
            [net,
             model_to_save] = ModelAndSave(dataset, modelName, datasetroot,
                                           params, num_epochs)

        elif dataset == 'ENZYMES' or dataset == 'MUTAG':
            datasetroot = TUDataset(root, name=dataset, use_node_attr=True)
            Num = len(datasetroot) // 10
            global train_dataset, test_dataset
            train_dataset = datasetroot[:Num]
            test_dataset = datasetroot[Num:]
            trainloader = DataLoader(train_dataset, batch_size=Batch_size)
            testloader = DataLoader(test_dataset, batch_size=60)
            [net,
             model_to_save] = ModelAndSave(dataset, modelName, datasetroot,
                                           params, num_epochs)

        elif dataset == "PPI":
            train_dataset = PPI(root, split='train')
            test_dataset = PPI(root, split='test')
            trainloader = DataLoader(train_dataset,
                                     batch_size=Batch_size,
                                     shuffle=True)
            testloader = DataLoader(test_dataset, batch_size=1, shuffle=False)
            [net,
             model_to_save] = ModelAndSave(dataset, modelName, train_dataset,
                                           params, num_epochs)
            criterion = torch.nn.BCEWithLogitsLoss()

        elif dataset == "Reddit":
            datasetroot = Reddit(root)
            trainloader = DataListLoader(datasetroot,
                                         batch_size=1,
                                         shuffle=True)
            testloader = DataListLoader(datasetroot,
                                        batch_size=2,
                                        shuffle=False)
            [net,
             model_to_save] = ModelAndSave(dataset, modelName, datasetroot,
                                           params, num_epochs)
            criterion = torch.nn.BCEWithLogitsLoss()

        elif dataset == 'MNIST':
            datasetroot = MNISTSuperpixels(root=root, transform=T.Cartesian())
            trainloader = DataListLoader(datasetroot,
                                         batch_size=Batch_size,
                                         shuffle=True)
            testloader = DataListLoader(datasetroot,
                                        batch_size=100,
                                        shuffle=False)
            [net,
             model_to_save] = ModelAndSave(dataset, modelName, datasetroot,
                                           params, num_epochs)

        elif dataset == 'CIFAR10':
            pass
        else:
            raise Exception("Input wrong datatset!!")

        FileName = "{}-{}-param_{}_{}_{}_{}-monte_{}".format(
            dataset, modelName, params[0], params[1], params[2], params[3],
            Monte_iter)

        print('Let\'s use', torch.cuda.device_count(), 'GPUs!')
        global device
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        optimizer = optim.Adam(net.parameters(),
                               lr=params[3],
                               betas=(0.9, 0.999),
                               eps=1e-08,
                               weight_decay=0,
                               amsgrad=False)
        # keep the dataset-specific loss (e.g. BCEWithLogitsLoss for PPI / Reddit) if one was set above
        if 'criterion' not in locals():
            criterion = nn.CrossEntropyLoss()
        net = net.to(device)

        #cudnn.benchmark = True
        logging(
            'dataset:{}, Batch size: {}, Number of layers:{} ConCoeff: {}, LR:{}, MonteSize:{}'
            .format(dataset, params[0], params[1], params[2], params[3],
                    Monte_iter))
        mark = "{}/{}Convergence/DiagElement-{}".format(
            savepath, dataset, FileName)

        PreTrainConvergence, PreTestConvergence, PreTestAcc = TrainPart(
            modelName, datasetroot, start_epoch, num_pre_epochs, trainloader,
            testloader, net, optimizer, criterion, NumCutoff, mark, False,
            model_to_save)
        print(
            'dataset: {}, model name: {}, Number epoches: {},  Pre-train error is: {}, Pre-test error is: {}, test acc is {}'
            .format(dataset, modelName, num_pre_epochs,
                    PreTrainConvergence[-1], PreTestConvergence[-1],
                    PreTestAcc[-1]))

        NewNetworksize, NewNetworkWeight = RetainNetworkSize(net,
                                                             params[2])[0:2]
        NetworkInfo = [NewNetworksize[0:-1], NewNetworkWeight]
        OptimizedNet = ChooseModel(modelName, datasetroot, NetworkInfo)
        NewNetworksize.insert(0, datasetroot.num_features)
        NewNetworkSizeAdjust.append(NewNetworksize[0:-1])
        print(NewNetworkSizeAdjust)

        #OptimizedNet.apply(init_weights)
        #OptimizedNet = DataParallel(OptimizedNet)
        OptimizedNet = OptimizedNet.to(device)
        cudnn.benchmark = True
        criterionNew = nn.CrossEntropyLoss()
        if optimizerName == "SGD":
            optimizerNew = getattr(optim,
                                   optimizerName)(OptimizedNet.parameters(),
                                                  lr=params[3],
                                                  momentum=0.9,
                                                  weight_decay=5e-4)
        elif optimizerName == "Adam":
            optimizerNew = getattr(optim,
                                   optimizerName)(OptimizedNet.parameters(),
                                                  lr=params[3],
                                                  betas=(0.9, 0.999),
                                                  eps=1e-08,
                                                  weight_decay=5e-4,
                                                  amsgrad=False)

        TrainConvergence, TestConvergence, TestAcc = TrainPart(
            modelName, datasetroot, start_epoch, num_epochs, trainloader,
            testloader, OptimizedNet, optimizerNew, criterionNew, NumCutoff,
            mark, True, model_to_save)
        np.save(
            "{}/{}Convergence/TrainConvergence-{}".format(
                savepath, dataset, FileName), TrainConvergence)
        np.save(
            "{}/{}Convergence/TestConvergence-{}".format(
                savepath, dataset, FileName), TestConvergence)

        np.save(
            "{}/{}Convergence/NewNetworkSizeAdjust-{}".format(
                savepath, dataset, FileName), NewNetworkSizeAdjust)

        #np.save(savepath+'TestConvergence-'+FileName,TestConvergence)
        #torch.cuda.empty_cache()

        print(
            'dataset: {}, model name:{}, resized network size is {},  Number epoches:{},  Train error is: {}, Test error is: {}, test acc is {}\n'
            .format(dataset, modelName, NewNetworksize[0:-1], num_epochs,
                    TrainConvergence[-1], TestConvergence[-1], TestAcc[-1]))
        TestAccs.append(TestAcc)
        np.save(
            "{}/{}Convergence/MeanTestAccs-{}".format(savepath, dataset,
                                                      FileName), TestAccs)
    print("The change of test error is:{}".format(TestAccs))
    print_nvidia_useage()
Example #25
import os.path as osp

import torch
from torch.nn import Linear
import torch.nn.functional as F
from sklearn.metrics import f1_score
from torch_geometric.datasets import PPI
import torch_geometric.transforms as T
from torch_geometric.nn import GCN2Conv
from torch_geometric.loader import DataLoader

path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'GCN2_PPI')
pre_transform = T.Compose([T.GCNNorm(), T.ToSparseTensor()])
train_dataset = PPI(path, split='train', pre_transform=pre_transform)
val_dataset = PPI(path, split='val', pre_transform=pre_transform)
test_dataset = PPI(path, split='test', pre_transform=pre_transform)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)


class Net(torch.nn.Module):
    def __init__(self, hidden_channels, num_layers, alpha, theta,
                 shared_weights=True, dropout=0.0):
        super(Net, self).__init__()

        self.lins = torch.nn.ModuleList()
        self.lins.append(Linear(train_dataset.num_features, hidden_channels))
        self.lins.append(Linear(hidden_channels, train_dataset.num_classes))

        self.convs = torch.nn.ModuleList()
Example #26
def load_dataset(name):
    """ Load real-world datasets, available in PyTorch Geometric.

    Used as a helper for DiskDataSource.
    """
    task = "graph"
    if name == "enzymes":
        dataset = TUDataset(root="/tmp/ENZYMES", name="ENZYMES")
    elif name == "proteins":
        dataset = TUDataset(root="/tmp/PROTEINS", name="PROTEINS")
    elif name == "cox2":
        dataset = TUDataset(root="/tmp/cox2", name="COX2")
    elif name == "aids":
        dataset = TUDataset(root="/tmp/AIDS", name="AIDS")
    elif name == "reddit-binary":
        dataset = TUDataset(root="/tmp/REDDIT-BINARY", name="REDDIT-BINARY")
    elif name == "imdb-binary":
        dataset = TUDataset(root="/tmp/IMDB-BINARY", name="IMDB-BINARY")
    elif name == "firstmm_db":
        dataset = TUDataset(root="/tmp/FIRSTMM_DB", name="FIRSTMM_DB")
    elif name == "dblp":
        dataset = TUDataset(root="/tmp/DBLP_v1", name="DBLP_v1")
    elif name == "ppi":
        dataset = PPI(root="/tmp/PPI")
    elif name == "qm9":
        dataset = QM9(root="/tmp/QM9")
    elif name == "atlas":
        dataset = [g for g in nx.graph_atlas_g()[1:] if nx.is_connected(g)]
    elif name == 'aifb':
        dataset = Entities(root="/tmp/aifb", name='AIFB')  # 90 edge types
    elif name == 'wn18':
        dataset = WordNet18(root="/tmp/wn18")
    elif name == 'fb15k237':
        dataset = [None]
    if task == "graph":
        train_len = int(0.8 * len(dataset))
        train, test = [], []
        if name not in ['aifb', 'wn18', 'fb15k237']:
            dataset = list(dataset)
            random.shuffle(dataset)
            has_name = hasattr(dataset[0], "name")
        else:
            has_name = True
        for i, graph in tqdm(enumerate(dataset)):
            if not type(graph) == nx.Graph:
                try:
                    if has_name: del graph.name
                except:
                    pass
                if name == 'aifb':
                    graph = pyg_utils.to_networkx(graph,
                                                  edge_attrs=['edge_type'])
                elif name == 'wn18':
                    graph = pyg_utils.to_networkx(graph,
                                                  edge_attrs=['edge_type'])
                elif name == 'fb15k237':
                    data = FB15k_237()
                    (graph, _, _, _) = data.load()
                    graph = graph.to_networkx()
                    edge_type_dict = []
                    for j in graph.edges:
                        edge_type_dict.append(graph.edges[j]['label'])
                    edge_type_dict = {
                        i: ind
                        for ind, i in enumerate(sorted(set(edge_type_dict)))
                    }

                    for j in graph.edges:
                        graph.edges[j]['edge_type'] = edge_type_dict[
                            graph.edges[j]['label']]
                        del graph.edges[j]['label']
                        del graph.edges[j]['weight']
                else:
                    graph = pyg_utils.to_networkx(graph).to_undirected()
            if name == 'aifb':
                train.append(graph)
                test.append(deepcopy(graph))
            elif name == 'wn18':
                train.append(graph)
                test.append(deepcopy(graph))
            elif name == 'fb15k237':
                train.append(graph)
                test.append(deepcopy(graph))
            else:
                if i < train_len:
                    train.append(graph)
                else:
                    test.append(graph)

    return train, test, task
Example #27
def process_inductive(dataset, gnn_type="GCNConv", K=None, random_init=False, runs=10):

    hyperparameters = get_hyperparameters()
    nb_epochs = hyperparameters["nb_epochs"]
    patience = hyperparameters["patience"]
    lr = hyperparameters["lr"]
    l2_coef = hyperparameters["l2_coef"]
    drop_prob = hyperparameters["drop_prob"]
    hid_units = hyperparameters["hid_units"]
    nonlinearity = hyperparameters["nonlinearity"]
    batch_size = hyperparameters["batch_size"]

    norm_features = torch_geometric.transforms.NormalizeFeatures()
    dataset_train = PPI(
        "./geometric_datasets/"+dataset,
        split="train",
        transform=norm_features,
    )
    print(dataset_train)
    dataset_val = PPI(
        "./geometric_datasets/"+dataset,
        split="val",
        transform=norm_features,
    )
    print(dataset_val)
    dataset_test = PPI(
        "./geometric_datasets/"+dataset,
        split="test",
        transform=norm_features,
    )
    data = []
    for d in dataset_train:
        data.append(d)
    for d in dataset_val:
        data.append(d)

    ft_size = dataset_train[0].x.shape[1]
    nb_classes = dataset_train[0].y.shape[1] # multilabel
    b_xent = nn.BCEWithLogitsLoss()

    loader_train = DataLoader(
        data,
        batch_size=hyperparameters["batch_size"],
        shuffle=True,
    )
    loader_test = DataLoader(
        dataset_test,
        batch_size=hyperparameters["batch_size"],
        shuffle=False
    )

    all_accs = []
    for _ in range(runs):
        model = DGI(ft_size, hid_units, nonlinearity, update_rule=gnn_type, batch_size=1, K=K)
        model_name = get_model_name(dataset, gnn_type, K, random_init=random_init)
        print(model)
        optimiser = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=l2_coef)

        if torch.cuda.is_available():
            print('Using CUDA')
            model = model.cuda()
        model.train()

        torch.cuda.empty_cache()
        for epoch in range(20):
            if random_init:
                break
            total_loss = 0
            batch_id = 0
            model.train()
            loaded = list(loader_train)
            for batch in loaded:
                optimiser.zero_grad()
                if torch.cuda.is_available():
                    batch = batch.to('cuda')
                nb_nodes = batch.x.shape[0]
                features = batch.x
                labels = batch.y
                edge_index = batch.edge_index

                idx = np.random.randint(0, len(data))
                while idx == batch_id:
                    idx = np.random.randint(0, len(data))
                shuf_fts = torch.nn.functional.dropout(loaded[idx].x, drop_prob)
                edge_index2 = loaded[idx].edge_index

                lbl_1 = torch.ones(nb_nodes)
                lbl_2 = torch.zeros(shuf_fts.shape[0])
                lbl = torch.cat((lbl_1, lbl_2), 0)

                if torch.cuda.is_available():
                    shuf_fts = shuf_fts.cuda()
                    if edge_index2 is not None:
                        edge_index2 = edge_index2.cuda()
                    lbl = lbl.cuda()
                
                logits = model(features, shuf_fts, edge_index, batch=batch.batch, edge_index_alt=edge_index2)

                loss = b_xent(logits, lbl)
                loss.backward()
                optimiser.step()
                batch_id += 1
                total_loss += loss.item()


            print(epoch, 'Train Loss:', total_loss/(len(dataset_train)))

        torch.save(model.state_dict(), './trained_models/'+model_name)
        torch.cuda.empty_cache()

        print('Loading last epoch')
        if not random_init:
            model.load_state_dict(torch.load('./trained_models/'+model_name))
        model.eval()

        b_xent_reg = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(2.25))
        train_embs, whole_train_data = preprocess_embeddings(model, dataset_train)
        val_embs, whole_val_data = preprocess_embeddings(model, dataset_val)
        test_embs, whole_test_data = preprocess_embeddings(model, dataset_test)

        for _ in range(50):
            log = LogReg(hid_units, nb_classes)
            opt = torch.optim.Adam(log.parameters(), lr=0.01, weight_decay=0.0)
            log.cuda()

            pat_steps = 0
            best = 1e9
            log.train()
            for _ in range(250):
                opt.zero_grad()

                logits = log(train_embs)
                loss = b_xent_reg(logits, whole_train_data.y)
                
                loss.backward()
                opt.step()

                log.eval()
                val_logits = log(val_embs) 
                loss = b_xent_reg(val_logits, whole_val_data.y)
                if loss.item() < best:
                    best = loss.item()
                    pat_steps = 0
                if pat_steps >= 5:
                    break

                pat_steps += 1


            log.eval()
            logits = log(test_embs)
            preds = torch.sigmoid(logits) > 0.5
            f1 = sklearn.metrics.f1_score(whole_test_data.y.cpu(), preds.long().cpu(), average='micro')
            all_accs.append(float(f1))
            print()
            print('Micro-averaged f1:', f1)

    all_accs = torch.tensor(all_accs)

    with open("./results/"+model_name[:-4]+"_results.txt", "w") as f:
        f.writelines([str(all_accs.mean().item())+'\n', str(all_accs.std().item())])
    print(all_accs.mean())
    print(all_accs.std())
Example #28
 def prepare_data(self):
     path = osp.join(osp.dirname(osp.realpath(__file__)), "..", "..",
                     "data", self.NAME)
     self.train_dataset = PPI(path, split="train")
     self.val_dataset = PPI(path, split="val")
     self.test_dataset = PPI(path, split="test")
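prepare_data suggests this class follows the PyTorch Lightning DataModule convention (an assumption); the companion loader hooks would then look roughly like the sketch below, reusing the batch sizes common to the other PPI examples here and assuming DataLoader is imported from torch_geometric.

 # hypothetical companion methods, meant to sit in the same class as prepare_data
 def train_dataloader(self):
     return DataLoader(self.train_dataset, batch_size=1, shuffle=True)

 def val_dataloader(self):
     return DataLoader(self.val_dataset, batch_size=2, shuffle=False)

 def test_dataloader(self):
     return DataLoader(self.test_dataset, batch_size=2, shuffle=False)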
Example #29
    def forward(self, x, edge_index):

        # Apply Dropout to the input features as in the paper
        x = F.dropout(x, p=0., training=self.training)

        H1skip = self.Wskip(x)
        h_1_1 = self.prelu(self.gcn1(x, edge_index))
        h_1_2 = self.prelu(self.gcn2(h_1_1 + H1skip, edge_index))
        h_1_3 = self.prelu(self.gcn3(h_1_2 + H1skip, edge_index))

        x = self.prelu(h_1_3)
        return x


# PPI's first positional argument is the dataset root directory, not the split;
# the root path below is arbitrary and the split is selected via the split keyword
dataset = PPI('data/PPI', split='train')

test_set = PPI('data/PPI', split='test')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# todo training on the first graph only!
#data = dataset[0].to(device)

# Used for other datasets
#def corruption(x, edge_index):
#    return x[torch.randperm(x.size(0))], edge_index


# Used for PPI
def corruption(x, edge_index):
Example #30
def Load_Dataset(Name):
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'PPIdataset')
    if Name == "PPI":
        Train_Dataset = PPI(path, split="train")
        Valid_Dataset = PPI(path, split="val")
        return Train_Dataset, Valid_Dataset