Example #1
def load_data(args):
    dataset = args.input
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', dataset)

    if dataset in ['cora', 'citeseer', 'pubmed']:
        dataset = Planetoid(path, dataset, transform=T.NormalizeFeatures())
        num_features = dataset.num_features
        num_classes = dataset.num_classes
        data = dataset[0]
        return data, num_features, num_classes
    elif dataset == 'corafull':
        dataset = CoraFull(path)
    elif dataset in ['cs', 'physics']:
        dataset = Coauthor(path, name=dataset)
    elif dataset in ['computers', 'photo']:
        dataset = Amazon(path, name=dataset)
    elif dataset == 'reddit':
        dataset = Reddit(path)
        num_features = dataset.num_features
        num_classes = dataset.num_classes
        data = dataset[0]
        return data, num_features, num_classes
    num_features = dataset.num_features
    num_classes = dataset.num_classes
    data = dataset[0]

    data.train_mask, data.val_mask, data.test_mask = generate_split(
        data, num_classes)

    return data, num_features, num_classes
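A minimal usage sketch for the loader above, assuming its torch_geometric imports and the project-specific `generate_split` helper are in scope; the `args` object only needs an `input` attribute naming the dataset:

# Hypothetical driver; args only needs an `input` attribute.
from types import SimpleNamespace

args = SimpleNamespace(input='cora')
data, num_features, num_classes = load_data(args)
print(data)  # e.g. Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], ...)
print(num_features, num_classes)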
Example #2
    def _load_data_small_graphs(self):
        # TODO: complete - fill in the model runner, configurations and loss
        data_transform = None
        print("loading data")
        self._data_path = './DataSets/{}'.format(dataSetName)  # dataSetName, BOW and num_classes are defined elsewhere in the original module
        if dataSetName == "CoraFull":
            self._data_set = CoraFull(self._data_path)
        elif dataSetName in {"CS", "Physics"}:
            self._data_set = Coauthor(self._data_path, dataSetName)
        else:
            self._data_set = Planetoid(self._data_path, dataSetName,
                                       data_transform)

        self._data_set.data.to(self._device)
        self._data = self._data_set[0]
        self._labels = self._data.y
        self._num_classes = self._data_set.num_classes
        self._g = self.create_graph()

        if BOW:
            self._X = self._data.x
            self.in_features = self._data.num_features

        else:  #2k vectors input
            self._X = None
            self.in_features = num_classes * 2
            self._num_classes = num_classes
Example #3
def load_dataset(dataset):
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', dataset)

    if dataset in ['cora', 'citeseer', 'pubmed']:
        dataset = Planetoid(path, dataset, transform=T.NormalizeFeatures())
        num_features = dataset.num_features
        num_classes = dataset.num_classes
        data = dataset[0]
        data.adj = torch.zeros((data.x.size(0), data.x.size(0)))
        col, row = data.edge_index
        data.adj[col, row] = 1
        return data, num_features, num_classes
    elif dataset == 'reddit':
        dataset = Reddit(path)
    elif dataset == 'corafull':
        dataset = CoraFull(path)
    num_features = dataset.num_features
    num_classes = dataset.num_classes
    data = dataset[0]

    data.train_mask, data.val_mask, data.test_mask = generate_split(
        data, num_classes)
    data.adj = torch.zeros((data.x.size(0), data.x.size(0)))
    col, row = data.edge_index
    data.adj[col, row] = 1
    return data, num_features, num_classes
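For comparison, the dense adjacency matrix assembled above by index assignment could also be built with torch_geometric's `to_dense_adj` utility; a small sketch, equivalent for graphs without duplicate edges:

# Equivalent dense adjacency via the built-in utility; squeeze(0) drops the batch dimension.
from torch_geometric.utils import to_dense_adj

data.adj = to_dense_adj(data.edge_index, max_num_nodes=data.x.size(0)).squeeze(0)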
Example #4
def load_dataset(name):
    if name in ["Cora", "CiteSeer", "PubMed"]:
        dataset = Planetoid(root='./data/'+name, name=name)
    elif name == "CoraFull":
        dataset = CoraFull(root='./data/'+name)
    elif name in ["Computers", "Photo"]:
        dataset = Amazon(root='./data/'+name, name=name)
    elif name in ["CS", "Physics"]:
        dataset = Coauthor(root='./data/'+name, name=name)
    else:
        exit("wrong dataset")
    return dataset
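A short, hypothetical usage sketch for this loader; any of the dataset names handled above can be passed:

# Hypothetical call; inspects the single graph contained in the dataset.
dataset = load_dataset("Cora")
data = dataset[0]
print(dataset.num_classes, dataset.num_node_features, data.num_nodes, data.num_edges)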
Example #5
    def load_data(self):
        data_name = self._params['data_name']
        if self._params['net'] in {'combined', 'symmetric', 'asymmetric', 'combined_gcn'}:
            self._data_path = './data/{}'.format(data_name)
            gnx = nx.read_gpickle("./data/{}/gnx.pkl".format(data_name))
            bow = pickle.load(open("./data/{}/content.pkl".format(data_name), "rb"))
            nodes = sorted(gnx.nodes)
            node_idx = {node: i for i, node in enumerate(nodes)}  # avoid shadowing the built-in dict and the feature tensor x
            x = torch.Tensor(np.vstack([bow[node] for node in nodes])).to(self._device)
            y = torch.LongTensor([gnx.nodes[node]['label'] for node in nodes]).to(self._device)
            edges = torch.LongTensor(np.vstack([[node_idx[e[0]] for e in gnx.edges],
                                                [node_idx[e[1]] for e in gnx.edges]])).to(self._device)
            self._data = Data(x=x, edge_index=edges, y=y)
            self._num_features = x.shape[1]
            self._num_classes = len(gnx.graph['node_labels'])

            # Adjacency matrices
            adj = nx.adjacency_matrix(gnx, nodelist=nodes).astype(np.float32)
            if self._params['net'] == 'symmetric':
                self._adj = handle_matrix_symmetric(adj)
                self._adj = sparse_mx_to_torch_sparse_tensor(self._adj).to_dense().to(self._device)
            else:
                self._adj = handle_matrix_concat(adj, should_normalize=True)
                self._adj = sparse_mx_to_torch_sparse_tensor(self._adj).to_dense().to(self._device)

            return self._data

        data_transform = T.NormalizeFeatures() if self._params['norm'] else None
        self._data_path = './DataSets/{}'.format(data_name)
        if data_name == "CoraFull":
            self._data_set = CoraFull(self._data_path)
        elif data_name in {"CS", "Physics"}:
            self._data_set = Coauthor(self._data_path, data_name)
        else:
            self._data_set = Planetoid(self._data_path, data_name, data_transform)
        self._data_set.data.to(self._device)
        self._data = self._data_set[0]
        # self._data = self._data_set.data

        self._num_features = self._data_set.num_features
        self._num_classes = self._data_set.num_classes

        return self._data
Example #6
def load_pyg_dataset(dataset_name, root='dataset/'):
    from ogb.nodeproppred import PygNodePropPredDataset, Evaluator
    source, name = dataset_name.split('-', maxsplit=1)
    assert source in ['ogbn', 'pyg', 'custom']
    if source == 'ogbn':
        dataset = PygNodePropPredDataset(name=dataset_name, root=root)
        return dataset, dataset.get_idx_split(), Evaluator(dataset_name)
    elif source == 'pyg':
        from torch_geometric.datasets import KarateClub, CoraFull
        if name == "karate":
            dataset = KarateClub()
        elif name == "cora":
            dataset = CoraFull(root)
        else:
            raise Exception("Dataset not recognized")

        num_nodes = dataset[0].x.shape[0]
        num_train = int(num_nodes * 0.8)
        num_val = int(num_nodes * 0.1)

        perm = np.arange(num_nodes, dtype=int)
        np.random.shuffle(perm)
        split_idx = {
            'train': perm[:num_train],
            'valid': perm[num_train:num_train + num_val],
            'test': perm[num_train + num_val:]
        }
        return dataset, split_idx, Evaluator('ogbn-arxiv')
    elif source == "custom":
        from dataset import registry
        dataset = registry[name]()
        split_idx = {
            'train': dataset[0].idx_train,
            'valid': dataset[0].idx_val,
            'test': dataset[0].idx_test
        }
        return dataset, split_idx, CustomEvaluator()

    else:
        raise Exception("Dataset not recognized")
Example #7
def load_data(dataset_name):
    """
    Loads required data set and normalizes features.
    Implemented data sets are any of type Planetoid and Reddit.
    :param dataset_name: Name of data set
    :return: Tuple of dataset and extracted graph
    """
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data',
                    dataset_name)

    if dataset_name == 'cora_full':
        dataset = CoraFull(path, T.NormalizeFeatures())
    elif dataset_name.lower() == 'coauthor':
        dataset = Coauthor(path, 'Physics', T.NormalizeFeatures())
    elif dataset_name.lower() == 'reddit':
        dataset = Reddit(path, T.NormalizeFeatures())
    elif dataset_name.lower() == 'amazon':
        dataset = Amazon(path)
    else:
        dataset = Planetoid(path, dataset_name, T.NormalizeFeatures())

    print(f"Loading data set {dataset_name} from: ", path)
    data = dataset[0]  # Extract graph
    return dataset, data
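A hypothetical usage sketch matching the docstring above; names such as 'Cora' fall through to the final Planetoid branch:

# Hypothetical call; returns the dataset object and its single graph.
dataset, data = load_data('Cora')
print(dataset.num_classes, data.num_nodes, data.num_edges)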
Example #8
results = {  # hypothetical name; the line opening this dict is truncated in the source
    'dataset': [],
    'run': [],
    'n_epochs': [],
    'method': [],
    'accuracy': [],
    'mean': [],
    'std_dev': []
}

for dataset_name in ['PubMed', 'Cora']:  # CoraFull omitted for this run
    # define data
    #dataset_name = 'Cora' # 'PubMed', 'CoraFull'
    path = osp.join(os.getcwd(), '..', 'data', dataset_name)

    if dataset_name == 'CoraFull':
        dataset = CoraFull(path, T.NormalizeFeatures())
    else:
        dataset = Planetoid(path, dataset_name, T.NormalizeFeatures())

    data = dataset[0]
    data.batch = None
    data.adj = to_dense_adj(data.edge_index)
    # data

    training_fraction = 0.05
    if dataset_name == 'CoraFull':
        data.test_mask = torch.empty(size=torch.Size([data.x.shape[0]]),
                                     dtype=torch.bool)
        data.val_mask = torch.empty(size=torch.Size([data.x.shape[0]]),
                                    dtype=torch.bool)
        data.train_mask = torch.empty(size=torch.Size([data.x.shape[0]]),
                                      dtype=torch.bool)
Example #9
pr = parser.parse_args()

label_ids = defaultdict(list)

if pr.net == 1:
    print("Data Cora")
    _data = Planetoid(root="./pcora", name="Cora")
elif pr.net == 2:
    print("Data CiteSeer")
    _data = Planetoid(root="./pciteseer", name="Citeseer")
elif pr.net == 3:
    print("Data Pubmed")
    _data = Planetoid(root="./ppubmed", name="Pubmed")
elif pr.net == 4:
    print("Data CoraFull")
    _data = CoraFull("./Corafull")
elif pr.net == 5:
    print("Data Coauthor CS")
    _data = Coauthor("./CS", "CS")
elif pr.net == 6:
    print("Data Coauthor Physics")
    _data = Coauthor("./Physics", "Physics")
elif pr.net == 7:
    print("Data Amazon Computer")
    _data = Amazon("./Computer", "Computers")
elif pr.net == 8:
    print("Data Amazon Photos")
    _data = Amazon("./Photo", "Photo")

#_data = Coauthor("./Physics","Physics")
#_data = Coauthor("./CS","CS")
Example #10
from torch_geometric.datasets import Planetoid, CoraFull

for dataset_name in ['Cora', 'PubMed', 'CoraFull']:
    print(dataset_name)

    if dataset_name == 'CoraFull':
        dataset = CoraFull(root='/tmp/CoraFull')
    elif dataset_name == 'PubMed':
        dataset = Planetoid(root='/tmp/PubMed', name=dataset_name)
    else:
        dataset = Planetoid(root='/tmp/Cora', name=dataset_name)

    print("num classes=", dataset.num_classes)

    data = dataset[0]
    print("num nodes=", data.num_nodes)

    print("num edges=", data.num_edges / 2)

    print("num features=", dataset.num_node_features)
Example #11
def TrainingNet(dataset, modelName, params, num_pre_epochs, num_epochs,
                NumCutoff, optimizerName, MonteSize, savepath):
    Batch_size = int(params[0])
    root = '/git/data/GraphData/' + dataset
    TestAccs = []
    for Monte_iter in range(MonteSize):
        # Data
        start_epoch = 0  # start from epoch 0 or last checkpoint epoch
        NewNetworkSizeAdjust = []
        WeightsDynamicsEvolution = []
        # model
        if dataset == 'Cora' or dataset == 'Citeseer' or dataset == 'Pubmed':
            datasetroot = Planetoid(root=root,
                                    name=dataset,
                                    transform=T.NormalizeFeatures()).shuffle()
            trainloader = DataListLoader(datasetroot,
                                         batch_size=Batch_size,
                                         shuffle=True)
            testloader = DataListLoader(datasetroot,
                                        batch_size=100,
                                        shuffle=False)  # also define a test loader; TrainPart uses it below
            [net,
             model_to_save] = ModelAndSave(dataset, modelName, datasetroot,
                                           params, num_epochs)
            criterion = nn.CrossEntropyLoss()

        elif dataset == "CoraFull":
            datasetroot = CoraFull(root=root,
                                   transform=T.NormalizeFeatures()).shuffle()
            trainloader = DataListLoader(datasetroot,
                                         batch_size=Batch_size,
                                         shuffle=True)
            testloader = DataListLoader(datasetroot,
                                        batch_size=100,
                                        shuffle=False)  # also define a test loader; TrainPart uses it below
            [net,
             model_to_save] = ModelAndSave(dataset, modelName, datasetroot,
                                           params, num_epochs)

        elif dataset == "Amazon":
            datasetroot = Amazon(root,
                                 "Photo",
                                 transform=None,
                                 pre_transform=None)
            trainloader = DataListLoader(datasetroot,
                                         batch_size=Batch_size,
                                         shuffle=True)
            testloader = DataListLoader(datasetroot,
                                        batch_size=100,
                                        shuffle=False)
            [net,
             model_to_save] = ModelAndSave(dataset, modelName, datasetroot,
                                           params, num_epochs)

        elif dataset == 'ENZYMES' or dataset == 'MUTAG':
            datasetroot = TUDataset(root, name=dataset, use_node_attr=True)
            Num = len(datasetroot) // 10
            global train_dataset, test_dataset
            train_dataset = datasetroot[:Num]
            test_dataset = datasetroot[Num:]
            trainloader = DataLoader(train_dataset, batch_size=Batch_size)
            testloader = DataLoader(test_dataset, batch_size=60)
            [net,
             model_to_save] = ModelAndSave(dataset, modelName, datasetroot,
                                           params, num_epochs)

        elif dataset == "PPI":
            train_dataset = PPI(root, split='train')
            test_dataset = PPI(root, split='test')
            trainloader = DataLoader(train_dataset,
                                     batch_size=Batch_size,
                                     shuffle=True)
            testloader = DataLoader(test_dataset, batch_size=1, shuffle=False)
            [net,
             model_to_save] = ModelAndSave(dataset, modelName, train_dataset,
                                           params, num_epochs)
            criterion = torch.nn.BCEWithLogitsLoss()

        elif dataset == "Reddit":
            datasetroot = Reddit(root)
            trainloader = DataListLoader(datasetroot,
                                         batch_size=1,
                                         shuffle=True)
            testloader = DataListLoader(datasetroot,
                                        batch_size=2,
                                        shuffle=False)
            [net,
             model_to_save] = ModelAndSave(dataset, modelName, datasetroot,
                                           params, num_epochs)
            criterion = torch.nn.BCEWithLogitsLoss()

        elif dataset == 'MNIST':
            datasetroot = MNISTSuperpixels(root=root, transform=T.Cartesian())
            trainloader = DataListLoader(datasetroot,
                                         batch_size=Batch_size,
                                         shuffle=True)
            testloader = DataListLoader(datasetroot,
                                        batch_size=100,
                                        shuffle=False)
            [net,
             model_to_save] = ModelAndSave(dataset, modelName, datasetroot,
                                           params, num_epochs)

        elif dataset == 'CIFAR10':
            pass
        else:
            raise Exception("Input wrong datatset!!")

        FileName = "{}-{}-param_{}_{}_{}_{}-monte_{}".format(
            dataset, modelName, params[0], params[1], params[2], params[3],
            Monte_iter)

        print('Let\'s use', torch.cuda.device_count(), 'GPUs!')
        global device
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        optimizer = optim.Adam(net.parameters(),
                               lr=params[3],
                               betas=(0.9, 0.999),
                               eps=1e-08,
                               weight_decay=0,
                               amsgrad=False)
        criterion = nn.CrossEntropyLoss()
        net = net.to(device)

        #cudnn.benchmark = True
        logging(
            'dataset:{}, Batch size: {}, Number of layers:{} ConCoeff: {}, LR:{}, MonteSize:{}'
            .format(dataset, params[0], params[1], params[2], params[3],
                    Monte_iter))
        mark = "{}/{}Convergence/DiagElement-{}".format(
            savepath, dataset, FileName)

        PreTrainConvergence, PreTestConvergence, PreTestAcc = TrainPart(
            modelName, datasetroot, start_epoch, num_pre_epochs, trainloader,
            testloader, net, optimizer, criterion, NumCutoff, mark, False,
            model_to_save)
        print(
            'dataset: {}, model name: {}, Number epoches: {},  Pre-train error is: {}, Pre-test error is: {}, test acc is {}'
            .format(dataset, modelName, num_pre_epochs,
                    PreTrainConvergence[-1], PreTestConvergence[-1],
                    PreTestAcc[-1]))

        NewNetworksize, NewNetworkWeight = RetainNetworkSize(net,
                                                             params[2])[0:2]
        NetworkInfo = [NewNetworksize[0:-1], NewNetworkWeight]
        OptimizedNet = ChooseModel(modelName, datasetroot, NetworkInfo)
        NewNetworksize.insert(0, datasetroot.num_features)
        NewNetworkSizeAdjust.append(NewNetworksize[0:-1])
        print(NewNetworkSizeAdjust)

        #OptimizedNet.apply(init_weights)
        #OptimizedNet = DataParallel(OptimizedNet)
        OptimizedNet = OptimizedNet.to(device)
        cudnn.benchmark = True
        criterionNew = nn.CrossEntropyLoss()
        if optimizerName == "SGD":
            optimizerNew = getattr(optim,
                                   optimizerName)(OptimizedNet.parameters(),
                                                  lr=params[3],
                                                  momentum=0.9,
                                                  weight_decay=5e-4)
        elif optimizerName == "Adam":
            optimizerNew = getattr(optim,
                                   optimizerName)(OptimizedNet.parameters(),
                                                  lr=params[3],
                                                  betas=(0.9, 0.999),
                                                  eps=1e-08,
                                                  weight_decay=5e-4,
                                                  amsgrad=False)

        TrainConvergence, TestConvergence, TestAcc = TrainPart(
            modelName, datasetroot, start_epoch, num_epochs, trainloader,
            testloader, OptimizedNet, optimizerNew, criterionNew, NumCutoff,
            mark, True, model_to_save)
        np.save(
            "{}/{}Convergence/TrainConvergence-{}".format(
                savepath, dataset, FileName), TrainConvergence)
        np.save(
            "{}/{}Convergence/TestConvergence-{}".format(
                savepath, dataset, FileName), TestConvergence)

        np.save(
            "{}/{}Convergence/NewNetworkSizeAdjust-{}".format(
                savepath, dataset, FileName), NewNetworkSizeAdjust)

        #np.save(savepath+'TestConvergence-'+FileName,TestConvergence)
        #torch.cuda.empty_cache()

        print(
            'dataset: {}, model name:{}, resized network size is {},  Number epoches:{},  Train error is: {}, Test error is: {}, test acc is {}\n'
            .format(dataset, modelName, NewNetworksize[0:-1], num_epochs,
                    TrainConvergence[-1], TestConvergence[-1], TestAcc[-1]))
        TestAccs.append(TestAcc)
        np.save(
            "{}/{}Convergence/MeanTestAccs-{}".format(savepath, dataset,
                                                      FileName), TestAccs)
    print("The change of test error is:{}".format(TestAccs))
    print_nvidia_useage()
Example #12
def get_small_dataset(dataset_name,
                      normalize_attributes=False,
                      add_self_loops=False,
                      remove_isolated_nodes=False,
                      make_undirected=False,
                      graph_availability=None,
                      seed=0,
                      create_adjacency_lists=True):
    """
    Get the pytorch_geometric.data.Data object associated with the specified dataset name.
    :param dataset_name: str => One of the datasets mentioned below.
    :param normalize_attributes: Whether the attributes for each node should be normalized to sum to 1.
    :param add_self_loops: Add self loops to the input Graph.
    :param remove_isolated_nodes: Remove isolated nodes.
    :param make_undirected: Make the Graph undirected.
    :param graph_availability: Either inductive and transductive. If transductive, all the graph nodes are available
                               during training. Otherwise, only training split nodes are available.
    :param seed: The random seed to use while splitting into train/val/test splits.
    :param create_adjacency_lists: Whether to process and store adjacency lists that can be used for efficient
                                   r-radius neighborhood sampling.
    :return: A pytorch_geometric.data.Data object for that dataset.
    """
    assert dataset_name in {
        'amazon-computers', 'amazon-photo', 'citeseer', 'coauthor-cs',
        'coauthor-physics', 'cora', 'cora-full', 'ppi', 'pubmed', 'reddit'
    }
    assert graph_availability in {'inductive', 'transductive'}

    # Compose transforms that should be applied.
    transforms = []
    if normalize_attributes:
        transforms.append(NormalizeFeatures())
    if remove_isolated_nodes:
        transforms.append(RemoveIsolatedNodes())
    if add_self_loops:
        transforms.append(AddSelfLoops())
    transforms = Compose(transforms) if transforms else None

    # Load the specified dataset and apply transforms.
    root_dir = '/tmp/{dir}'.format(dir=dataset_name)
    processed_dir = os.path.join(root_dir, dataset_name, 'processed')
    # Remove any previously pre-processed data, so pytorch_geometric can pre-process it again.
    if os.path.exists(processed_dir) and os.path.isdir(processed_dir):
        shutil.rmtree(processed_dir)

    data = None

    def split_function(y):
        return _get_train_val_test_masks(y.shape[0], y, 0.2, 0.2, seed)

    if dataset_name in ['citeseer', 'cora', 'pubmed']:
        data = Planetoid(root=root_dir,
                         name=dataset_name,
                         pre_transform=transforms,
                         split='full').data
        if seed != 0:
            data.train_mask, data.val_mask, data.test_mask = split_function(
                data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'cora-full':
        data = CoraFull(root=root_dir, pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(
            data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'amazon-computers':
        data = Amazon(root=root_dir,
                      name='Computers',
                      pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(
            data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'amazon-photo':
        data = Amazon(root=root_dir, name='Photo',
                      pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(
            data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'coauthor-cs':
        data = Coauthor(root=root_dir, name='CS',
                        pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(
            data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'coauthor-physics':
        data = Coauthor(root=root_dir,
                        name='Physics',
                        pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(
            data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'reddit':
        data = Reddit(root=root_dir, pre_transform=transforms).data
        if seed != 0:
            data.train_mask, data.val_mask, data.test_mask = split_function(
                data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'ppi':
        data = SimpleNamespace()
        data.graphs = []
        for split in ['train', 'val', 'test']:
            split_data = PPI(root=root_dir,
                             split=split,
                             pre_transform=transforms)
            x_idxs = split_data.slices['x'].numpy()
            edge_idxs = split_data.slices['edge_index'].numpy()
            split_data = split_data.data
            for x_start, x_end, e_start, e_end in zip(x_idxs, x_idxs[1:],
                                                      edge_idxs,
                                                      edge_idxs[1:]):
                graph = Data(split_data.x[x_start:x_end],
                             split_data.edge_index[:, e_start:e_end],
                             y=split_data.y[x_start:x_end])
                graph.num_nodes = int(x_end - x_start)
                graph.split = split
                all_true = torch.ones(graph.num_nodes).bool()
                all_false = torch.zeros(graph.num_nodes).bool()
                graph.train_mask = all_true if split == 'train' else all_false
                graph.val_mask = all_true if split == 'val' else all_false
                graph.test_mask = all_true if split == 'test' else all_false
                data.graphs.append(graph)
        if seed != 0:
            temp_random = random.Random(seed)
            val_graphs = temp_random.sample(range(len(data.graphs)), 2)
            test_candidates = [
                graph_idx for graph_idx in range(len(data.graphs))
                if graph_idx not in val_graphs
            ]
            test_graphs = temp_random.sample(test_candidates, 2)
            for graph_idx, graph in enumerate(data.graphs):
                all_true = torch.ones(graph.num_nodes).bool()
                all_false = torch.zeros(graph.num_nodes).bool()
                graph.split = 'test' if graph_idx in test_graphs else 'val' if graph_idx in val_graphs else 'train'
                graph.train_mask = all_true if graph.split == 'train' else all_false
                graph.val_mask = all_true if graph.split == 'val' else all_false
                graph.test_mask = all_true if graph.split == 'test' else all_false

    if make_undirected:
        for graph in data.graphs:
            graph.edge_index = to_undirected(graph.edge_index, graph.num_nodes)

    LOG.info(f'Downloaded and transformed {len(data.graphs)} graph(s).')

    # Populate adjacency lists for efficient k-neighborhood sampling.
    # Only retain edges coming into a node and reverse the edges for the purpose of adjacency lists.
    LOG.info('Processing adjacency lists and degree information.')

    for graph in data.graphs:
        train_in_degrees = np.zeros(graph.num_nodes, dtype=np.int64)
        val_in_degrees = np.zeros(graph.num_nodes, dtype=np.int64)
        test_in_degrees = np.zeros(graph.num_nodes, dtype=np.int64)
        adjacency_lists = defaultdict(list)
        not_val_test_mask = (~graph.val_mask & ~graph.test_mask).numpy()
        val_mask = graph.val_mask.numpy()
        test_mask = graph.test_mask.numpy()

        if create_adjacency_lists:
            num_edges = graph.edge_index[0].shape[0]
            sources, dests = graph.edge_index[0].numpy(
            ), graph.edge_index[1].numpy()
            for source, dest in tqdm(zip(sources, dests),
                                     total=num_edges,
                                     leave=False):
                if not_val_test_mask[dest] and not_val_test_mask[source]:
                    train_in_degrees[dest] += 1
                    val_in_degrees[dest] += 1
                elif val_mask[dest] and not test_mask[source]:
                    val_in_degrees[dest] += 1
                test_in_degrees[dest] += 1
                adjacency_lists[dest].append(source)

        graph.adjacency_lists = dict(adjacency_lists)
        graph.train_in_degrees = torch.from_numpy(train_in_degrees).long()
        graph.val_in_degrees = torch.from_numpy(val_in_degrees).long()
        graph.test_in_degrees = torch.from_numpy(test_in_degrees).long()
        if graph_availability == 'transductive':
            graph.train_in_degrees = data.test_in_degrees
            graph.val_in_degrees = data.test_in_degrees

        graph.graph_availability = graph_availability

        # To accumulate any neighborhood perturbations to the graph.
        graph.perturbed_neighborhoods = defaultdict(set)
        graph.added_nodes = defaultdict(set)
        graph.modified_degrees = {}

        # For small datasets, cache the neighborhoods for all nodes for at least 3 different radii queries.
        graph.use_cache = True
        graph.neighborhood_cache = NeighborhoodCache(graph.num_nodes * 3)

        graph.train_mask_original = graph.train_mask
        graph.val_mask_original = graph.val_mask
        graph.test_mask_original = graph.test_mask

        graph.train_mask = torch.ones(
            graph.num_nodes).bool() & ~graph.val_mask & ~graph.test_mask

    return data
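A hedged usage sketch for `get_small_dataset`, assuming the helpers it references (`_get_train_val_test_masks`, `NeighborhoodCache`, `LOG`, and the transform imports) are available as in the original module:

# Hypothetical call; loads Cora transductively with normalized features and undirected edges.
data = get_small_dataset('cora',
                         normalize_attributes=True,
                         make_undirected=True,
                         graph_availability='transductive')
graph = data.graphs[0]
print(graph.num_nodes, int(graph.train_mask.sum()))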
Example #13
def TrainingNet(dataset,modelName,params,num_pre_epochs,num_epochs,NumCutoff,optimizerName,LinkPredictionMethod,MonteSize,savepath):
    Batch_size=params[0]
    VectorPairs=params[4]
    StartTopoCoeffi=params[5]
    WeightCorrectionCoeffi=params[6]
    interval=params[7]
    root='/git/data/GraphData/'+dataset
    TestAccs=[]
    
    for Monte_iter in range(MonteSize):
        # Data
        NewNetworkSizeAdjust=[]
        WeightsDynamicsEvolution=[]
        trainValRatio=[0.2,0.4]
        # model 
        if dataset=='Cora' or dataset =='Citeseer' or dataset =='Pubmed':
            datasetroot= Planetoid(root=root, name=dataset,transform =T.NormalizeFeatures()).shuffle()    
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size, shuffle=True)

            """            train_mask, val_mask,test_mask=DataSampler(trainValRatio,datasetroot.data.num_nodes)
            DataMask={}
            DataMask['train_mask']=train_mask
            DataMask['val_mask']=val_mask
            DataMask['test_mask']=test_mask
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size, shuffle=True)"""
            num_features=datasetroot.num_features
            num_classes=datasetroot.num_classes
            criterion = nn.CrossEntropyLoss()


        elif dataset =="CoraFull":
            datasetroot = CoraFull(root=root,transform =T.NormalizeFeatures()).shuffle()
            """train_mask, val_mask,test_mask=DataSampler(trainValRatio,datasetroot.data.num_nodes)
            DataMask={}
            DataMask['train_mask']=train_mask
            DataMask['val_mask']=val_mask
            DataMask['test_mask']=test_mask"""
            criterion = nn.CrossEntropyLoss()
            num_features=datasetroot.num_features
            num_classes=datasetroot.num_classes
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size, shuffle=False)


        elif dataset=='ENZYMES' or dataset=='MUTAG':
            datasetroot=TUDataset(root,name=dataset,use_node_attr=True)
            trainloader = DataLoader(datasetroot, batch_size=Batch_size, shuffle=True)
            num_features=datasetroot.num_features
            num_classes=datasetroot.num_classes     
            
            
        elif dataset =="PPI":
            train_dataset = PPI(root, split='train')
            val_dataset = PPI(root, split='val')
            test_dataset = PPI(root, split='test')
            trainloader = DataListLoader(train_dataset, batch_size=Batch_size, shuffle=True)
            valloader = DataListLoader(val_dataset, batch_size=100, shuffle=False)
            testloader = DataListLoader(test_dataset, batch_size=100, shuffle=False)
            num_classes=train_dataset.num_classes
            num_features=train_dataset.num_features
            criterion = torch.nn.BCEWithLogitsLoss()

        elif dataset =="Reddit":
            datasetroot=Reddit(root)   
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size, shuffle=True)
            testloader = DataListLoader(datasetroot, batch_size=2, shuffle=False)

        elif dataset=="Amazon":
            datasetroot=Amazon(root, "Photo", transform=None, pre_transform=None)
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size, shuffle=True)
            testloader = DataListLoader(datasetroot, batch_size=100, shuffle=False)

        elif dataset=='MNIST':
            datasetroot = MNISTSuperpixels(root=root, transform=T.Cartesian())
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size, shuffle=True)
            testloader = DataListLoader(datasetroot, batch_size=100, shuffle=False)

        elif dataset=='CIFAR10':
            pass
        else:
            raise Exception("Input wrong datatset!!")
        
        width=ContractionLayerCoefficients(num_features,*params[1:3])
        net =ChooseModel(modelName,num_features,num_classes,width)    
        FileName="{}-{}-param_{}_{}_{}_{}-monte_{}".format(dataset,modelName,interval,WeightCorrectionCoeffi,StartTopoCoeffi,VectorPairs,Monte_iter)
        print('Let\'s use', torch.cuda.device_count(), 'GPUs!')
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        criterion = criterion.to(device)
        net = DataParallel(net)
        net = net.to(device)
        optimizer = getattr(optim,optimizerName)(net.parameters(), lr=params[3], momentum=0.9, weight_decay=5e-4)


        model_to_save='./checkpoint/{}-{}-param_{}_{}_{}_{}-ckpt.pth'.format(dataset,modelName,params[0],params[1],params[5],params[4])
        if resume=="True" and os.path.exists(model_to_save):
            [net,optimizer,TrainConvergence,TestConvergence,Acc]=ResumeModel(net,optimizer,model_to_save)
            start_epoch=len(TrainConvergence)
        else:
            start_epoch = 0  # start from epoch 0 or last checkpoint epoch         

    
        #cudnn.benchmark = True
        logging('dataset:{}, Batch size: {}, Number of layers:{} ConCoeff: {}, LR:{}, MonteSize:{}'.format(
            dataset, params[0], params[1], params[2], params[3], Monte_iter))
        mark = "{}{}Convergence/DiagElement-{}".format(savepath, dataset, FileName)
        markweights = "{}{}Convergence/WeightChanges-{}".format(savepath, dataset, FileName)
                     
        PreTrainConvergence, PreTestConvergence, PreAcc = TrainPart(
            start_epoch, num_pre_epochs, num_classes, trainloader, net, optimizer, criterion,
            NumCutoff, LinkPredictionMethod, VectorPairs, WeightCorrectionCoeffi, StartTopoCoeffi,
            mark, markweights, model_to_save, False)
        print('dataset: {}, model name: {}, epoch: {}, pre-train error: {}; pre-test error: {}; test acc: {}'.format(
            dataset, modelName, num_pre_epochs, PreTrainConvergence[-1], PreTestConvergence[-1], PreAcc))

        NewNetworksize=RetainNetworkSize(net,params[2])
        OptimizedNet=ChooseModel(modelName,num_features,num_classes,NewNetworksize[0:-1])
        NewNetworksize.insert(0,num_features)
        NewNetworkSizeAdjust.append(NewNetworksize[0:-1])
        print(NewNetworkSizeAdjust)

        #OptimizedNet.apply(init_weights)

        OptimizedNet = DataParallel(OptimizedNet)
        OptimizedNet = OptimizedNet.to(device)
        cudnn.benchmark = True
        # Begin Pre training
        if optimizerName =="SGD":
            optimizerNew = getattr(optim,optimizerName)(OptimizedNet.parameters(), lr=params[3], momentum=0.9, weight_decay=5e-4)
        elif optimizerName =="Adam":
            optimizerNew = getattr(optim,optimizerName)(OptimizedNet.parameters(), lr=params[3], betas=(0.9, 0.999), eps=1e-08, weight_decay=5e-4, amsgrad=False)
        TrainConvergence, TestConvergence, TestAcc = TrainPart(
            start_epoch, num_epochs, datasetroot.num_classes, trainloader, OptimizedNet, optimizerNew, criterion,
            NumCutoff, LinkPredictionMethod, VectorPairs, WeightCorrectionCoeffi, StartTopoCoeffi,
            mark, markweights, model_to_save, True)
        np.save("{}/{}Convergence/AlgebraicConectivityTrainConvergence-{}".format(savepath,dataset,FileName),TrainConvergence)
        np.save("{}/{}Convergence/AlgebraicConectivityTestConvergence-{}".format(savepath,dataset,FileName),TestConvergence)

        #np.save("{}/{}Convergence/NewNetworkSizeAdjust-{}".format(savepath,dataset,FileName),NewNetworkSizeAdjust)

        #torch.cuda.empty_cache()
        print('dataset: {}, model name: {}, resized network size: {}, number of epochs: {}, train error: {}, test error: {}, test acc: {}\n'.format(
            dataset, modelName, NewNetworksize[0:-1], num_epochs, TrainConvergence[-1], TestConvergence[-1], TestAcc))
        TestAccs.append(TestAcc)  # append before saving; list.append returns None, so it must not be passed to np.save
        np.save("{}/{}Convergence/AlgebraicConectivityMeanTestAccs-{}".format(savepath, dataset, FileName), TestAccs)
        print_nvidia_useage()