コード例 #1
0
ファイル: loader.py プロジェクト: youngflyasd/GraphGym
def load_dataset():
    '''
    Load raw datasets.

    Dispatches on ``cfg.dataset.format``: registered custom loaders are
    tried first; otherwise the built-in PyG / networkx / OGB loaders run.

    :return: a list of networkx/deepsnap graphs; for OGB datasets a tuple
        ``(graphs, split_idx)`` so the official split can be reused.
    :raises ValueError: if the format (or the OGB dataset name) is unknown.
    '''
    format = cfg.dataset.format
    name = cfg.dataset.name
    # dataset_dir = '{}/{}'.format(cfg.dataset.dir, name)
    dataset_dir = cfg.dataset.dir
    # Try to load customized data format; first loader that recognizes the
    # (format, name) pair wins.
    for func in register.loader_dict.values():
        graphs = func(format, name, dataset_dir)
        if graphs is not None:
            return graphs
    # Load from Pytorch Geometric dataset
    if format == 'PyG':
        graphs = load_pyg(name, dataset_dir)
    # Load from networkx formatted data
    # todo: clean nx dataloader
    elif format == 'nx':
        graphs = load_nx(name, dataset_dir)
    # Load from OGB formatted data
    elif format == 'OGB':
        # Bug fix: previously `dataset` was only bound when the name was
        # 'ogbg-molhiv', so any other OGB name crashed with
        # UnboundLocalError at get_idx_split(); fail fast instead.
        if name != 'ogbg-molhiv':
            raise ValueError('Unsupported OGB dataset: {}'.format(name))
        dataset = PygGraphPropPredDataset(name=name)
        graphs = GraphDataset.pyg_to_graphs(dataset)
        # Note this is only used for custom splits from OGB
        split_idx = dataset.get_idx_split()
        return graphs, split_idx
    else:
        raise ValueError('Unknown data format: {}'.format(format))
    return graphs
コード例 #2
0
def get_molhiv():
    """Return the ogbg-molhiv train/valid/test subsets plus the max graph size."""
    root = osp.dirname(osp.realpath(__file__))
    dataset = PygGraphPropPredDataset(name='ogbg-molhiv', root=root)
    splits = dataset.get_idx_split()
    # Largest node count over all graphs (useful e.g. for dense batching).
    max_num_nodes = torch.tensor(dataset.data.num_nodes).max().item()
    train_set, valid_set, test_set = (
        dataset[splits[key]] for key in ("train", "valid", "test"))
    return train_set, valid_set, test_set, max_num_nodes
コード例 #3
0
ファイル: test.py プロジェクト: zkyzq/deep_gcns_torch
def main():
    """Evaluate a pretrained DeeperGCN checkpoint on all three official splits."""

    args = ArgsInit().args

    # Prefer the requested GPU; fall back to CPU when CUDA is unavailable.
    if args.use_gpu and torch.cuda.is_available():
        device = torch.device("cuda:" + str(args.device))
    else:
        device = torch.device('cpu')

    dataset = PygGraphPropPredDataset(name=args.dataset)
    args.num_tasks = dataset.num_tasks
    print(args)

    if args.feature == 'full':
        pass
    elif args.feature == 'simple':
        print('using simple feature')
        # only retain the top two node/edge features
        dataset.data.x = dataset.data.x[:, :2]
        dataset.data.edge_attr = dataset.data.edge_attr[:, :2]

    split_idx = dataset.get_idx_split()

    evaluator = Evaluator(args.dataset)

    def make_loader(split):
        # Pure evaluation run, so no split gets shuffled.
        return DataLoader(dataset[split_idx[split]],
                          batch_size=args.batch_size,
                          shuffle=False,
                          num_workers=args.num_workers)

    train_loader = make_loader("train")
    valid_loader = make_loader("valid")
    test_loader = make_loader("test")

    model = DeeperGCN(args)

    model.load_state_dict(torch.load(args.model_load_path)['model_state_dict'])
    model.to(device)

    metric = dataset.eval_metric
    train_result = eval(model, device, train_loader, evaluator)[metric]
    valid_result = eval(model, device, valid_loader, evaluator)[metric]
    test_result = eval(model, device, test_loader, evaluator)[metric]

    print({
        'Train': train_result,
        'Validation': valid_result,
        'Test': test_result
    })

    model.print_params(final=True)
コード例 #4
0
 def setup(self, stage: Optional[str] = None):
     """Load data. Set variables: self.data_train, self.data_val, self.data_test."""
     # Guard so repeated setup() calls (one per stage) don't reload the data.
     if self.data_train or self.data_val or self.data_test:
         return
     dataset = PygGraphPropPredDataset(name="ogbg-molpcba",
                                       root=self.data_dir,
                                       transform=self.transform)
     splits = dataset.get_idx_split()
     self.data_train = dataset[splits["train"]]
     self.data_val = dataset[splits["valid"]]
     self.data_test = dataset[splits["test"]]
コード例 #5
0
ファイル: data_prepare.py プロジェクト: wizard1203/BenchGNN
 def __init__(self, train):
     """Wrap the official ogbg-molhiv train or test split as a DNN dataset.

     :param train: expected to be a bool; truthy selects the train split,
         falsy the test split.
     """
     super(Mol_pred_DNN_dataset, self).__init__()
     self.train = train
     dataset_name = 'ogbg-molhiv'
     mol_origin_dataset = PygGraphPropPredDataset(name=dataset_name)
     split_idx = mol_origin_dataset.get_idx_split()
     # Idiom fix: `== True` replaced by a plain truthiness test; the unused
     # Evaluator local was dropped (it was never referenced).
     if self.train:
         self.mol_origin_dataset = mol_origin_dataset[split_idx["train"]]
     else:
         self.mol_origin_dataset = mol_origin_dataset[split_idx["test"]]
コード例 #6
0
ファイル: data_prepare.py プロジェクト: wizard1203/BenchGNN
def mol_pred_GNN_prepare(batch_size=50):
    """Build DataLoaders over the official ogbg-molhiv splits.

    :param batch_size: mini-batch size for all three loaders.
    :return: ``(train_loader, test_loader)`` -- see the NOTE below about the
        validation loader.
    """
    dataset_name = 'ogbg-molhiv'

    dataset = PygGraphPropPredDataset(name=dataset_name)
    # NOTE(review): `evaluator` is created but never used in this function --
    # confirm whether it was meant to be returned before removing it.
    evaluator = Evaluator(name=dataset_name)

    split_idx = dataset.get_idx_split()
    # Only the training loader shuffles.
    train_loader = DataLoader(dataset[split_idx["train"]], batch_size=batch_size, shuffle=True)
    # NOTE(review): `valid_loader` is constructed but NOT returned below --
    # this looks like a latent bug (callers only get train/test); verify intent.
    valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(dataset[split_idx["test"]], batch_size=batch_size, shuffle=False)
    return train_loader, test_loader
コード例 #7
0
ファイル: test.py プロジェクト: zkyzq/deep_gcns_torch
def main():
    """Evaluate a pretrained DeeperGCN checkpoint (node-feature-extraction variant)."""

    args = ArgsInit().args

    # Prefer the requested GPU; fall back to CPU when CUDA is unavailable.
    if args.use_gpu and torch.cuda.is_available():
        device = torch.device("cuda:" + str(args.device))
    else:
        device = torch.device('cpu')

    # Either pad graphs with all-zero node features or derive node features
    # by aggregating edge features, depending on the CLI flag.
    if args.not_extract_node_feature:
        transform = add_zeros
    else:
        transform = partial(extract_node_feature, reduce=args.aggr)
    dataset = PygGraphPropPredDataset(name=args.dataset, transform=transform)

    args.num_tasks = dataset.num_classes
    evaluator = Evaluator(args.dataset)

    split_idx = dataset.get_idx_split()

    # Pure evaluation run, so no split gets shuffled.
    loaders = {
        split: DataLoader(dataset[split_idx[split]],
                          batch_size=args.batch_size,
                          shuffle=False,
                          num_workers=args.num_workers)
        for split in ("train", "valid", "test")
    }

    print(args)

    model = DeeperGCN(args)
    model.load_state_dict(torch.load(args.model_load_path)['model_state_dict'])
    model.to(device)

    train_accuracy = eval(model, device, loaders["train"], evaluator)
    valid_accuracy = eval(model, device, loaders["valid"], evaluator)
    test_accuracy = eval(model, device, loaders["test"], evaluator)

    print({
        'Train': train_accuracy,
        'Validation': valid_accuracy,
        'Test': test_accuracy
    })
    model.print_params(final=True)
コード例 #8
0
ファイル: utils.py プロジェクト: jingmouren/egc
def mol_data(root, dataset, batch_size=32, num_workers=4):
    """Return a dict of train/valid/test DataLoaders for ogbg-mol<dataset>."""
    ogb_dataset = PygGraphPropPredDataset(name=f"ogbg-mol{dataset}", root=root)
    split_idx = ogb_dataset.get_idx_split()
    # Only the training split is shuffled.
    return {
        split: DataLoader(
            ogb_dataset[split_idx[split]],
            batch_size=batch_size,
            shuffle=split == "train",
            num_workers=num_workers,
        )
        for split in ("train", "valid", "test")
    }
コード例 #9
0
    def train_dataloader(self):
        """Build (and cache on self) the loader for the ogbg-molhiv train split."""
        dataset = PygGraphPropPredDataset(name='ogbg-molhiv')
        train_indices = dataset.get_idx_split()["train"]
        train_data = dataset[train_indices]
        train_loader = DataLoader(
            train_data,
            batch_size=self.configuration["batch_size"],
            shuffle=True,  # training batches are reshuffled each epoch
            num_workers=self.configuration["num_workers"],
        )

        # Cache both objects so other hooks can inspect them later.
        self._train_data = train_data
        self._train_loader = train_loader

        return train_loader
コード例 #10
0
    def val_dataloader(self):
        """Build (and cache on self) the loader for the ogbg-molhiv valid split."""
        dataset = PygGraphPropPredDataset(name='ogbg-molhiv')
        valid_indices = dataset.get_idx_split()["valid"]
        val_data = dataset[valid_indices]
        validation_loader = DataLoader(
            val_data,
            batch_size=self.configuration["batch_size"],
            shuffle=False,  # evaluation order must stay fixed
            num_workers=self.configuration["num_workers"],
        )

        # Cache both objects so other hooks can inspect them later.
        self._validation_data = val_data
        self._validation_loader = validation_loader

        return validation_loader
コード例 #11
0
    def __init__(self,
                 version=None,
                 root_dir='data',
                 download=False,
                 split_scheme='official'):
        """Wrap the OGB-MolPCBA dataset behind the WILDS dataset interface.

        :param version: must be None; versioning is delegated to the OGB package.
        :param root_dir: directory the OGB data is downloaded to / read from.
        :param download: forwarded to the superclass initializer.
        :param split_scheme: 'official' maps to OGB's scaffold split.
        :raises ValueError: if an explicit version is requested.
        """
        self._version = version
        if version is not None:
            raise ValueError(
                'Versioning for OGB-MolPCBA is handled through the OGB package. Please set version=none.'
            )
        # internally call ogb package
        self.ogb_dataset = PygGraphPropPredDataset(name='ogbg-molpcba',
                                                   root=root_dir)

        # set variables
        self._data_dir = self.ogb_dataset.root
        if split_scheme == 'official':
            split_scheme = 'scaffold'
        self._split_scheme = split_scheme
        self._y_type = 'float'  # although the task is binary classification, the prediction target contains nan value, thus we need float
        self._y_size = self.ogb_dataset.num_tasks
        self._n_classes = self.ogb_dataset.__num_classes__

        # 0/1/2 code per graph marking train/valid/test membership.
        self._split_array = torch.zeros(len(self.ogb_dataset)).long()
        split_idx = self.ogb_dataset.get_idx_split()
        self._split_array[split_idx['train']] = 0
        self._split_array[split_idx['valid']] = 1
        self._split_array[split_idx['test']] = 2

        self._y_array = self.ogb_dataset.data.y

        self._metadata_fields = ['scaffold']

        metadata_file_path = os.path.join(self.ogb_dataset.root, 'raw',
                                          'scaffold_group.npy')
        if not os.path.exists(metadata_file_path):
            download_url(
                'https://snap.stanford.edu/ogb/data/misc/ogbg_molpcba/scaffold_group.npy',
                os.path.join(self.ogb_dataset.root, 'raw'))
        self._metadata_array = torch.from_numpy(
            np.load(metadata_file_path)).reshape(-1, 1).long()

        # Bug fix: the previous lexicographic string comparison
        # (__version__ >= '1.7.0') misclassifies e.g. '1.10.0' as older than
        # '1.7.0'. Compare numeric (major, minor) components instead.
        version_parts = []
        for part in torch_geometric.__version__.split('.')[:2]:
            digits = ''.join(ch for ch in part if ch.isdigit())
            version_parts.append(int(digits) if digits else 0)
        if tuple(version_parts) >= (1, 7):
            self._collate = PyGCollater(follow_batch=[], exclude_keys=[])
        else:
            self._collate = PyGCollater(follow_batch=[])

        self._metric = Evaluator('ogbg-molpcba')

        super().__init__(root_dir, download, split_scheme)
コード例 #12
0
ファイル: ogb_utils.py プロジェクト: toenshoff/CRaWl
def load_graphs(ogb_name):
    """Load an OGB graph dataset and materialize its three official splits.

    Training graphs without any edges are dropped.

    :return: (out_dim, train_graphs, valid_graphs, test_graphs)
    """

    dataset = PygGraphPropPredDataset(ogb_name, root='data', transform=preproc)
    # Width of the prediction target, read off the first graph.
    out_dim = dataset[0].y.shape[1]

    split_idx = dataset.get_idx_split()

    print("Preprocessing Graphs...")
    train_graphs = [
        g for g in tqdm(dataset[split_idx["train"]]) if g.num_edges > 0
    ]
    valid_graphs = list(dataset[split_idx["valid"]])
    test_graphs = list(dataset[split_idx["test"]])

    return out_dim, train_graphs, valid_graphs, test_graphs
コード例 #13
0
ファイル: ogb.py プロジェクト: Frozenmad/AutoGL
 def __init__(self, path):
     """Ensure ogbg-molpcba exists under *path* and register metric/loss."""
     name = "ogbg-molpcba"
     # Instantiation alone downloads/prepares the raw data on disk before
     # the superclass loads it.
     PygGraphPropPredDataset(name=name, root=path)
     super(OGBGmolpcbaDataset, self).__init__(name, path)
     # Class-level metadata consumed by the surrounding framework.
     setattr(OGBGmolpcbaDataset, "metric", "AP")
     setattr(OGBGmolpcbaDataset, "loss", "binary_cross_entropy_with_logits")
コード例 #14
0
ファイル: ogb.py プロジェクト: zhuyawen/AutoGL
 def __init__(self, path):
     """Ensure ogbg-molhiv exists under *path* and register metric/loss."""
     name = "ogbg-molhiv"
     # Instantiation alone downloads/prepares the raw data on disk before
     # the superclass loads it.
     PygGraphPropPredDataset(name=name, root=path)
     super(OGBGmolhivDataset, self).__init__(name, path)
     # Class-level metadata consumed by the surrounding framework.
     setattr(OGBGmolhivDataset, "metric", "ROC-AUC")
     setattr(OGBGmolhivDataset, "loss", "BCEWithLogitsLoss")
コード例 #15
0
 def __init__(self, args=None):
     """Locate ogbg-molpcba relative to this file, downloading it if absent."""
     name = "ogbg-molpcba"
     path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data",
                     name)
     # Only trigger the OGB download when the data directory is missing.
     if not osp.exists(path):
         PygGraphPropPredDataset(name, path)
     super(OGBMolpcbaDataset, self).__init__(path, name)
コード例 #16
0
ファイル: ogb.py プロジェクト: yunyoonaer/cogdl
 def get_loader(self, args):
     """Return train/valid/test DataLoaders over this dataset's official split."""
     split_index = self.get_idx_split()
     data_root = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data")
     dataset = PygGraphPropPredDataset(self.name, data_root)
     # Only the training loader shuffles.
     loaders = [
         DataLoader(dataset[split_index[split]],
                    batch_size=args.batch_size,
                    shuffle=shuffle)
         for split, shuffle in (("train", True), ("valid", False),
                                ("test", False))
     ]
     return loaders[0], loaders[1], loaders[2]
コード例 #17
0
ファイル: ogb.py プロジェクト: Frozenmad/AutoGL
 def __init__(self, path):
     """Ensure ogbg-code exists under *path* and register metric/loss."""
     name = "ogbg-code"
     # Instantiation alone downloads/prepares the raw data on disk before
     # the superclass loads it.
     PygGraphPropPredDataset(name=name, root=path)
     super(OGBGcodeDataset, self).__init__(name, path)
     # Class-level metadata consumed by the surrounding framework.
     setattr(OGBGcodeDataset, "metric", "F1 score")
     setattr(OGBGcodeDataset, "loss", "cross_entropy")
コード例 #18
0
ファイル: ogb.py プロジェクト: zhuyawen/AutoGL
 def __init__(self, path):
     """Ensure ogbg-ppa exists under *path* and register metric/loss."""
     name = "ogbg-ppa"
     # Instantiation alone downloads/prepares the raw data on disk before
     # the superclass loads it.
     PygGraphPropPredDataset(name=name, root=path)
     super(OGBGppaDataset, self).__init__(name, path)
     # Class-level metadata consumed by the surrounding framework.
     setattr(OGBGppaDataset, "metric", "Accuracy")
     setattr(OGBGppaDataset, "loss", "CrossEntropyLoss")
コード例 #19
0
 def __init__(self):
     """Locate ogbg-code relative to this file, downloading it if absent."""
     name = "ogbg-code"
     path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data",
                     name)
     # Only trigger the OGB download when the data directory is missing.
     if not osp.exists(path):
         PygGraphPropPredDataset(name, path)
     super(OGBCodeDataset, self).__init__(path, name)
コード例 #20
0
    def load_data(self):
        """Load the OGB dataset named by ``self.args.data`` and publish its
        task type, feature width, task count and average graph size on args."""
        dataset = PygGraphPropPredDataset(name=self.args.data)
        self.args.task_type = dataset.task_type
        self.args.num_features = dataset.num_features
        # OGB's `num_tasks` doubles as the number of output classes downstream.
        self.args.num_classes = dataset.num_tasks
        self.args.avg_num_nodes = np.ceil(
            np.mean([graph.num_nodes for graph in dataset]))
        print('# %s: [Task]-%s [FEATURES]-%d [NUM_CLASSES]-%d [AVG_NODES]-%d' % (
            dataset, self.args.task_type, self.args.num_features,
            self.args.num_classes, self.args.avg_num_nodes))

        return dataset
コード例 #21
0
def load_ogb(name, dataset_dir):
    """Load an OGB dataset and attach its split information as attributes.

    The 'ogbn'/'ogbg'/'ogbl' prefix of *name* selects the node-, graph- or
    link-level loader respectively.

    :param name: OGB dataset name.
    :param dataset_dir: root directory for the downloaded data.
    :return: the PyG dataset with split masks/indices attached.
    :raises ValueError: for names outside the three OGB families.
    """
    if name[:4] == 'ogbn':
        dataset = PygNodePropPredDataset(name=name, root=dataset_dir)
        splits = dataset.get_idx_split()
        split_names = ['train_mask', 'val_mask', 'test_mask']
        for i, key in enumerate(splits.keys()):
            mask = index2mask(splits[key], size=dataset.data.y.shape[0])
            set_dataset_attr(dataset, split_names[i], mask, len(mask))
        # Node-level tasks operate on the undirected graph.
        edge_index = to_undirected(dataset.data.edge_index)
        set_dataset_attr(dataset, 'edge_index', edge_index,
                         edge_index.shape[1])

    elif name[:4] == 'ogbg':
        dataset = PygGraphPropPredDataset(name=name, root=dataset_dir)
        splits = dataset.get_idx_split()
        split_names = [
            'train_graph_index', 'val_graph_index', 'test_graph_index'
        ]
        for i, key in enumerate(splits.keys()):
            # Renamed from `id` to avoid shadowing the builtin.
            idx = splits[key]
            set_dataset_attr(dataset, split_names[i], idx, len(idx))

    elif name[:4] == "ogbl":
        dataset = PygLinkPropPredDataset(name=name, root=dataset_dir)
        splits = dataset.get_edge_split()

        pos_edges = splits['train']['edge'].T
        if cfg.dataset.resample_negative:
            set_dataset_attr(dataset, 'train_pos_edge_index', pos_edges,
                             pos_edges.shape[1])
            # todo: applying transform for negative sampling is very slow
            dataset.transform = neg_sampling_transform
        else:
            neg_edges = negative_sampling(edge_index=pos_edges,
                                          num_nodes=dataset.data.num_nodes[0],
                                          num_neg_samples=pos_edges.shape[1])
            all_edges = torch.cat([pos_edges, neg_edges], dim=-1)
            label = get_link_label(pos_edges, neg_edges)
            set_dataset_attr(dataset, 'train_edge_index', all_edges,
                             all_edges.shape[1])
            set_dataset_attr(dataset, 'train_edge_label', label, len(label))

        # Validation and test splits ship explicit negative edges.
        for split, prefix in (('valid', 'val'), ('test', 'test')):
            pos_edges = splits[split]['edge'].T
            neg_edges = splits[split]['edge_neg'].T
            all_edges = torch.cat([pos_edges, neg_edges], dim=-1)
            label = get_link_label(pos_edges, neg_edges)
            set_dataset_attr(dataset, '{}_edge_index'.format(prefix),
                             all_edges, all_edges.shape[1])
            set_dataset_attr(dataset, '{}_edge_label'.format(prefix), label,
                             len(label))

    else:
        # Bug fix: the '{}' placeholder was never formatted, so the
        # offending name did not appear in the message.
        raise ValueError('OGB dataset: {} non-exist'.format(name))

    return dataset
コード例 #22
0
    def __init__(self,
                 root_dir='data',
                 download=False,
                 split_scheme='official'):
        """Wrap the OGB-MolPCBA dataset behind the WILDS dataset interface.

        :param root_dir: directory the OGB data is downloaded to / read from.
        :param download: forwarded to the superclass initializer.
        :param split_scheme: 'official' maps to OGB's scaffold split.
        """
        # internally call ogb package
        self.ogb_dataset = PygGraphPropPredDataset(name='ogbg-molpcba',
                                                   root=root_dir)

        # set variables
        self._dataset_name = 'ogbg-molpcba'
        self._data_dir = self.ogb_dataset.root
        if split_scheme == 'official':
            split_scheme = 'scaffold'
        self._split_scheme = split_scheme
        self._y_type = 'float'  # although the task is binary classification, the prediction target contains nan value, thus we need float
        self._y_size = self.ogb_dataset.num_tasks
        self._n_classes = self.ogb_dataset.__num_classes__

        # 0/1/2 code per graph marking train/valid/test membership.
        self._split_array = torch.zeros(len(self.ogb_dataset)).long()
        split_idx = self.ogb_dataset.get_idx_split()
        self._split_array[split_idx['train']] = 0
        self._split_array[split_idx['valid']] = 1
        self._split_array[split_idx['test']] = 2

        self._y_array = self.ogb_dataset.data.y

        self._metadata_fields = ['scaffold']

        metadata_file_path = os.path.join(self.ogb_dataset.root, 'raw',
                                          'scaffold_group.npy')
        if not os.path.exists(metadata_file_path):
            # Bug fix: the URL was an empty string, so the scaffold metadata
            # could never actually be downloaded on first run.
            download_url(
                'https://snap.stanford.edu/ogb/data/misc/ogbg_molpcba/scaffold_group.npy',
                os.path.join(self.ogb_dataset.root, 'raw'))
        self._metadata_array = torch.from_numpy(
            np.load(metadata_file_path)).reshape(-1, 1).long()
        self._collate = PyGCollater(follow_batch=[])

        self._metric = Evaluator('ogbg-molpcba')

        super().__init__(root_dir, download, split_scheme)
コード例 #23
0
 def get_loader(self, args):
     """Return train/valid/test DataLoaders over this dataset's official split."""
     split_index = self.get_idx_split()
     dataset = PygGraphPropPredDataset(self.name,
                                       osp.join("data", self.name))
     # Only the training loader shuffles.
     loaders = {
         split: DataLoader(dataset[split_index[split]],
                           batch_size=args.batch_size,
                           shuffle=split == "train")
         for split in ("train", "valid", "test")
     }
     return loaders["train"], loaders["valid"], loaders["test"]
コード例 #24
0
ファイル: utils.py プロジェクト: jingmouren/egc
def code_data(
    root,
    batch_size=128,
    num_vocab=VOCAB_SIZE,
    seq_len=SEQ_LEN,
    use_old_code_dataset=False,
):
    """Return DataLoaders and the index->token mapping for ogbg-code(2)."""
    name = "ogbg-code" if use_old_code_dataset else "ogbg-code2"
    dataset = PygGraphPropPredDataset(name, root=root)
    split_idx = dataset.get_idx_split()
    # The vocabulary is derived from the training-split targets only.
    vocab2idx, idx2vocab = get_vocab_mapping(
        [dataset.data.y[i] for i in split_idx["train"]], num_vocab)
    # Augment edges and encode each target into a fixed-length index array.
    dataset.transform = transforms.Compose(
        [augment_edge, lambda data: encode_y_to_arr(data, vocab2idx, seq_len)])

    loaders = {
        split: DataLoader(
            dataset[split_idx[split]],
            batch_size=batch_size,
            shuffle=split == "train",
            num_workers=2,
        )
        for split in ("train", "valid", "test")
    }
    return loaders, idx2vocab
コード例 #25
0
def load_ogb_data(path, name, degree_as_tag):
    """Convert an OGB graph dataset into a list of namedtuple graphs.

    Splits and preprocessing follow https://github.com/snap-stanford/ogb.
    ``degree_as_tag`` is accepted for interface parity but not used here.
    """

    def add_zeros(data):
        # ogbg-ppa graphs ship without node features; label every node 0.
        data.x = torch.zeros(data.num_nodes, dtype=torch.long)
        return data

    transform = add_zeros if name == 'ogbg-ppa' else None
    print('Applying transform {} to dataset {}.'.format(transform, name))
    dataset = PygGraphPropPredDataset(name=name,
                                      root=path,
                                      transform=transform)
    Graph = namedtuple('Graph',
                       ['node_features', 'edge_mat', 'edge_features', 'label'])
    graph_list = [
        Graph(datum.x, datum.edge_index, datum.edge_attr, datum.y)
        for datum in dataset
    ]
    num_classes = dataset.num_classes if name == 'ogbg-ppa' else dataset.num_tasks
    return graph_list, num_classes
コード例 #26
0
def main():
    """Train a GNN baseline on ogbg-ppi and report the best-validation scores.

    Parses CLI arguments, builds loaders over the official OGB split, trains
    for ``--epochs`` epochs, and optionally saves the curves to ``--filename``.
    """
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on ogbg-ppi data with Pytorch Geometrics')
    parser.add_argument('--device',
                        type=int,
                        default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument(
        '--gnn',
        type=str,
        default='gin-virtual',
        help=
        'GNN gin, gin-virtual, or gcn, or gcn-virtual (default: gin-virtual)')
    parser.add_argument('--drop_ratio',
                        type=float,
                        default=0.5,
                        help='dropout ratio (default: 0.5)')
    parser.add_argument(
        '--num_layer',
        type=int,
        default=5,
        help='number of GNN message passing layers (default: 5)')
    parser.add_argument(
        '--emb_dim',
        type=int,
        default=300,
        help='dimensionality of hidden units in GNNs (default: 300)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='input batch size for training (default: 32)')
    parser.add_argument('--epochs',
                        type=int,
                        default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--num_workers',
                        type=int,
                        default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--dataset',
                        type=str,
                        default="ogbg-ppi",
                        help='dataset name (default: ogbg-ppi)')

    parser.add_argument('--filename',
                        type=str,
                        default="",
                        help='filename to output result (default: )')
    args = parser.parse_args()

    device = torch.device(
        "cuda:" +
        str(args.device)) if torch.cuda.is_available() else torch.device("cpu")

    ### automatic dataloading and splitting

    dataset = PygGraphPropPredDataset(name=args.dataset, transform=add_zeros)

    splitted_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[splitted_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[splitted_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[splitted_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    # Deduplicated model construction: the four CLI choices differ only in
    # the convolution type and the virtual-node flag.
    gnn_variants = {
        'gin': ('gin', False),
        'gin-virtual': ('gin', True),
        'gcn': ('gcn', False),
        'gcn-virtual': ('gcn', True),
    }
    if args.gnn not in gnn_variants:
        raise ValueError('Invalid GNN type')
    gnn_type, virtual_node = gnn_variants[args.gnn]
    model = GNN(gnn_type=gnn_type,
                num_class=37,  # hard-coded for ogbg-ppi -- TODO confirm
                emb_dim=args.emb_dim,
                drop_ratio=args.drop_ratio,
                virtual_node=virtual_node).to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    valid_curve = []
    test_curve = []
    train_curve = []

    for epoch in range(1, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        print('Training...')
        train(model, device, train_loader, optimizer)

        print('Evaluating...')
        train_perf = eval(model, device, train_loader, evaluator)
        valid_perf = eval(model, device, valid_loader, evaluator)
        test_perf = eval(model, device, test_loader, evaluator)

        print({
            'Train': train_perf,
            'Validation': valid_perf,
            'Test': test_perf
        })

        train_curve.append(train_perf['acc'])
        valid_curve.append(valid_perf['acc'])
        test_curve.append(test_perf['acc'])

    # Model selection: report the test score at the best-validation epoch.
    best_val_epoch = np.argmax(np.array(valid_curve))
    best_train = max(train_curve)

    print('Finished training!')
    print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
    print('Test score: {}'.format(test_curve[best_val_epoch]))

    if not args.filename == '':
        torch.save(
            {
                'Val': valid_curve[best_val_epoch],
                'Test': test_curve[best_val_epoch],
                'Train': train_curve[best_val_epoch],
                'BestTrain': best_train
            }, args.filename)
コード例 #27
0
 def prepare_data(self):
     """Download data if needed. This method is called only from a single GPU.
     Do not use it to assign state (self.x = y). Pretransform is applied before saving dataset on disk."""
     # Instantiating the dataset is sufficient: the constructor downloads
     # and preprocesses ogbg-molpcba to disk; the object is discarded.
     PygGraphPropPredDataset(
         name="ogbg-molpcba",
         root=self.data_dir,
         pre_transform=self.pre_transform,
     )
コード例 #28
0
from ogb.graphproppred import PygGraphPropPredDataset
import os

# Export the official OGB splits of several molecule datasets as plain-text
# index files using the naming scheme expected by the 10-fold loaders.
root_folder = '/vol/deform/gbouritsas/datasets/'

datasets = ['ogbg-molpcba', 'ogbg-molhiv', 'ogbg-ppa']

for name in datasets:
    dataset = PygGraphPropPredDataset(name=name,
                                      root=os.path.join(
                                          root_folder, 'ogb',
                                          '{}'.format(name)))
    split_idx = dataset.get_idx_split()
    # Iterate a fixed list (the original used a set) so the files are
    # always produced in a deterministic order.
    for split_name in ['train', 'valid', 'test']:
        idxs = split_idx[split_name]
        # Bug fix: `is not 'valid'` compared string *identity*, which is
        # unreliable (and a SyntaxWarning on Python 3.8+); use != instead.
        split_name = split_name if split_name != 'valid' else 'val'
        save_folder = os.path.join(root_folder, 'ogb', '{}'.format(name),
                                   '10fold_idx')
        if not os.path.exists(save_folder):
            os.makedirs(save_folder)
        with open(os.path.join(save_folder, '{}_idx-0.txt'.format(split_name)),
                  'w') as handle:
            for idx in idxs:
                handle.write('{}\n'.format(idx))
コード例 #29
0
from ogb.graphproppred.mol_encoder import AtomEncoder
from torch.nn import BatchNorm1d, Linear, ReLU, Sequential
from torch.optim.lr_scheduler import ReduceLROnPlateau

import torch_geometric.transforms as T
from torch_geometric.loader import DataLoader
from torch_geometric.nn import EGConv, global_mean_pool

# Script-level setup: a single CLI flag toggles between the single- and
# multi-aggregator EGC variants.
parser = argparse.ArgumentParser()
parser.add_argument('--use_multi_aggregators',
                    action='store_true',
                    help='Switch between EGC-S and EGC-M')
args = parser.parse_args()

# Dataset lives under <repo>/data/OGB; graphs are converted to sparse
# adjacency tensors once at preprocessing time.
path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'OGB')
dataset = OGBG('ogbg-molhiv', path, pre_transform=T.ToSparseTensor())
evaluator = Evaluator('ogbg-molhiv')

# Official scaffold split from OGB.
split_idx = dataset.get_idx_split()
train_dataset = dataset[split_idx['train']]
val_dataset = dataset[split_idx['valid']]
test_dataset = dataset[split_idx['test']]

# Only training batches are shuffled; evaluation uses larger batches.
train_loader = DataLoader(train_dataset,
                          batch_size=32,
                          num_workers=4,
                          shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256)
test_loader = DataLoader(test_dataset, batch_size=256)


コード例 #30
0
def main():
    """Train DeeperGCN with FLAG augmentation on an OGB graph dataset.

    Builds loaders over the official split, trains for ``args.epochs``
    epochs, tracks the best validation score (and the train/test scores at
    that epoch), and logs the final results plus total wall-clock time.
    """

    args = ArgsInit().save_exp()

    # Prefer the requested GPU when available, otherwise fall back to CPU.
    if args.use_gpu:
        device = torch.device("cuda:" +
                              str(args.device)) if torch.cuda.is_available(
                              ) else torch.device("cpu")
    else:
        device = torch.device('cpu')

    # Run tag encoding batch size and feature mode; only consumed by the
    # commented-out checkpointing call below.
    sub_dir = 'BS_{}-NF_{}'.format(args.batch_size, args.feature)

    dataset = PygGraphPropPredDataset(name=args.dataset)
    args.num_tasks = dataset.num_tasks
    logging.info('%s' % args)

    if args.feature == 'full':
        pass
    elif args.feature == 'simple':
        print('using simple feature')
        # only retain the top two node/edge features
        dataset.data.x = dataset.data.x[:, :2]
        dataset.data.edge_attr = dataset.data.edge_attr[:, :2]

    evaluator = Evaluator(args.dataset)
    split_idx = dataset.get_idx_split()

    # Only the training loader shuffles; evaluation order stays fixed.
    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    model = DeeperGCN(args).to(device)

    logging.info(model)

    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # Best-so-far bookkeeping; 'final_*' hold the scores at the epoch with
    # the highest validation result.
    results = {
        'highest_valid': 0,
        'final_train': 0,
        'final_test': 0,
        'highest_train': 0
    }

    start_time = time.time()

    for epoch in range(1, args.epochs + 1):
        logging.info("=====Epoch {}".format(epoch))
        logging.info('Training...')

        # NOTE(review): the FLAG adversarial-training variant is used; the
        # plain train() call is kept for reference.
        # epoch_loss = train(model, device, train_loader, optimizer, dataset.task_type)
        epoch_loss = train_flag(model, device, train_loader, optimizer,
                                dataset.task_type, args)

        logging.info('Evaluating...')
        train_result = eval(model, device, train_loader,
                            evaluator)[dataset.eval_metric]
        valid_result = eval(model, device, valid_loader,
                            evaluator)[dataset.eval_metric]
        test_result = eval(model, device, test_loader,
                           evaluator)[dataset.eval_metric]

        logging.info({
            'Train': train_result,
            'Validation': valid_result,
            'Test': test_result
        })

        model.print_params(epoch=epoch)

        if train_result > results['highest_train']:

            results['highest_train'] = train_result

        # Model selection on the validation metric.
        if valid_result > results['highest_valid']:
            results['highest_valid'] = valid_result
            results['final_train'] = train_result
            results['final_test'] = test_result

            # save_ckpt(model, optimizer,
            #           round(epoch_loss, 4), epoch,
            #           args.model_save_path,
            #           sub_dir, name_post='valid_best')

    logging.info("%s" % results)

    end_time = time.time()
    total_time = end_time - start_time
    logging.info('Total time: {}'.format(
        time.strftime('%H:%M:%S', time.gmtime(total_time))))