Example #1
    def __init__(self, args):

        super(Trainer, self).__init__()

        # Random Seed
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(args.seed)
            torch.cuda.manual_seed_all(args.seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        self.args = args
        self.exp_name = self.set_experiment_name()

        self.use_cuda = args.gpu >= 0 and torch.cuda.is_available()
        if self.use_cuda:
            torch.cuda.set_device(args.gpu)
            self.args.device = 'cuda:{}'.format(args.gpu)
        else:
            self.args.device = 'cpu'

        self.dataset = self.load_data()

        self.evaluator = Evaluator(args.data)
Example #2
def evaluate_network_sparse(model, device, data_loader, epoch):
    model.eval()
    epoch_test_loss = 0
    epoch_test_ROC = 0
    with torch.no_grad():
        list_scores = []
        list_labels = []
        for iter, (batch_graphs, batch_labels, batch_snorm_n,
                   batch_snorm_e) in enumerate(data_loader):
            batch_x = batch_graphs.ndata['feat'].to(device)
            batch_e = batch_graphs.edata['feat'].to(device)
            batch_snorm_e = batch_snorm_e.to(device)
            batch_snorm_n = batch_snorm_n.to(device)
            batch_labels = batch_labels.to(device)
            batch_graphs = batch_graphs.to(device)
            batch_scores = model.forward(batch_graphs, batch_x, batch_e,
                                         batch_snorm_n, batch_snorm_e)
            loss = model.loss(batch_scores, batch_labels)
            epoch_test_loss += loss.detach().item()
            list_scores.append(batch_scores.detach())
            list_labels.append(batch_labels.detach().unsqueeze(-1))

        epoch_test_loss /= (iter + 1)
        evaluator = Evaluator(name='ogbg-molhiv')
        epoch_test_ROC = evaluator.eval({
            'y_pred': torch.cat(list_scores),
            'y_true': torch.cat(list_labels)
        })['rocauc']

    return epoch_test_loss, epoch_test_ROC
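
All of these snippets feed the same OGB Evaluator contract. A minimal, self-contained sketch of that contract follows (the random tensors are placeholders, not real predictions):

import torch
from ogb.graphproppred import Evaluator

torch.manual_seed(0)
evaluator = Evaluator(name='ogbg-molhiv')
print(evaluator.expected_input_format)   # documents required keys and shapes
print(evaluator.expected_output_format)  # documents the returned metric key

# y_true / y_pred must be shaped (num_graphs, num_tasks); molhiv has 1 task.
result = evaluator.eval({
    'y_true': torch.randint(0, 2, (100, 1)),
    'y_pred': torch.randn(100, 1),
})
print(result['rocauc'])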
Example #3
def train_epoch_sparse(model, optimizer, device, data_loader, epoch):
    model.train()
    epoch_loss = 0
    epoch_train_ROC = 0
    list_scores = []
    list_labels = []
    for iter, (batch_graphs, batch_labels, batch_snorm_n,
               batch_snorm_e) in enumerate(data_loader):
        batch_x = batch_graphs.ndata['feat'].to(device)  # num x feat
        batch_e = batch_graphs.edata['feat'].to(device)
        batch_snorm_e = batch_snorm_e.to(device)
        batch_snorm_n = batch_snorm_n.to(device)
        batch_labels = batch_labels.to(device)
        batch_graphs = batch_graphs.to(device)
        optimizer.zero_grad()
        batch_scores = model.forward(batch_graphs, batch_x, batch_e,
                                     batch_snorm_n, batch_snorm_e)
        loss = model.loss(batch_scores, batch_labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.detach().item()
        list_scores.append(batch_scores.detach())
        list_labels.append(batch_labels.detach().unsqueeze(-1))

    epoch_loss /= (iter + 1)
    evaluator = Evaluator(name='ogbg-molhiv')
    epoch_train_ROC = evaluator.eval({
        'y_pred': torch.cat(list_scores),
        'y_true': torch.cat(list_labels)
    })['rocauc']

    return epoch_loss, epoch_train_ROC, optimizer
Example #4
    def __init__(self,
                 version=None,
                 root_dir='data',
                 download=False,
                 split_scheme='official'):
        self._version = version
        if version is not None:
            raise ValueError(
                'Versioning for OGB-MolPCBA is handled through the OGB package. Please set version=None.'
            )
        # internally call ogb package
        self.ogb_dataset = PygGraphPropPredDataset(name='ogbg-molpcba',
                                                   root=root_dir)

        # set variables
        self._data_dir = self.ogb_dataset.root
        if split_scheme == 'official':
            split_scheme = 'scaffold'
        self._split_scheme = split_scheme
        self._y_type = 'float'  # the task is binary classification, but the targets contain NaN for unlabeled entries, so float is required
        self._y_size = self.ogb_dataset.num_tasks
        self._n_classes = self.ogb_dataset.__num_classes__

        self._split_array = torch.zeros(len(self.ogb_dataset)).long()
        split_idx = self.ogb_dataset.get_idx_split()
        self._split_array[split_idx['train']] = 0
        self._split_array[split_idx['valid']] = 1
        self._split_array[split_idx['test']] = 2

        self._y_array = self.ogb_dataset.data.y

        self._metadata_fields = ['scaffold']

        metadata_file_path = os.path.join(self.ogb_dataset.root, 'raw',
                                          'scaffold_group.npy')
        if not os.path.exists(metadata_file_path):
            download_url(
                'https://snap.stanford.edu/ogb/data/misc/ogbg_molpcba/scaffold_group.npy',
                os.path.join(self.ogb_dataset.root, 'raw'))
        self._metadata_array = torch.from_numpy(
            np.load(metadata_file_path)).reshape(-1, 1).long()

        # NOTE: comparing version strings lexicographically is fragile
        # ('1.10.0' < '1.7.0'); packaging.version.parse would give a true
        # ordering here.
        if torch_geometric.__version__ >= '1.7.0':
            self._collate = PyGCollater(follow_batch=[], exclude_keys=[])
        else:
            self._collate = PyGCollater(follow_batch=[])

        self._metric = Evaluator('ogbg-molpcba')

        super().__init__(root_dir, download, split_scheme)
Example #5
def test(loader):
    model.eval()
    evaluator = Evaluator(name='ogbg-molhiv')
    list_pred = []
    list_labels = []
    with torch.no_grad():  # no autograd graph needed during evaluation
        for data in loader:
            data = data.to(device)
            out = model(data.x, data.edge_index, None, data.batch)
            list_pred.append(out)
            list_labels.append(data.y)
    epoch_test_ROC = evaluator.eval({
        'y_pred': torch.cat(list_pred),
        'y_true': torch.cat(list_labels)
    })['rocauc']
    return epoch_test_ROC
Example #6
def train_epoch_sparse(model, optimizer, device, data_loader, epoch,
                       distortion):
    model.train()
    epoch_loss = 0
    epoch_train_ROC = 0
    list_scores = []
    list_labels = []
    for iter, (batch_graphs, batch_labels, batch_snorm_n,
               batch_snorm_e) in enumerate(data_loader):
        batch_x = batch_graphs.ndata['feat'].to(device)  # num x feat
        batch_e = batch_graphs.edata['feat'].to(device)
        batch_snorm_e = batch_snorm_e.to(device)
        batch_snorm_n = batch_snorm_n.to(device)
        batch_labels = batch_labels.to(device)
        if distortion > 1e-7:
            # Perturb eigenvector features (columns 1 and 2 of 'eig') with
            # uniform noise in [-distortion, distortion], scaled by the mean
            # absolute value of each eigenvector; restored after the step.
            batch_graphs_eig = batch_graphs.ndata['eig'].clone()
            dist = (torch.rand(batch_x[:, 0].shape) - 0.5) * 2 * distortion
            batch_graphs.ndata['eig'][:, 1] = torch.mul(
                dist,
                torch.mean(torch.abs(batch_graphs_eig[:, 1]),
                           dim=-1,
                           keepdim=True)) + batch_graphs_eig[:, 1]
            batch_graphs.ndata['eig'][:, 2] = torch.mul(
                dist,
                torch.mean(torch.abs(batch_graphs_eig[:, 2]),
                           dim=-1,
                           keepdim=True)) + batch_graphs_eig[:, 2]

        optimizer.zero_grad()
        batch_scores = model.forward(batch_graphs, batch_x, batch_e,
                                     batch_snorm_n, batch_snorm_e)
        loss = model.loss(batch_scores, batch_labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.detach().item()
        list_scores.append(batch_scores.detach())
        list_labels.append(batch_labels.detach().unsqueeze(-1))
        if distortion > 1e-7:
            batch_graphs.ndata['eig'] = batch_graphs_eig.detach()

    epoch_loss /= (iter + 1)
    evaluator = Evaluator(name='ogbg-molhiv')
    epoch_train_ROC = evaluator.eval({
        'y_pred': torch.cat(list_scores),
        'y_true': torch.cat(list_labels)
    })['rocauc']

    return epoch_loss, epoch_train_ROC, optimizer
Example #7
def get_data(name, batch_size, rwr=False, cleaned=False):
    if name == 'ogbg-molhiv':
        data_train, data_val, data_test, max_num_nodes = get_molhiv()
        num_classes = 2
    elif name == 'ZINC':
        data_train, data_val, data_test = get_mod_zinc(rwr)
        max_num_nodes = 37
        num_classes = 1
    elif name == 'SMNIST':
        data_train, data_val, data_test = get_smnist(rwr)
        max_num_nodes = 75
        num_classes = 10
    else:
        data = get_tudataset(name, rwr, cleaned=cleaned)
        num_classes = data.num_classes
        max_num_nodes = 0
        for d in data:
            max_num_nodes = max(d.num_nodes, max_num_nodes)
        data_train, data_val, data_test = data_split(data)

    stats = dict()
    stats['num_features'] = data_train.num_node_features
    stats['num_classes'] = num_classes
    stats['max_num_nodes'] = max_num_nodes

    if name == 'ogbg-molhiv':
        evaluator, encode_edge = Evaluator(name), True
    else:
        evaluator, encode_edge = None, False

    train_loader = DataLoader(data_train, batch_size, shuffle=True)
    val_loader = DataLoader(data_val, batch_size, shuffle=False)
    test_loader = DataLoader(data_test, batch_size, shuffle=False)

    return train_loader, val_loader, test_loader, stats, evaluator, encode_edge
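
A hypothetical call to get_data() above, assuming the helper loaders (get_molhiv, get_mod_zinc, and friends) are importable from the same module; only 'ogbg-molhiv' yields an OGB Evaluator (encode_edge=True), all other names fall back to (None, False):

train_loader, val_loader, test_loader, stats, evaluator, encode_edge = \
    get_data('ogbg-molhiv', batch_size=32)
print(stats)  # {'num_features': ..., 'num_classes': 2, 'max_num_nodes': ...}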
Example #8
    def __init__(self, name, pos_enc_dim=0, norm='none', verbose=True):
        start = time.time()
        if verbose:
            print("[I] Loading dataset %s..." % (name))
        self.name = name
        self.dataset = DglGraphPropPredDataset(name='ogbg-molhiv')
        self.split_idx = self.dataset.get_idx_split()

        self.train = HIVDGL(self.dataset,
                            self.split_idx['train'],
                            norm=norm,
                            pos_enc_dim=pos_enc_dim)
        self.val = HIVDGL(self.dataset,
                          self.split_idx['valid'],
                          norm=norm,
                          pos_enc_dim=pos_enc_dim)
        self.test = HIVDGL(self.dataset,
                           self.split_idx['test'],
                           norm=norm,
                           pos_enc_dim=pos_enc_dim)

        self.evaluator = Evaluator(name='ogbg-molhiv')

        if verbose:
            print('train, test, val sizes :', len(self.train), len(self.test),
                  len(self.val))
            print("[I] Finished loading.")
            print("[I] Data load time: {:.4f}s".format(time.time() - start))
Example #9
    def __init__(self, criterion=torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor([30]))):
        super().__init__()

        # loading params
        with open('parameters.json') as json_file:
            parameters = json.load(json_file)
        self.configuration = parameters

        self.save_hyperparameters(
            dict(
                batch_size=parameters["batch_size"],
                lr=parameters["learning_rate"],
                weight_decay=parameters["weight_decay"],
                num_workers=parameters["num_workers"],
                criterion=criterion,
                epochs=parameters["epochs"],
            )
        )

        self._train_data = None
        self._collate_fn = None
        self._train_loader = None

        self.batch_size = self.configuration["batch_size"]
        self.num_workers = self.configuration["num_workers"]
        self.lr = self.configuration["learning_rate"]
        self.epochs = self.configuration["epochs"]
        self.weight_decay = self.configuration["weight_decay"]
        self.criterion = criterion

        self.evaluator = Evaluator(parameters["dataset_name"])
Example #10
    def __init__(self, name, pos_enc_dim=0, norm='none', verbose=True):
        start = time.time()
        if verbose:
            print("[I] Loading dataset %s..." % (name))
        self.name = name
        self.norm = norm
        dataset = DownloadPCBA(name='ogbg-molpcba')
        split_idx = dataset.get_idx_split()
        self.train = PCBADGL(dataset,
                             split_idx['train'],
                             norm=norm,
                             pos_enc_dim=pos_enc_dim)
        self.val = PCBADGL(dataset,
                           split_idx['valid'],
                           norm=norm,
                           pos_enc_dim=pos_enc_dim)
        self.test = PCBADGL(dataset,
                            split_idx['test'],
                            norm=norm,
                            pos_enc_dim=pos_enc_dim)
        del dataset
        del split_idx

        self.evaluator = Evaluator(name='ogbg-molpcba')

        if verbose:
            print('train, test, val sizes :', len(self.train), len(self.test),
                  len(self.val))
            print("[I] Finished loading.")
            print("[I] Data load time: {:.4f}s".format(time.time() - start))
Example #11
def task_data(args, dataset=None):

    # DATA_ROOT = '/mnt/localdata/users/shengjie/ogb_ws/data/dataset'
    # step 0: setting for gpu
    if args.gpu >= 0:
        torch.cuda.set_device(args.gpu)
    # step 1: prepare dataset
    if dataset is None:
        dataset = DglGraphPropPredDataset(name=args.dataset,
                                          root=args.data_dir)
    splitted_idx = dataset.get_idx_split()

    # step 2: prepare data_loader

    train_loader = DataLoader(dataset[splitted_idx['train']],
                              batch_size=args.batch_size,
                              shuffle=True,
                              collate_fn=collate_dgl,
                              num_workers=4)
    valid_loader = DataLoader(dataset[splitted_idx['valid']],
                              batch_size=args.batch_size,
                              shuffle=False,
                              collate_fn=collate_dgl,
                              num_workers=4)
    test_loader = DataLoader(dataset[splitted_idx['test']],
                             batch_size=args.batch_size,
                             shuffle=False,
                             collate_fn=collate_dgl,
                             num_workers=4)

    evaluator = Evaluator(args.dataset)

    return dataset, evaluator, train_loader, valid_loader, test_loader
Example #12
    def __init__(self,
                 architecture: str = "GCN",
                 num_node_features: int = 300,
                 activation: str = "prelu",
                 num_conv_layers: int = 3,
                 conv_size: int = 256,
                 pool_method: str = "add",
                 lin1_size: int = 128,
                 lin2_size: int = 64,
                 output_size: int = 128,
                 lr: float = 0.001,
                 weight_decay: float = 0,
                 **kwargs):
        super().__init__()

        # this line ensures params passed to LightningModule will be saved to ckpt
        # it also allows to access params with 'self.hparams' attribute
        self.save_hyperparameters(logger=False)

        # init node embedding layer
        self.atom_encoder = AtomEncoder(emb_dim=self.hparams.num_node_features)
        # self.bond_encoder = BondEncoder(emb_dim=self.hparams.edge_emb_size)

        # init network architecture
        if self.hparams.architecture == "GCN":
            self.model = gcn.GCN(hparams=self.hparams)
        elif self.hparams.architecture == "GAT":
            self.model = gat.GAT(hparams=self.hparams)
        elif self.hparams.architecture == "GraphSAGE":
            self.model = graph_sage.GraphSAGE(hparams=self.hparams)
        elif self.hparams.architecture == "GIN":
            self.model = gin.GIN(hparams=self.hparams)
        else:
            raise ValueError("Incorrect architecture name!")

        # loss function
        self.criterion = torch.nn.BCEWithLogitsLoss()

        # metric
        self.evaluator = Evaluator(name="ogbg-molpcba")

        self.metric_hist = {
            "train/ap": [],
            "val/ap": [],
            "train/loss": [],
            "val/loss": [],
        }
Example #13
def main():

    args = ArgsInit().args

    if args.use_gpu and torch.cuda.is_available():
        device = torch.device("cuda:" + str(args.device))
    else:
        device = torch.device("cpu")

    dataset = PygGraphPropPredDataset(name=args.dataset)
    args.num_tasks = dataset.num_tasks
    print(args)

    if args.feature == 'full':
        pass
    elif args.feature == 'simple':
        print('using simple feature')
        # only retain the top two node/edge features
        dataset.data.x = dataset.data.x[:, :2]
        dataset.data.edge_attr = dataset.data.edge_attr[:, :2]

    split_idx = dataset.get_idx_split()

    evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    model = DeeperGCN(args)

    model.load_state_dict(torch.load(args.model_load_path)['model_state_dict'])
    model.to(device)

    train_result = eval(model, device, train_loader,
                        evaluator)[dataset.eval_metric]
    valid_result = eval(model, device, valid_loader,
                        evaluator)[dataset.eval_metric]
    test_result = eval(model, device, test_loader,
                       evaluator)[dataset.eval_metric]

    print({
        'Train': train_result,
        'Validation': valid_result,
        'Test': test_result
    })

    model.print_params(final=True)
Example #14
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='GIN with PyTorch Geometric')
    parser.add_argument('--device',
                        type=int,
                        default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='input batch size for training (default: 32)')
    parser.add_argument('--epochs',
                        type=int,
                        default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--num_workers',
                        type=int,
                        default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--dataset',
                        type=str,
                        default="ogbg-moltox21",
                        help='dataset name (default: ogbg-moltox21)')
    args = parser.parse_args()

    device = torch.device(
        "cuda:" + str(args.device) if torch.cuda.is_available() else "cpu")

    ### automatic dataloading and splitting
    dataset = PygGraphPropPredDataset(name=args.dataset)
    splitted_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[splitted_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[splitted_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[splitted_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    model = GIN(num_task=dataset.num_tasks).to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(1, args.epochs + 1):
        train(model, device, train_loader, optimizer)
        #print("Evaluating training...")
        #print(eval(model, device, train_loader, evaluator))
        print("Evaluating validation:")
        print(eval(model, device, valid_loader, evaluator))
Example #15
def train_epoch(model, optimizer, device, data_loader, epoch):
    model.train()
    epoch_loss = 0
    epoch_train_AP = 0
    list_scores = []
    list_labels = []
    for iter, (batch_graphs, batch_targets) in enumerate(data_loader):
        batch_graphs = batch_graphs.to(device)
        batch_x = batch_graphs.ndata['feat'].to(device)  # num x feat
        batch_e = batch_graphs.edata['feat'].to(device)
        batch_targets = batch_targets.to(device)
        optimizer.zero_grad()
        try:
            batch_lap_pos_enc = batch_graphs.ndata['lap_pos_enc'].to(device)
            # Eigenvectors are defined only up to sign, so randomly flip
            # the sign of each Laplacian positional-encoding dimension.
            sign_flip = torch.rand(batch_lap_pos_enc.size(1)).to(device)
            sign_flip[sign_flip >= 0.5] = 1.0
            sign_flip[sign_flip < 0.5] = -1.0
            batch_lap_pos_enc = batch_lap_pos_enc * sign_flip.unsqueeze(0)
        except KeyError:
            batch_lap_pos_enc = None

        try:
            batch_wl_pos_enc = batch_graphs.ndata['wl_pos_enc'].to(device)
        except KeyError:
            batch_wl_pos_enc = None

        batch_scores = model.forward(batch_graphs, batch_x, batch_e,
                                     batch_lap_pos_enc, batch_wl_pos_enc)
        # ogbg-molpcba targets contain NaN for unmeasured tasks; NaN != NaN,
        # so this masks the loss to labeled entries only.
        is_labeled = batch_targets == batch_targets
        loss = model.loss(batch_scores[is_labeled],
                          batch_targets.float()[is_labeled])
        loss.backward()
        optimizer.step()
        epoch_loss += loss.detach().item()
        list_scores.append(batch_scores.detach().cpu())
        list_labels.append(batch_targets.detach().cpu())

    epoch_loss /= (iter + 1)
    evaluator = Evaluator(name='ogbg-molpcba')
    epoch_train_AP = evaluator.eval({
        'y_pred': torch.cat(list_scores),
        'y_true': torch.cat(list_labels)
    })['ap']

    return epoch_loss, epoch_train_AP, optimizer
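
The is_labeled trick above is worth isolating: ogbg-molpcba labels hold NaN for unmeasured tasks, and NaN is the only value not equal to itself. A standalone sketch of the masked loss, with placeholder tensors:

import torch

y_true = torch.tensor([[1.0, float('nan')], [0.0, 1.0]])  # NaN = unlabeled
y_pred = torch.randn(2, 2)                                 # raw logits

is_labeled = y_true == y_true  # False exactly where y_true is NaN
loss = torch.nn.BCEWithLogitsLoss()(y_pred[is_labeled], y_true[is_labeled])
print(loss.item())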
Example #16
    def eval_on(self, loader, trainer):
        results_dict = super().eval_on(loader, trainer)
        evaluator = GraphPropEvaluator(name=self.task_name)
        y_trues = []
        y_preds = []
        for batch in loader:
            if trainer.on_gpu:
                batch = batch.to("cuda")
            y_preds.append(self.model(batch).cpu().detach().numpy())
            y_trues.append(batch.y.cpu().detach().numpy())
        y_trues = np.concatenate(y_trues, axis=0)
        y_preds = np.concatenate(y_preds, axis=0)
        results_dict.update(
            evaluator.eval({
                "y_true": y_trues,
                "y_pred": y_preds
            }))
        return results_dict
Example #17
def main():

    args = ArgsInit().args

    if args.use_gpu and torch.cuda.is_available():
        device = torch.device("cuda:" + str(args.device))
    else:
        device = torch.device("cpu")

    if args.not_extract_node_feature:
        dataset = PygGraphPropPredDataset(name=args.dataset,
                                          transform=add_zeros)
    else:
        extract_node_feature_func = partial(extract_node_feature,
                                            reduce=args.aggr)
        dataset = PygGraphPropPredDataset(name=args.dataset,
                                          transform=extract_node_feature_func)

    args.num_tasks = dataset.num_classes
    evaluator = Evaluator(args.dataset)

    split_idx = dataset.get_idx_split()

    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    print(args)

    model = DeeperGCN(args)
    model.load_state_dict(torch.load(args.model_load_path)['model_state_dict'])
    model.to(device)

    train_accuracy = eval(model, device, train_loader, evaluator)
    valid_accuracy = eval(model, device, valid_loader, evaluator)
    test_accuracy = eval(model, device, test_loader, evaluator)

    print({
        'Train': train_accuracy,
        'Validation': valid_accuracy,
        'Test': test_accuracy
    })
    model.print_params(final=True)
Example #18
    def __init__(self, train):
        super(Mol_pred_DNN_dataset, self).__init__()
        self.train = train
        dataset_name = 'ogbg-molhiv'
        mol_origin_dataset = PygGraphPropPredDataset(name=dataset_name)
        evaluator = Evaluator(name=dataset_name)
        split_idx = mol_origin_dataset.get_idx_split()
        if self.train:
            self.mol_origin_dataset = mol_origin_dataset[split_idx["train"]]
        else:
            self.mol_origin_dataset = mol_origin_dataset[split_idx["test"]]
Example #19
def mol_pred_GNN_prepare(batch_size=50):
    dataset_name = 'ogbg-molhiv'

    dataset = PygGraphPropPredDataset(name=dataset_name)
    evaluator = Evaluator(name=dataset_name)

    split_idx = dataset.get_idx_split()
    train_loader = DataLoader(dataset[split_idx["train"]], batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(dataset[split_idx["test"]], batch_size=batch_size, shuffle=False)
    return train_loader, valid_loader, test_loader
Example #20
def evaluate_network(model, device, data_loader, epoch):
    model.eval()
    epoch_test_loss = 0
    epoch_test_AP = 0
    with torch.no_grad():
        list_scores = []
        list_labels = []
        for iter, (batch_graphs, batch_targets) in enumerate(data_loader):
            batch_graphs = batch_graphs.to(device)
            batch_x = batch_graphs.ndata['feat'].to(device)
            batch_e = batch_graphs.edata['feat'].to(device)
            batch_targets = batch_targets.to(device)
            try:
                batch_lap_pos_enc = batch_graphs.ndata['lap_pos_enc'].to(
                    device)
            except KeyError:
                batch_lap_pos_enc = None

            try:
                batch_wl_pos_enc = batch_graphs.ndata['wl_pos_enc'].to(device)
            except KeyError:
                batch_wl_pos_enc = None

            batch_scores = model.forward(batch_graphs, batch_x, batch_e,
                                         batch_lap_pos_enc, batch_wl_pos_enc)
            # NaN targets mark unmeasured tasks; NaN != NaN builds the mask.
            is_labeled = batch_targets == batch_targets
            loss = model.loss(batch_scores[is_labeled],
                              batch_targets.float()[is_labeled])
            epoch_test_loss += loss.detach().item()
            list_scores.append(batch_scores.detach().cpu())
            list_labels.append(batch_targets.detach().cpu())

        epoch_test_loss /= (iter + 1)
        evaluator = Evaluator(name='ogbg-molpcba')
        epoch_test_AP = evaluator.eval({
            'y_pred': torch.cat(list_scores),
            'y_true': torch.cat(list_labels)
        })['ap']

    return epoch_test_loss, epoch_test_AP
Example #21
    def __init__(self,
                 name,
                 pos_enc_dim=0,
                 norm='none',
                 path='dataset/ogbg-molhiv',
                 directions=['subgraphs'],
                 verbose=True,
                 **subgraph_params):
        start = time.time()
        if verbose:
            print("[I] Loading dataset %s..." % (name))
        self.name = name

        ##### MODIFIED CODE HERE
        if 'subgraphs' in directions:
            self.dataset, self.split_idx = prepare_dataset(
                path, name, **subgraph_params)
            print("One hot encoding substructure counts... ", end='')
            self.dataset, self.d_id = encode(self.dataset,
                                             subgraph_params['id_encoding'])
        else:
            self.dataset = DglGraphPropPredDataset(name=name, root=path)
            self.split_idx = self.dataset.get_idx_split()
            self.d_id = None

        self.train = HIVDGL(self.dataset,
                            self.split_idx['train'],
                            norm=norm,
                            pos_enc_dim=pos_enc_dim,
                            directions=directions,
                            **subgraph_params)
        self.val = HIVDGL(self.dataset,
                          self.split_idx['valid'],
                          norm=norm,
                          pos_enc_dim=pos_enc_dim,
                          directions=directions,
                          **subgraph_params)
        self.test = HIVDGL(self.dataset,
                           self.split_idx['test'],
                           norm=norm,
                           pos_enc_dim=pos_enc_dim,
                           directions=directions,
                           **subgraph_params)
        ##### MODIFIED CODE HERE
        self.evaluator = Evaluator(name='ogbg-molhiv')

        if verbose:
            print('train, test, val sizes :', len(self.train), len(self.test),
                  len(self.val))
            print("[I] Finished loading.")
            print("[I] Data load time: {:.4f}s".format(time.time() - start))
Example #22
    def __init__(self,
                 root_dir='data',
                 download=False,
                 split_scheme='official'):
        # internally call ogb package
        self.ogb_dataset = PygGraphPropPredDataset(name='ogbg-molpcba',
                                                   root=root_dir)

        # set variables
        self._dataset_name = 'ogbg-molpcba'
        self._data_dir = self.ogb_dataset.root
        if split_scheme == 'official':
            split_scheme = 'scaffold'
        self._split_scheme = split_scheme
        self._y_type = 'float'  # the task is binary classification, but the targets contain NaN for unlabeled entries, so float is required
        self._y_size = self.ogb_dataset.num_tasks
        self._n_classes = self.ogb_dataset.__num_classes__

        self._split_array = torch.zeros(len(self.ogb_dataset)).long()
        split_idx = self.ogb_dataset.get_idx_split()
        self._split_array[split_idx['train']] = 0
        self._split_array[split_idx['valid']] = 1
        self._split_array[split_idx['test']] = 2

        self._y_array = self.ogb_dataset.data.y

        self._metadata_fields = ['scaffold']

        metadata_file_path = os.path.join(self.ogb_dataset.root, 'raw',
                                          'scaffold_group.npy')
        if not os.path.exists(metadata_file_path):
            download_url(
                'https://snap.stanford.edu/ogb/data/misc/ogbg_molpcba/scaffold_group.npy',
                os.path.join(self.ogb_dataset.root, 'raw'))
        self._metadata_array = torch.from_numpy(
            np.load(metadata_file_path)).reshape(-1, 1).long()
        self._collate = PyGCollater(follow_batch=[])

        self._metric = Evaluator('ogbg-molpcba')

        super().__init__(root_dir, download, split_scheme)
Example #23
    def __init__(self, name):
        start = time.time()
        print("[I] Loading dataset %s..." % (name))
        self.name = name
        dataset = DownloadPCBA(name='ogbg-molpcba')
        split_idx = dataset.get_idx_split()
        self.train = PCBADGL(dataset, split_idx['train'])
        self.val = PCBADGL(dataset, split_idx['valid'])
        self.test = PCBADGL(dataset, split_idx['test'])
        del dataset
        del split_idx

        self.evaluator = Evaluator(name='ogbg-molpcba')

        print('train, test, val sizes :', len(self.train), len(self.test),
              len(self.val))
        print("[I] Finished loading.")
        print("[I] Data load time: {:.4f}s".format(time.time() - start))
Example #24
def get_data(name, batch_size):
    if name == "ogbg-molhiv":
        data_train, data_val, data_test, max_num_nodes = get_molhiv()
        num_classes = 2
    else:
        raise ValueError("dataset not supported")

    stats = dict()
    stats["num_features"] = data_train.num_node_features
    stats["num_classes"] = num_classes
    stats["max_num_nodes"] = max_num_nodes

    evaluator = Evaluator(name)
    encode_edge = True
    train_loader = DataLoader(data_train, batch_size, shuffle=True)
    val_loader = DataLoader(data_val, batch_size, shuffle=False)
    test_loader = DataLoader(data_test, batch_size, shuffle=False)

    return train_loader, val_loader, test_loader, stats, evaluator, encode_edge
Example #25
    def __init__(self,
                 name,
                 re_split=False,
                 pos_enc_dim=0,
                 norm='none',
                 verbose=True):
        start = time.time()
        if verbose:
            print("[I] Loading dataset %s..." % (name))
        self.name = name
        self.dataset = DglGraphPropPredDataset(name='ogbg-molhiv')
        self.split_idx = self.dataset.get_idx_split()
        if re_split:
            # Random re-split (ogbg-molhiv has 41,127 graphs); note this
            # discards the official scaffold split.
            ind = list(range(41127))
            rd.shuffle(ind)
            self.split_idx = {
                'test': torch.tensor([ind[i] for i in range(36564, 41127)]),
                'train': torch.tensor([ind[i] for i in range(32000)]),
                'valid': torch.tensor([ind[i] for i in range(32000, 36564)])
            }

        self.train = HIVDGL(self.dataset,
                            self.split_idx['train'],
                            norm=norm,
                            pos_enc_dim=pos_enc_dim)
        self.val = HIVDGL(self.dataset,
                          self.split_idx['valid'],
                          norm=norm,
                          pos_enc_dim=pos_enc_dim)
        self.test = HIVDGL(self.dataset,
                           self.split_idx['test'],
                           norm=norm,
                           pos_enc_dim=pos_enc_dim)

        self.evaluator = Evaluator(name='ogbg-molhiv')

        if verbose:
            print('train, test, val sizes :', len(self.train), len(self.test),
                  len(self.val))
            print("[I] Finished loading.")
            print("[I] Data load time: {:.4f}s".format(time.time() - start))
Example #26
def run(args):
    from ogb.graphproppred import DglGraphPropPredDataset, Evaluator, collate_dgl
    from torch.utils.data import DataLoader

    dataset = DglGraphPropPredDataset(name="ogbg-molhiv")

    import os
    if not os.path.exists("heterographs.bin"):
        dataset.graphs = [hpno.heterograph(graph) for graph in dataset.graphs]
        from dgl.data.utils import save_graphs
        save_graphs("heterographs.bin", dataset.graphs)
    else:
        from dgl.data.utils import load_graphs
        dataset.graphs = load_graphs("heterographs.bin")[0]

    evaluator = Evaluator(name="ogbg-molhiv")
    in_features = 9
    out_features = 1

    split_idx = dataset.get_idx_split()
    train_loader = DataLoader(dataset[split_idx["train"]], batch_size=128, drop_last=True, shuffle=True, collate_fn=collate_dgl)
    valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=len(split_idx["valid"]), shuffle=False, collate_fn=collate_dgl)
    test_loader = DataLoader(dataset[split_idx["test"]], batch_size=len(split_idx["test"]), shuffle=False, collate_fn=collate_dgl)

    model = hpno.HierarchicalPathNetwork(
        in_features=in_features,
        out_features=args.hidden_features,
        hidden_features=args.hidden_features,
        depth=args.depth,
        readout=hpno.GraphReadout(
            in_features=args.hidden_features,
            out_features=out_features,
            hidden_features=args.hidden_features,
        )
    )


    if torch.cuda.is_available():
        model = model.cuda()

    optimizer = torch.optim.Adam(model.parameters(), args.learning_rate, weight_decay=args.weight_decay)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, "min", factor=0.5, patience=20)

    for idx_epoch in range(args.n_epochs):
        print(idx_epoch, flush=True)
        model.train()
        for g, y in train_loader:
            y = y.float()
            if torch.cuda.is_available():
                g = g.to("cuda:0")
                y = y.cuda()
            optimizer.zero_grad()
            y_hat = model.forward(g, g.nodes['n1'].data["feat"].float())
            loss = torch.nn.BCELoss()(
                input=y_hat.sigmoid(),
                target=y,
            )
            loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            g, y = next(iter(valid_loader))
            y = y.float()
            if torch.cuda.is_available():
                g = g.to("cuda:0")
                y = y.cuda()
            y_hat = model.forward(g, g.nodes['n1'].data["feat"].float())
            loss = torch.nn.BCELoss()(
                input=y_hat.sigmoid(),
                target=y,
            )
            scheduler.step(loss)

        if optimizer.param_groups[0]["lr"] <= 0.01 * args.learning_rate:
            break

    model = model.cpu()
    g, y = next(iter(valid_loader))
    rocauc_vl = evaluator.eval(
        {
            "y_true": y.float(),
            "y_pred": model.forward(g, g.nodes['n1'].data["feat"].float()).sigmoid()
        }
    )["rocauc"]

    g, y = next(iter(test_loader))
    rocauc_te = evaluator.eval(
        {
            "y_true": y.float(),
            "y_pred": model.forward(g, g.nodes['n1'].data["feat"].float()).sigmoid()
        }
    )["rocauc"]

    import pandas as pd
    df = pd.DataFrame(
        {
            args.data: {
                "rocauc_te": rocauc_te,
                "rocauc_vl": rocauc_vl,
            }
        }
    )

    df.to_csv("%s.csv" % args.out)
Example #27
def main():
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on ogbg-ppi data with PyTorch Geometric')
    parser.add_argument('--device',
                        type=int,
                        default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument(
        '--gnn',
        type=str,
        default='gin-virtual',
        help=
        'GNN gin, gin-virtual, or gcn, or gcn-virtual (default: gin-virtual)')
    parser.add_argument('--drop_ratio',
                        type=float,
                        default=0.5,
                        help='dropout ratio (default: 0.5)')
    parser.add_argument(
        '--num_layer',
        type=int,
        default=5,
        help='number of GNN message passing layers (default: 5)')
    parser.add_argument(
        '--emb_dim',
        type=int,
        default=300,
        help='dimensionality of hidden units in GNNs (default: 300)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='input batch size for training (default: 32)')
    parser.add_argument('--epochs',
                        type=int,
                        default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--num_workers',
                        type=int,
                        default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--dataset',
                        type=str,
                        default="ogbg-ppi",
                        help='dataset name (default: ogbg-ppi)')

    parser.add_argument('--filename',
                        type=str,
                        default="",
                        help='filename to output result (default: )')
    args = parser.parse_args()

    device = torch.device(
        "cuda:" + str(args.device) if torch.cuda.is_available() else "cpu")

    ### automatic dataloading and splitting

    dataset = PygGraphPropPredDataset(name=args.dataset, transform=add_zeros)

    splitted_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[splitted_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[splitted_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[splitted_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    if args.gnn == 'gin':
        model = GNN(gnn_type='gin',
                    num_class=37,
                    emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio,
                    virtual_node=False).to(device)
    elif args.gnn == 'gin-virtual':
        model = GNN(gnn_type='gin',
                    num_class=37,
                    emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio,
                    virtual_node=True).to(device)
    elif args.gnn == 'gcn':
        model = GNN(gnn_type='gcn',
                    num_class=37,
                    emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio,
                    virtual_node=False).to(device)
    elif args.gnn == 'gcn-virtual':
        model = GNN(gnn_type='gcn',
                    num_class=37,
                    emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio,
                    virtual_node=True).to(device)
    else:
        raise ValueError('Invalid GNN type')

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    valid_curve = []
    test_curve = []
    train_curve = []

    for epoch in range(1, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        print('Training...')
        train(model, device, train_loader, optimizer)

        print('Evaluating...')
        train_perf = eval(model, device, train_loader, evaluator)
        valid_perf = eval(model, device, valid_loader, evaluator)
        test_perf = eval(model, device, test_loader, evaluator)

        print({
            'Train': train_perf,
            'Validation': valid_perf,
            'Test': test_perf
        })

        train_curve.append(train_perf['acc'])
        valid_curve.append(valid_perf['acc'])
        test_curve.append(test_perf['acc'])

    best_val_epoch = np.argmax(np.array(valid_curve))
    best_train = max(train_curve)

    print('Finished training!')
    print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
    print('Test score: {}'.format(test_curve[best_val_epoch]))

    if not args.filename == '':
        torch.save(
            {
                'Val': valid_curve[best_val_epoch],
                'Test': test_curve[best_val_epoch],
                'Train': train_curve[best_val_epoch],
                'BestTrain': best_train
            }, args.filename)

Example #28
step = loss = 0
for batch in loader_tr:
    step += 1
    loss += train_step(*batch)
    if step == loader_tr.steps_per_epoch:
        step = 0
        print("Loss: {}".format(loss / loader_tr.steps_per_epoch))
        loss = 0

################################################################################
# Evaluate model
################################################################################
print("Testing model")
evaluator = Evaluator(name=dataset_name)
y_true = []
y_pred = []
for batch in loader_te:
    inputs, target = batch
    p = model(inputs, training=False)
    y_true.append(target)
    y_pred.append(p.numpy())

y_true = np.vstack(y_true)
y_pred = np.vstack(y_pred)
model_loss = loss_fn(y_true, y_pred)
ogb_score = evaluator.eval({"y_true": y_true, "y_pred": y_pred})

print(
    "Done. Test loss: {:.4f}. ROC-AUC: {:.2f}".format(model_loss, ogb_score["rocauc"])
)
Example #29
def main(_):
    tf.keras.mixed_precision.set_global_policy("float16" if FLAGS.dtype == 'float16' else "float32")

    dset_name = 'ogbg-molhiv'
    dataset = GraphPropPredDataset(name=dset_name, )
    split_idx = dataset.get_idx_split()
    train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]

    ds = data.get_tf_dataset(FLAGS.batch_size, [dataset[idx] for idx in train_idx], shuffle=True)
    val_ds = data.get_tf_dataset(FLAGS.batch_size, [dataset[idx] for idx in valid_idx], shuffle=False)
    strategy = xpu.configure_and_get_strategy()

    if FLAGS.total_batch_size is not None:
        gradient_accumulation_factor = FLAGS.total_batch_size // FLAGS.batch_size
    else:
        gradient_accumulation_factor = 1

    # pre-calculated number of steps per epoch (note: will vary somewhat for training, due to packing,
    #  but is found to be fairly consistent)
    steps = {
        32: (1195, 162, 148),
        64: (585, 80, 73),
        128: (288, 40, 37),
        256: (143, 20, 18)
    }
    try:
        steps_per_epoch, val_steps_per_epoch, test_steps_per_epoch = steps[FLAGS.batch_size]
    except KeyError:
        print("Batch size should have the number of steps defined")
        raise

    # need the steps per epoch to be divisible by the gradient accumulation factor
    steps_per_epoch = gradient_accumulation_factor * (steps_per_epoch // gradient_accumulation_factor)

    # we apply a linear scaling rule for learning rate with batch size, which we benchmark against BS=128
    batch_size = FLAGS.total_batch_size or FLAGS.batch_size
    lr = FLAGS.lr * batch_size / 128

    with strategy.scope():
        model = create_model()
        utils.print_trainable_variables(model)

        losses = tf.keras.losses.BinaryCrossentropy()
        if FLAGS.opt.lower() == 'sgd':
            opt = tf.keras.optimizers.SGD(learning_rate=lr)
        elif FLAGS.opt.lower() == 'adam':
            opt = tf.keras.optimizers.Adam(learning_rate=lr)
        else:
            raise NotImplementedError()

        callbacks = []

        if not os.path.isdir(FLAGS.model_dir):
            os.makedirs(FLAGS.model_dir)
        # randomly named directory
        model_dir = os.path.join(FLAGS.model_dir, str(uuid.uuid4()))

        print(f"Saving weights to {model_dir}")
        model_path = os.path.join(model_dir, 'model')

        callbacks.append(tf.keras.callbacks.ModelCheckpoint(
            model_path, monitor="val_loss", verbose=1, save_best_only=True,
            save_weights_only=True, mode="min", save_freq="epoch")
        )

        callbacks.append(ThroughputCallback(
            samples_per_epoch=steps_per_epoch * FLAGS.batch_size * gradient_accumulation_factor))
        if FLAGS.reduce_lr_on_plateau_patience > 0:
            callbacks.append(tf.keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss', mode='min', factor=FLAGS.reduce_lr_on_plateau_factor,
                patience=FLAGS.reduce_lr_on_plateau_patience, min_lr=1e-8, verbose=1)
            )

        if FLAGS.early_stopping_patience > 0:
            print(f"Training will stop early after {FLAGS.early_stopping_patience} epochs without improvement.")
            callbacks.append(
                tf.keras.callbacks.EarlyStopping(
                    monitor='val_loss', min_delta=0, patience=FLAGS.early_stopping_patience,
                    verbose=1, mode='min', baseline=None, restore_best_weights=False)
            )

        # weighted metrics are used because of the batch packing
        model.compile(optimizer=opt, loss=losses,
                      weighted_metrics=[tf.keras.metrics.BinaryAccuracy(), tf.keras.metrics.AUC()],
                      steps_per_execution=steps_per_epoch)

        # if the total batch size exceeds the compute batch size
        model.set_gradient_accumulation_options(gradient_accumulation_steps_per_replica=gradient_accumulation_factor)

        model.fit(ds,
                  steps_per_epoch=steps_per_epoch,
                  epochs=FLAGS.epochs,
                  validation_data=val_ds,
                  validation_steps=val_steps_per_epoch,
                  callbacks=callbacks
                  )

        # we will use the official AUC evaluator from the OGB repo, not the keras one
        model.load_weights(model_path)
        print("Loaded best validation weights for evaluation")

        evaluator = Evaluator(name='ogbg-molhiv')
        for test_or_val, idx, steps in zip(
                ('validation', 'test'),
                (valid_idx, test_idx),
                (val_steps_per_epoch, test_steps_per_epoch)):
            prediction, ground_truth = get_predictions(model, dataset, idx, steps)
            result = evaluator.eval({'y_true': ground_truth[:, None], 'y_pred': prediction[:, None]})

            print(f'Final {test_or_val} ROC-AUC {result["rocauc"]:.3f}')
Example #30
def main():
    parser = argparse.ArgumentParser(description='OGBG-MolHiv')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--num_layers', type=int, default=5)
    parser.add_argument('--emb_dim', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--eval',
                        action='store_true',
                        help='If not set, we will only do the training part.')
    parser.add_argument('--eval_batch_size', type=int, default=2048)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = DglGraphPropPredDataset(name='ogbg-molhiv')
    split_idx = dataset.get_idx_split()

    evaluator = Evaluator(name='ogbg-molhiv')
    train_loader = GraphDataLoader(dataset[split_idx["train"]],
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers)
    val_loader = GraphDataLoader(dataset[split_idx["valid"]],
                                 batch_size=args.eval_batch_size,
                                 shuffle=True,
                                 num_workers=0)
    test_loader = GraphDataLoader(dataset[split_idx["test"]],
                                  batch_size=args.eval_batch_size,
                                  shuffle=True,
                                  num_workers=0)

    model = GCN(args.emb_dim,
                num_classes=dataset.num_tasks,
                num_layers=args.num_layers,
                dropout=args.dropout).to(device)

    logger = Logger(args.runs, args)
    dur = []
    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        for epoch in range(1, args.epochs + 1):
            t0 = time.time()
            loss = train(model, device, train_loader, optimizer)
            if epoch >= 3:
                dur.append(time.time() - t0)
                print('Training time/epoch {}'.format(np.mean(dur)))

            if not args.eval:
                continue

            val_rocauc = test(model, device, val_loader,
                              evaluator)[dataset.eval_metric]
            test_rocauc = test(model, device, test_loader,
                               evaluator)[dataset.eval_metric]
            logger.add_result(run, (0.0, val_rocauc, test_rocauc))

            if epoch % args.log_steps == 0:
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Valid: {val_rocauc:.4f} '
                      f'Test: {test_rocauc:.4f}')

        if args.eval:
            logger.print_statistics(run)
    if args.eval:
        logger.print_statistics()