Example #1
    def __init__(self, config, device=None):
        if device is None:
            self.device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.device = device

        self.model = MyModel(num_feats=config['num_feats'],
                             output_dim=config['num_feats'],
                             hidden_size=config['hidden_size'],
                             num_layers=config['num_layers'],
                             seq_len=config['X_len'],
                             horizon=config['Y_len'],
                             device=self.device,
                             bidirectional=bool(config['bidirectional'])).to(
                                 self.device)
        self.optimizer = torch.optim.AdamW(self.model.parameters(),
                                           lr=config["lr"])
        self.criterion = MyLoss(num_feats=config['num_feats'],
                                loss_type=config['loss_type']).to(self.device)
        # learning-rate schedule
        # Scheduler https://arxiv.org/pdf/1812.01187.pdf
        epochs = config['epochs']
        # lf = lambda x: (((1 + math.cos(x * math.pi / epochs)) / 2) ** 1.0) * 0.95 + 0.05  # initially tried an lr schedule meant for image classification
        # self.scheduler = lr_scheduler.LambdaLR(self.optimizer, lr_lambda=lf)
        # self.scheduler.last_epoch = 0
        self.scheduler = lr_scheduler.CosineAnnealingLR(self.optimizer,
                                                        epochs,
                                                        eta_min=0,
                                                        last_epoch=-1)
        self.epoch = 0
        self.best_loss = 99999

        HighD_dataset = HighD_Dataset(X_len=config['X_len'],
                                      X_step=config['X_step'],
                                      Y_len=config['Y_len'],
                                      Y_step=config['Y_step'],
                                      diff=config['diff'],
                                      name='data_01',
                                      raw_dir='./dataset/',
                                      preprocess_all=True,
                                      device=self.device)

        n_val = int(len(HighD_dataset) * config['val_percent'])
        n_train = len(HighD_dataset) - n_val
        train_dataset, val_dataset = random_split(
            HighD_dataset, [n_train, n_val],
            generator=torch.Generator().manual_seed(2021))

        self.train_dataloader = GraphDataLoader(
            train_dataset,
            batch_size=32,
            shuffle=True,
            pin_memory=(self.device.type == "cuda"))
        self.val_dataloader = GraphDataLoader(
            val_dataset,
            batch_size=32,
            shuffle=False,
            pin_memory=(self.device.type == "cuda"))
        print("Dataset Ready!")
Example #2
    def __init__(self,
                 dataset,
                 batch_size,
                 device,
                 collate_fn=None,
                 seed=0,
                 shuffle=True,
                 split_name='fold10',
                 fold_idx=0,
                 split_ratio=0.7):

        self.shuffle = shuffle
        self.seed = seed
        self.kwargs = {'pin_memory': True} if 'cuda' in device.type else {}

        labels = [l for _, l in dataset]

        if split_name == 'fold10':
            train_idx, valid_idx = self._split_fold10(
                labels, fold_idx, seed, shuffle)
        elif split_name == 'rand':
            train_idx, valid_idx = self._split_rand(
                labels, split_ratio, seed, shuffle)
        else:
            raise NotImplementedError()

        train_sampler = SubsetRandomSampler(train_idx)
        valid_sampler = SubsetRandomSampler(valid_idx)

        self.train_loader = GraphDataLoader(
            dataset, sampler=train_sampler,
            batch_size=batch_size, collate_fn=collate_fn, **self.kwargs)
        self.valid_loader = GraphDataLoader(
            dataset, sampler=valid_sampler,
            batch_size=batch_size, collate_fn=collate_fn, **self.kwargs)
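
A minimal usage sketch for the wrapper above; only its __init__ appears in the snippet, so the class name used here (GINDataLoader), the torch import, and the dataset of (graph, label) pairs are assumptions:

# Hypothetical usage; GINDataLoader is an assumed class name.
loader = GINDataLoader(dataset,
                       batch_size=32,
                       device=torch.device('cuda'),
                       split_name='fold10',
                       fold_idx=0)
for batched_graph, labels in loader.train_loader:
    pass  # training step goes here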
Example #3
def main(args):
    # Step 1: Prepare graph data and retrieve train/validation/test index ============================= #
    dataset = LegacyTUDataset(args.dataset, raw_dir=args.dataset_path)

    # Add self-loops. We add a self-loop to each graph here since "add_self_loop"
    # does not support batched graphs.
    for i in range(len(dataset)):
        dataset.graph_lists[i] = dgl.add_self_loop(dataset.graph_lists[i])

    num_training = int(len(dataset) * 0.8)
    num_val = int(len(dataset) * 0.1)
    num_test = len(dataset) - num_val - num_training
    train_set, val_set, test_set = random_split(dataset, [num_training, num_val, num_test])

    train_loader = GraphDataLoader(train_set, batch_size=args.batch_size, shuffle=True, num_workers=6)
    val_loader = GraphDataLoader(val_set, batch_size=args.batch_size, num_workers=2)
    test_loader = GraphDataLoader(test_set, batch_size=args.batch_size, num_workers=2)

    device = torch.device(args.device)
    
    # Step 2: Create model =================================================================== #
    num_feature, num_classes, _ = dataset.statistics()
    model_op = get_sag_network(args.architecture)
    model = model_op(in_dim=num_feature, hid_dim=args.hid_dim, out_dim=num_classes,
                     num_convs=args.conv_layers, pool_ratio=args.pool_ratio, dropout=args.dropout).to(device)
    args.num_feature = int(num_feature)
    args.num_classes = int(num_classes)

    # Step 3: Create training components ===================================================== #
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)

    # Step 4: training epoches =============================================================== #
    bad_count = 0
    best_val_loss = float("inf")
    final_test_acc = 0.
    best_epoch = 0
    train_times = []
    for e in range(args.epochs):
        s_time = time()
        train_loss = train(model, optimizer, train_loader, device)
        train_times.append(time() - s_time)
        val_acc, val_loss = test(model, val_loader, device)
        test_acc, _ = test(model, test_loader, device)
        if best_val_loss > val_loss:
            best_val_loss = val_loss
            final_test_acc = test_acc
            bad_count = 0
            best_epoch = e + 1
        else:
            bad_count += 1
        if bad_count >= args.patience:
            break
        
        if (e + 1) % args.print_every == 0:
            log_format = "Epoch {}: loss={:.4f}, val_acc={:.4f}, final_test_acc={:.4f}"
            print(log_format.format(e + 1, train_loss, val_acc, final_test_acc))
    print("Best Epoch {}, final test acc {:.4f}".format(best_epoch, final_test_acc))
    return final_test_acc, sum(train_times) / len(train_times)
Example #4
def main(args):
    # Step 1: Prepare graph data and retrieve train/validation/test index ============================= #
    dataset = LegacyTUDataset(args.dataset, raw_dir=args.dataset_path)

    # Add self-loops. We add a self-loop to each graph here since "add_self_loop"
    # does not support batched graphs.
    for i in range(len(dataset)):
        dataset.graph_lists[i] = dgl.remove_self_loop(dataset.graph_lists[i])
        dataset.graph_lists[i] = dgl.add_self_loop(dataset.graph_lists[i])
    
    # preprocess: use node degree/label as node feature
    if args.degree_as_feature:
        dataset = degree_as_feature(dataset)
        mode = "concat"
    else:
        mode = "replace"
    dataset = node_label_as_feature(dataset, mode=mode)

    num_training = int(len(dataset) * 0.9)
    num_test = len(dataset) - num_training
    train_set, test_set = random_split(dataset, [num_training, num_test])

    train_loader = GraphDataLoader(train_set, batch_size=args.batch_size, shuffle=True, num_workers=1)
    test_loader = GraphDataLoader(test_set, batch_size=args.batch_size, num_workers=1)

    device = torch.device(args.device)
    
    # Step 2: Create model =================================================================== #
    num_feature, num_classes, _ = dataset.statistics()
    args.in_dim = int(num_feature)
    args.out_dim = int(num_classes)
    args.edge_feat_dim = 0 # No edge feature in datasets that we use.
    
    model = GraphClassifier(args).to(device)

    # Step 3: Create training components ===================================================== #
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, amsgrad=True, weight_decay=args.weight_decay)

    # Step 4: training epoches =============================================================== #
    best_test_acc = 0.0
    best_epoch = -1
    train_times = []
    for e in range(args.epochs):
        s_time = time()
        train_loss = train(model, optimizer, train_loader, device,
                           e, args.epochs)
        train_times.append(time() - s_time)
        test_acc = test(model, test_loader, device)
        if test_acc > best_test_acc:
            best_test_acc = test_acc
            best_epoch = e + 1

        if (e + 1) % args.print_every == 0:
            log_format = "Epoch {}: loss={:.4f}, test_acc={:.4f}, best_test_acc={:.4f}"
            print(log_format.format(e + 1, train_loss, test_acc, best_test_acc))
    print("Best Epoch {}, final test acc {:.4f}".format(best_epoch, best_test_acc))
    return best_test_acc, sum(train_times) / len(train_times)
Example #5
def load_ogbg(name,
              device=th.device('cpu'),
              root='/home/eva_share_users/zhuyu'):
    from ogb.graphproppred import DglGraphPropPredDataset

    print('load', name)
    data = DglGraphPropPredDataset(name=name, root=root)
    from tqdm import tqdm
    out_channels = 0
    for graph in tqdm(data):
        if name == 'ogbg-ppa':
            graph[0].ndata['feat'] = dgl.ops.copy_e_mean(
                graph[0], graph[0].edata['feat'])
        else:
            ef = graph[0].edata['feat']
            edge = graph[0].edges()[1]
            H = th.zeros(graph[0].num_nodes(), 3)
            for i in range(graph[0].num_nodes()):
                mask = th.eq(edge, i)
                H[i, :] += th.matmul(mask.float(), ef.float())
                H[i, :] /= graph[0].in_degrees(i)
            graph[0].ndata['feat'] = th.cat((graph[0].ndata['feat'], H), dim=1)
        in_channels = graph[0].ndata['feat'].shape[1]
        try:
            out_channels = max(out_channels, int(graph[1]))
        except Exception:
            # Debugging aid: drop into an interactive shell when the label is not a scalar.
            from IPython import embed
            embed()

    split_idx = data.get_idx_split()
    print('finish loading', name)
    from dgl.dataloading import GraphDataLoader
    train_loader = GraphDataLoader(
        data[split_idx['train']],
        batch_size=256,
        shuffle=True,
    )
    valid_loader = GraphDataLoader(
        data[split_idx['valid']],
        batch_size=256,
        shuffle=True,
    )
    test_loader = GraphDataLoader(
        data[split_idx['test']],
        batch_size=256,
        shuffle=True,
    )
    return train_loader, valid_loader, test_loader, in_channels, out_channels + 1
Example #6
def get_dataloaders(dataset, seed, batch_size=32):
    # Use an 80:10:10 train/val/test split
    train_set, val_set, test_set = split_dataset(dataset,
                                                 frac_list=[0.8, 0.1, 0.1],
                                                 shuffle=True,
                                                 random_state=seed)
    train_loader = GraphDataLoader(train_set,
                                   use_ddp=True,
                                   batch_size=batch_size,
                                   shuffle=True)
    val_loader = GraphDataLoader(val_set, batch_size=batch_size)
    test_loader = GraphDataLoader(test_set, batch_size=batch_size)

    return train_loader, val_loader, test_loader
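
Because the training loader above is built with use_ddp=True, it is meant to be created once per distributed worker; a rough consumption sketch, assuming torch.distributed is already initialized, that the dataset yields (graph, label) pairs, and that the installed DGL version exposes set_epoch on DDP-enabled loaders:

train_loader, val_loader, test_loader = get_dataloaders(dataset, seed, batch_size=32)
for epoch in range(10):
    # Re-seed the per-rank shuffling each epoch (available when use_ddp=True).
    train_loader.set_epoch(epoch)
    for batched_graph, labels in train_loader:
        pass  # forward/backward pass goes here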
Example #7
    def test_all(self, dataset: AllDataset, output_dir: str = "test_result"):
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
            print(
                f"created new dir {os.path.abspath(output_dir)}; files will be written into it."
            )
        else:
            print(f'output dir {os.path.abspath(output_dir)} already exists!')
        self.load()
        self.eval()
        data_loader = GraphDataLoader(dataset.test,
                                      collate_fn=collate,
                                      batch_size=10,
                                      shuffle=False,
                                      drop_last=False)
        start_time = time.time()
        file_name_index = 1
        for i, (bhg, info) in enumerate(data_loader):
            batch_size = len(info)
            self.forward(bhg)
            for idi, (cg, cd) in enumerate(zip(dgl.unbatch(bhg), info)):
                track_pd_list = graph_and_info_to_df_list(cg, cd)
                # todo
                # pd.set_option('display.max_columns', 10000)
                # print(track_pd_list[0])
                for i_df, df in enumerate(track_pd_list):
                    df.to_csv(os.path.join(output_dir,
                                           str(file_name_index) + ".csv"),
                              index=False)
                    file_name_index += 1

        self.train()
        print(
            f"test time: {time.time() - start_time:6.2f} s | num_samples: {len(dataset.test)}"
        )
Example #8
    def test_model(self, dataset: AllDataset, output_dir: str = "test_result"):
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
            print(
                f"created new dir {os.path.abspath(output_dir)}; files will be written into it."
            )
        else:
            print(f'output dir {os.path.abspath(output_dir)} already exists!')
        self.load()
        self.eval()
        data_loader = GraphDataLoader(dataset.test,
                                      collate_fn=collate,
                                      batch_size=10,
                                      shuffle=False,
                                      drop_last=False)
        start_time = time.time()
        for i, (bhg, info) in enumerate(data_loader):
            batch_size = len(info)
            self.forward(bhg)
            y_pred: torch.FloatTensor = bhg.nodes['agent'].data['predict']
            assert batch_size == y_pred.shape[0]
            print(f"\rprocessed {i+1}/{len(data_loader)} ", end="")
            for n, d in enumerate(info):
                st = float(d['split_time'])
                x, y = d['radix']['x'], d['radix']['y']
                timestamp = pd.Series(np.linspace(st + 0.1,
                                                  st + 3.0,
                                                  30,
                                                  dtype=float),
                                      name="TIMESTAMP")
                track_id = pd.Series([d['agent_track_id'] for _ in range(30)],
                                     name="TRACK_ID")
                object_type = pd.Series(["AGENT" for _ in range(30)],
                                        name="OBJECT_TYPE")
                x = pd.Series(y_pred[n, :, 0] + x, name="X")
                y = pd.Series(y_pred[n, :, 1] + y, name="Y")
                city_name = pd.Series([d['city'] for _ in range(30)],
                                      name="CITY_NAME")
                this_df = pd.DataFrame(
                    list(zip(timestamp, track_id, object_type, x, y,
                             city_name)),
                    columns=("TIMESTAMP", "TRACK_ID", "OBJECT_TYPE", "X", "Y",
                             "CITY_NAME"))
                stack_df = pd.concat(objs=[d['df'], this_df])
                # select only the AGENT rows
                stack_df = stack_df[stack_df["OBJECT_TYPE"] == "AGENT"]
                stack_df.to_csv(os.path.join(output_dir,
                                             d['filename'] + ".csv"),
                                index=False)

                # pd.set_option('display.max_columns', 1000)
                # print(this_df)

        self.train()
        print(
            f"test time: {time.time() - start_time:6.2f} s | num_samples: {len(dataset.test)}"
        )
Example #9
def main():

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--repeat", type=int, default=10)
    parser.add_argument('--dataset', type=str, choices=['MUTAG', 'COLLAB', 'IMDBBINARY', 'IMDBMULTI', 'NCI1', 'PROTEINS', 'PTC', 'REDDITBINARY', 'REDDITMULTI5K'], default='MUTAG')

    args = parser.parse_args()

    device = torch.device('cuda')
    dataset_ = GINDataset(args.dataset, False)
    dataset = DatasetAbstraction([g[0] for g in dataset_], [g[1] for g in dataset_])
    
    # 1. split dataset [fix split]
    dataids = list(range(len(dataset)))
    random.seed(2021)
    random.shuffle(dataids)
    
    fold = int(len(dataset) * 0.1)
    train_dataset = dataset[dataids[:fold * 8]]
    val_dataset = dataset[dataids[fold * 8: fold * 9]]
    test_dataset = dataset[dataids[fold * 9: ]]

    trainloader = GraphDataLoader(train_dataset, batch_size=32, shuffle=True)
    valloader = GraphDataLoader(val_dataset, batch_size=32, shuffle=False)
    testloader = GraphDataLoader(test_dataset, batch_size=32, shuffle=False)

    accs = []
    for seed in tqdm(range(args.repeat)):
        # set up seeds, args.seed supported
        set_seed(seed)

        model = GIN(
            5, 2, dataset_.dim_nfeats, 64, dataset_.gclasses, 0.5, False,
            "sum", "sum").to(device)

        criterion = nn.CrossEntropyLoss()  # default reduction is 'mean'
        optimizer = optim.Adam(model.parameters(), lr=0.0001)

        model = train(model, trainloader, valloader, optimizer, criterion, 100, device)
        acc = eval_net(model, testloader, device)
        accs.append(acc)

    print('{:.2f} ~ {:.2f}'.format(np.mean(accs) * 100, np.std(accs) * 100))
Example #10
def test_gcnnet_batched_graph(small_dataset):
    net_params = NetParams.from_file("../graph_conn/configs/test_gcn.json")
    net_params.readout = 'flatten'
    model = GCNNet(net_params=net_params)
    dataloader = GraphDataLoader(small_dataset, batch_size=3)
    batched_graph, labels = next(iter(dataloader))
    h = batched_graph.ndata['feat']
    e = batched_graph.edata['weight']
    out = model(batched_graph, h, e)
    # Smoke test: just check that the forward pass runs and returns something.
    assert out is not None
Example #11
    def make_loader(self, dataset):
        """

        Args:
            dataset: dataset instance from conn_dataset

        Returns:

        """
        split = dataset.get_split_idx(test_size=self.net_params.test_size,
                                      val_size=self.net_params.val_size)
        test_dataset = dataset[split['test']]
        train_dataset = dataset[split['train']]
        val_dataset = dataset[split['val']]
        train_loader = GraphDataLoader(train_dataset,
                                       batch_size=self.net_params.batch_size)
        test_loader = GraphDataLoader(test_dataset,
                                      batch_size=self.net_params.batch_size)
        val_loader = GraphDataLoader(val_dataset,
                                     batch_size=self.net_params.batch_size)
        return train_loader, test_loader, val_loader
Example #12
def get_ppi():
    train_dataset = PPIDataset(mode='train')
    val_dataset = PPIDataset(mode='valid')
    test_dataset = PPIDataset(mode='test')
    train_val_dataset = [i for i in train_dataset] + [i for i in val_dataset]
    for idx, data in enumerate(train_val_dataset):
        data.ndata['batch'] = torch.zeros(data.number_of_nodes()) + idx
        data.ndata['batch'] = data.ndata['batch'].long()

    g = list(GraphDataLoader(train_val_dataset, batch_size=22, shuffle=True))

    return g, PPIDataset(mode='train'), PPIDataset(mode='valid'), test_dataset
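
A rough sketch of how the return values above might be consumed; the variable names here are illustrative only:

batched_graphs, train_ds, valid_ds, test_ds = get_ppi()
for bg in batched_graphs:
    # each element is already a batched DGLGraph of up to 22 PPI graphs
    print(bg.batch_size, bg.num_nodes())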
Example #13
def main():
    dataset = UserItemDataset()
    dataloader = GraphDataLoader(dataset, batch_size=32, shuffle=True)

    model = HeteroClassifier(dataset.n_features, 20, dataset.num_classes,
                             dataset[0][0].etypes)
    opt = optim.Adam(model.parameters())

    for epoch in range(5):
        for batched_graph, labels in dataloader:
            logits = model(batched_graph)
            loss = F.cross_entropy(logits, labels)
            opt.zero_grad()
            loss.backward()
            opt.step()
            print(loss.item())
Example #14
def main():
    dataset = dgl.data.GINDataset('MUTAG', False)
    dataloader = GraphDataLoader(dataset, batch_size=32, shuffle=True)

    model = Model(dataset.dim_nfeats, 20, dataset.gclasses)
    opt = optim.Adam(model.parameters())

    for epoch in range(5):
        for batched_graph, labels in dataloader:
            feats = batched_graph.ndata['attr'].float()
            logits = model(batched_graph, feats)
            loss = F.cross_entropy(logits, labels)
            opt.zero_grad()
            loss.backward()
            opt.step()
            print(loss.item())
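
The loop above only reports the training loss; as a small follow-up sketch (not part of the original example), accuracy on the same dataset could be measured with a fresh, non-shuffled loader:

import torch

eval_loader = GraphDataLoader(dataset, batch_size=32, shuffle=False)
num_correct, num_total = 0, 0
with torch.no_grad():
    for batched_graph, labels in eval_loader:
        feats = batched_graph.ndata['attr'].float()
        logits = model(batched_graph, feats)
        num_correct += (logits.argmax(1) == labels).sum().item()
        num_total += len(labels)
print('accuracy:', num_correct / num_total)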
Example #15
 def predict(self):
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
     predicted_labels = []
     idx = torch.randperm(len(self.validationDataset))
     num_train = int(len(self.validationDataset))
     sampler = SubsetRandomSampler(idx[:num_train])
     dataloader = GraphDataLoader(self.validationDataset,
                                  sampler=sampler,
                                  batch_size=1,
                                  drop_last=False)
     num_correct = 0
     num_tests = 0
     for batched_graph, labels in dataloader:
         pred = self.model(
             batched_graph,
             batched_graph.ndata[self.node_attr_key].float()).to(device)
         num_correct += (pred.argmax(1) == labels).sum().item()
         num_tests += len(labels)
     accuracy = num_correct / num_tests
     return accuracy
Example #16
    def val_model(self, dataset: AllDataset, return_to_plot=False):
        if not self.training:
            self.load()
        self.eval()
        data_loader = GraphDataLoader(
            dataset.val,
            collate_fn=collate,
            batch_size=int(10 if not return_to_plot else 1),
            shuffle=False,
            drop_last=False)
        start_time = time.time()
        real_queue = deque()
        for i, (bhg, info) in enumerate(data_loader):
            print(
                f"\r {i+1}/{len(data_loader)} | elapse time: {time.time() - start_time}",
                end="")
            self.forward(bhg)
            agent_pred = bhg.nodes['agent'].data['predict']
            agent_true = bhg.nodes['agent'].data['state'][:, 20:, :]

            # Mean Euclidean displacement error over all predicted points.
            real_loss = torch.square(agent_pred - agent_true).flatten().view(
                -1, 2)
            real_loss = torch.sum(real_loss, dim=1)
            real_loss = torch.sqrt(real_loss)
            real_loss = torch.mean(real_loss)
            real_queue.append(real_loss)
            if return_to_plot:
                val_plot(bhg)
        print(
            "-------------------------------------evaluation---------------------------------------------"
        )
        print(
            f"val total elapsed time: {time.time() - start_time:6.2f} s | #samples: {len(dataset.val)}"
            f" | loss: {sum(real_queue) / len(real_queue):6.4f} m")
        print(
            "--------------------------------------------------------------------------------------------"
        )
        self.train()
Example #17
 def validate(self):
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
     # Set training to 100% of the data, validate, and save a final model
     idx = torch.randperm(len(self.trainingDataset))
     num_train = int(len(self.trainingDataset))
     sampler = SubsetRandomSampler(idx[:num_train])
     dataloader = GraphDataLoader(self.trainingDataset,
                                  sampler=sampler,
                                  batch_size=self.hparams.batch_size,
                                  drop_last=False)
     # Once a model is chosen, train on all the data and save
     for e in range(self.hparams.epochs):
         num_correct = 0
         num_tests = 0
         for batched_graph, labels in dataloader:
             #pred = self.model(batched_graph, batched_graph.ndata['attr'].float()).to(device)
             pred = self.model(
                 batched_graph,
                 batched_graph.ndata[self.node_attr_key].float()).to(device)
             if self.hparams.loss_function == "Negative Log Likelihood":
                 logp = F.log_softmax(pred, 1)
                 loss = F.nll_loss(logp, labels)
             elif self.hparams.loss_function == "Cross Entropy":
                 loss = F.cross_entropy(pred, labels)
             num_correct += (pred.argmax(1) == labels).sum().item()
             num_tests += len(labels)
             self.optimizer.zero_grad()
             loss.backward()
             self.optimizer.step()
         training_accuracy = num_correct / num_tests
         validation_accuracy = self.predict()
         if validation_accuracy >= training_accuracy and validation_accuracy > 0.6:
             break
     print("Validation - Stopped at Epoch:", e + 1)
     if self.hparams.checkpoint_path is not None:
         # Save the entire model
         torch.save(self.model, self.hparams.checkpoint_path)
Example #18
def main(args):
    # Step 1: Prepare graph data and retrieve train/validation/test index ============================= #
    # Load from DGL dataset
    train_dataset = PPIDataset(mode='train')
    valid_dataset = PPIDataset(mode='valid')
    test_dataset = PPIDataset(mode='test')

    # data loader
    train_loader = GraphDataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
    valid_loader = GraphDataLoader(valid_dataset, batch_size=args.batch_size, shuffle=False)
    test_loader = GraphDataLoader(test_dataset, batch_size=args.batch_size, shuffle=False)

    graph = train_dataset[0]

    # check cuda
    device = f'cuda:{args.gpu}' if args.gpu >= 0 and torch.cuda.is_available() else 'cpu'

    # retrieve the number of classes
    n_classes = train_dataset.num_labels

    # Extract node features
    n_features = graph.ndata['feat'].shape[1]

    # Step 2: Create model =================================================================== #
    model = ARMA4NC(in_dim=n_features,
                    hid_dim=args.hid_dim,
                    out_dim=n_classes,
                    num_stacks=args.num_stacks,
                    num_layers=args.num_layers,
                    activation=nn.ReLU(),
                    dropout=args.dropout).to(device)
    
    best_model = copy.deepcopy(model)

    # Step 3: Create training components ===================================================== #
    loss_fn = nn.BCEWithLogitsLoss()
    opt = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.lamb)

    # Step 4: training epoches =============================================================== #
    f1 = 0
    no_improvement = 0
    epochs = trange(args.epochs, desc='F1 & Loss')

    for _ in epochs:
        # Training
        train_loss, train_f1 = train(device, model, opt, loss_fn, train_loader)

        # Validation
        valid_loss, valid_f1 = evaluate(device, model, loss_fn, valid_loader)

        # Print out performance
        epochs.set_description(f'Train Loss {train_loss:.4f} | Train F1 {train_f1:.4f} | Valid Loss {valid_loss:.4f} | Valid F1 {valid_f1:.4f}')
        
        if valid_f1 < f1:
            no_improvement += 1
            if no_improvement == args.early_stopping:
                print('Early stop.')
                break
        else:
            no_improvement = 0
            f1 = valid_f1
            best_model = copy.deepcopy(model)

    _, test_f1 = evaluate(device, best_model, loss_fn, test_loader)

    print(f'Test F1 {test_f1:.4f}')
    return test_f1
Example #19
def learn(model_params, experiment_number, dataset):
    split_rate = model_params['split_rate']
    epochs = model_params['epochs']
    lr = model_params['lr']
    batch_size = model_params['batch_size']

    print('-' * 50)
    print(f'Model Hyper-parameters')
    print('-' * 50)
    print(f'Epochs: {epochs}')
    print(f'Split Rate: {split_rate}')
    print(f'Learning Rate: {lr}')
    print(f'Batch Size: {batch_size}')
    print('-' * 50)

    log.write('-' * 100 + '\n')
    log.write(f'Experiment #{experiment_number}\n')
    log.write(f'Model Hyper-parameters\n')
    log.write('-' * 100 + '\n')
    log.write(f'Epochs: {epochs}\n')
    log.write(f'Split Rate: {split_rate}\n')
    log.write(f'Learning Rate: {lr}\n')
    log.write(f'Batch Size: {batch_size}\n')
    log.write('-' * 100 + '\n')
    log.flush()
    workers_count = min(int(multiprocessing.cpu_count() * 0.8), batch_size)
    num_train = int(num_examples * split_rate)

    train_sampler = SubsetRandomSampler(torch.arange(num_train))
    test_sampler = SubsetRandomSampler(torch.arange(num_train, num_examples))

    train_dataloader = GraphDataLoader(dataset,
                                       sampler=train_sampler,
                                       batch_size=batch_size,
                                       drop_last=False,
                                       num_workers=workers_count)
    test_dataloader = GraphDataLoader(dataset,
                                      sampler=test_sampler,
                                      batch_size=batch_size,
                                      drop_last=False,
                                      num_workers=workers_count)

    # Model setup
    model = Classifier(1, 256, 5)
    loss_func = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    model.train()

    # Training progress display
    tqdm_train_descr_format = "Training GNN Feed-Forward model: Epoch Accuracy = {:02.4f}%, Loss = {:.8f}"
    tqdm_train_descr = tqdm_train_descr_format.format(0, float('inf'))
    tqdm_train_obj = tqdm(range(epochs), desc=tqdm_train_descr)

    # Training
    train_losses = []
    train_accuracy = []
    print(f'Training Starting...')
    log.write(f'Training Starting...' + '\n')
    log.flush()

    for i in tqdm_train_obj:
        epoch_corr = 0
        epoch_loss = 0
        total_samples = 0

        for b, (X_train, y_train) in enumerate(train_dataloader):
            y_prediction = model(X_train)
            loss = loss_func(y_prediction, y_train)

            predicted = torch.max(y_prediction.data, 1)[1]
            batch_corr = (predicted == y_train).sum()
            epoch_corr += batch_corr.detach().item()
            epoch_loss += loss.detach().item()
            total_samples += y_prediction.shape[0]

            # Update parameters
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        epoch_accuracy = epoch_corr * 100 / total_samples
        epoch_loss = epoch_loss / total_samples
        print(f'Epoch {i}, accuracy: {epoch_accuracy}, loss: {epoch_loss}')
        log.write(
            f'Epoch {i}, accuracy: {epoch_accuracy}, loss: {epoch_loss}\n')
        log.flush()

        train_losses.append(epoch_loss)
        train_accuracy.append(epoch_accuracy)

        tqdm_descr = tqdm_train_descr_format.format(epoch_accuracy, epoch_loss)
        tqdm_train_obj.set_description(tqdm_descr)

    # Test progress display
    print(f'Testing Starting...')
    log.write(f'Testing Starting...' + '\n')
    log.flush()
    tqdm_test_descr_format = "Testing GNN Feed-Forward model: Batch Accuracy = {:02.4f}%"
    tqdm_test_descr = tqdm_test_descr_format.format(0)
    tqdm_test_obj = tqdm(test_dataloader, desc=tqdm_test_descr)
    num_of_batches = len(test_dataloader)
    model.eval()

    # Testing
    total_test_sample = 0
    total_sampled_test_acc = 0
    total_argmax_test_acc = 0

    with torch.no_grad():
        for b, (X_test, y_test) in enumerate(tqdm_test_obj):
            predictions = model(X_test)
            y_test = y_test.float().view(-1, 1)
            y_predicted = torch.softmax(predictions, 1)

            y_sampled = torch.multinomial(y_predicted, 1)
            y_argmax = torch.max(y_predicted, 1)[1].view(-1, 1)

            total_sampled_test_acc += (
                y_test == y_sampled.float()).sum().item()
            total_argmax_test_acc += (y_test == y_argmax.float()).sum().item()
            total_test_sample += predictions.shape[0]

            # tqdm_descr = tqdm_train_descr_format.format(total_sampled_test_acc)
            # tqdm_train_obj.set_description(tqdm_descr)

    print(f'Total number of test samples: {total_test_sample}')
    print('Accuracy of sampled predictions on the test set: {:.4f}%'.format(
        total_sampled_test_acc * 100 / total_test_sample))
    print('Accuracy of argmax predictions on the test set: {:.4f}%'.format(
        total_argmax_test_acc * 100 / total_test_sample))
    log.write('-' * 100 + '\n')
    log.write(f'Total number of test samples: {total_test_sample}\n')
    log.write(
        'Accuracy of sampled predictions on the test set: {:.4f}%\n'.format(
            total_sampled_test_acc * 100 / total_test_sample))
    log.write(
        'Accuracy of argmax predictions on the test set: {:.4f}%\n'.format(
            total_argmax_test_acc * 100 / total_test_sample))
    log.write('-' * 100 + '\n')
    log.flush()
Example #20
def main(args, print_fn=print):
    print_fn("Experiment arguments: {}".format(args))

    if args.random_seed:
        torch.manual_seed(args.random_seed)
    else:
        torch.manual_seed(123)
    # Load dataset
    if args.dataset.startswith('ogbl'):
        graph, split_edge = load_ogb_dataset(args.dataset)
    else:
        raise NotImplementedError

    num_nodes = graph.num_nodes()

    # set gpu
    if args.gpu_id >= 0 and torch.cuda.is_available():
        device = 'cuda:{}'.format(args.gpu_id)
    else:
        device = 'cpu'

    if args.dataset == 'ogbl-collab':
        # ogbl-collab dataset is multi-edge graph
        use_coalesce = True
    else:
        use_coalesce = False

    # Generate positive and negative edges and corresponding labels
    # Sampling subgraphs and generate node labeling features
    seal_data = SEALData(g=graph, split_edge=split_edge, hop=args.hop, neg_samples=args.neg_samples,
                         subsample_ratio=args.subsample_ratio, use_coalesce=use_coalesce, prefix=args.dataset,
                         save_dir=args.save_dir, num_workers=args.num_workers, print_fn=print_fn)
    node_attribute = seal_data.ndata['feat']
    edge_weight = seal_data.edata['weight'].float()

    train_data = seal_data('train')
    val_data = seal_data('valid')
    test_data = seal_data('test')

    train_graphs = len(train_data.graph_list)

    # Set data loader

    train_loader = GraphDataLoader(train_data, batch_size=args.batch_size, num_workers=args.num_workers)
    val_loader = GraphDataLoader(val_data, batch_size=args.batch_size, num_workers=args.num_workers)
    test_loader = GraphDataLoader(test_data, batch_size=args.batch_size, num_workers=args.num_workers)

    # set model
    if args.model == 'gcn':
        model = GCN(num_layers=args.num_layers,
                    hidden_units=args.hidden_units,
                    gcn_type=args.gcn_type,
                    pooling_type=args.pooling,
                    node_attributes=node_attribute,
                    edge_weights=edge_weight,
                    node_embedding=None,
                    use_embedding=True,
                    num_nodes=num_nodes,
                    dropout=args.dropout)
    elif args.model == 'dgcnn':
        model = DGCNN(num_layers=args.num_layers,
                      hidden_units=args.hidden_units,
                      k=args.sort_k,
                      gcn_type=args.gcn_type,
                      node_attributes=node_attribute,
                      edge_weights=edge_weight,
                      node_embedding=None,
                      use_embedding=True,
                      num_nodes=num_nodes,
                      dropout=args.dropout)
    else:
        raise ValueError('Model error')

    model = model.to(device)
    parameters = model.parameters()
    optimizer = torch.optim.Adam(parameters, lr=args.lr)
    loss_fn = BCEWithLogitsLoss()
    print_fn("Total parameters: {}".format(sum([p.numel() for p in model.parameters()])))

    # train and evaluate loop
    summary_val = []
    summary_test = []
    for epoch in range(args.epochs):
        start_time = time.time()
        loss = train(model=model,
                     dataloader=train_loader,
                     loss_fn=loss_fn,
                     optimizer=optimizer,
                     device=device,
                     num_graphs=args.batch_size,
                     total_graphs=train_graphs)
        train_time = time.time()
        if epoch % args.eval_steps == 0:
            val_pos_pred, val_neg_pred = evaluate(model=model,
                                                  dataloader=val_loader,
                                                  device=device)
            test_pos_pred, test_neg_pred = evaluate(model=model,
                                                    dataloader=test_loader,
                                                    device=device)

            val_metric = evaluate_hits(args.dataset, val_pos_pred, val_neg_pred, args.hits_k)
            test_metric = evaluate_hits(args.dataset, test_pos_pred, test_neg_pred, args.hits_k)
            evaluate_time = time.time()
            print_fn("Epoch-{}, train loss: {:.4f}, hits@{}: val-{:.4f}, test-{:.4f}, "
                     "cost time: train-{:.1f}s, total-{:.1f}s".format(epoch, loss, args.hits_k, val_metric, test_metric,
                                                                      train_time - start_time,
                                                                      evaluate_time - start_time))
            summary_val.append(val_metric)
            summary_test.append(test_metric)

    summary_test = np.array(summary_test)

    print_fn("Experiment Results:")
    print_fn("Best hits@{}: {:.4f}, epoch: {}".format(args.hits_k, np.max(summary_test), np.argmax(summary_test)))
Example #21
    def train(self):
        # The number of folds (This should come from the hparams)
        k_folds = self.hparams.k_folds

        # Init the loss and accuracy reporting lists
        self.training_accuracy_list = []
        self.training_loss_list = []
        self.testing_accuracy_list = []
        self.testing_loss_list = []

        # Set fixed random number seed
        torch.manual_seed(42)

        # Define the K-fold Cross Validator
        kfold = KFold(n_splits=k_folds, shuffle=True)

        # K-fold Cross-validation model evaluation
        for fold, (train_ids,
                   test_ids) in enumerate(kfold.split(self.trainingDataset)):
            epoch_training_loss_list = []
            epoch_training_accuracy_list = []
            epoch_testing_loss_list = []
            epoch_testing_accuracy_list = []
            # Sample elements randomly from a given list of ids, no replacement.
            train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
            test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)

            # Define data loaders for training and testing data in this fold
            self.train_dataloader = GraphDataLoader(
                self.trainingDataset,
                sampler=train_subsampler,
                batch_size=self.hparams.batch_size,
                drop_last=False)
            self.test_dataloader = GraphDataLoader(
                self.trainingDataset,
                sampler=test_subsampler,
                batch_size=self.hparams.batch_size,
                drop_last=False)
            # Init the neural network
            self.model.apply(reset_weights)

            # Run the training loop for defined number of epochs
            for _ in range(self.hparams.epochs):
                num_correct = 0
                num_tests = 0
                training_temp_loss_list = []

                # Iterate over the DataLoader for training data
                for batched_graph, labels in self.train_dataloader:

                    # Zero the gradients
                    self.optimizer.zero_grad()

                    # Perform forward pass
                    pred = self.model(
                        batched_graph,
                        batched_graph.ndata[self.node_attr_key].float())

                    # Compute loss
                    if self.hparams.loss_function == "Negative Log Likelihood":
                        logp = F.log_softmax(pred, 1)
                        loss = F.nll_loss(logp, labels)
                    elif self.hparams.loss_function == "Cross Entropy":
                        loss = F.cross_entropy(pred, labels)

                    # Save loss information for reporting
                    training_temp_loss_list.append(loss.item())
                    num_correct += (pred.argmax(1) == labels).sum().item()
                    num_tests += len(labels)

                    # Perform backward pass
                    loss.backward()

                    # Perform optimization
                    self.optimizer.step()

                self.training_accuracy = num_correct / num_tests
                epoch_training_accuracy_list.append(self.training_accuracy)
                epoch_training_loss_list.append(
                    sum(training_temp_loss_list) /
                    len(training_temp_loss_list))
                self.test()
                epoch_testing_accuracy_list.append(self.testing_accuracy)
                epoch_testing_loss_list.append(self.testing_loss)
            if self.hparams.checkpoint_path is not None:
                # Save the entire model
                torch.save(self.model,
                           self.hparams.checkpoint_path + "-fold_" + str(fold))
            self.training_accuracy_list.append(epoch_training_accuracy_list)
            self.training_loss_list.append(epoch_training_loss_list)
            self.testing_accuracy_list.append(epoch_testing_accuracy_list)
            self.testing_loss_list.append(epoch_testing_loss_list)
Example #22
# This tutorial controls the train/test split with samplers from
# `torch.utils.data.sampler <https://pytorch.org/docs/stable/data.html#data-loading-order-and-sampler>`__.
# For example, it creates a training ``GraphDataLoader`` and
# test ``GraphDataLoader``, using ``SubsetRandomSampler`` to tell PyTorch
# to sample from only a subset of the dataset.
#

from dgl.dataloading import GraphDataLoader
from torch.utils.data.sampler import SubsetRandomSampler

num_examples = len(dataset)
num_train = int(num_examples * 0.8)

train_sampler = SubsetRandomSampler(torch.arange(num_train))
test_sampler = SubsetRandomSampler(torch.arange(num_train, num_examples))

train_dataloader = GraphDataLoader(
    dataset, sampler=train_sampler, batch_size=5, drop_last=False)
test_dataloader = GraphDataLoader(
    dataset, sampler=test_sampler, batch_size=5, drop_last=False)


######################################################################
# You can try to iterate over the created ``GraphDataLoader`` and see what it
# gives:
# 

it = iter(train_dataloader)
batch = next(it)
print(batch)
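
######################################################################
# Each batch is a pair of a batched graph and a label tensor. As a quick
# sketch (not part of the original tutorial), the individual graphs can be
# recovered with ``dgl.unbatch``, assuming ``dgl`` is imported earlier:
#

batched_graph, labels = batch
graphs = dgl.unbatch(batched_graph)
print(len(graphs), labels.shape)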


######################################################################
Example #23
def main(args):
    if args.gpu < 0:
        device = torch.device("cpu")
    else:
        device = torch.device("cuda:" + str(args.gpu))

    batch_size = args.batch_size
    cur_step = 0
    patience = args.patience
    best_score = -1
    best_loss = 10000
    # define loss function
    loss_fcn = torch.nn.BCEWithLogitsLoss()
    # create the dataset
    train_dataset = PPIDataset(mode='train')
    valid_dataset = PPIDataset(mode='valid')
    test_dataset = PPIDataset(mode='test')
    train_dataloader = GraphDataLoader(train_dataset, batch_size=batch_size)
    valid_dataloader = GraphDataLoader(valid_dataset, batch_size=batch_size)
    test_dataloader = GraphDataLoader(test_dataset, batch_size=batch_size)
    g = train_dataset[0]
    n_classes = train_dataset.num_labels
    num_feats = g.ndata['feat'].shape[1]
    g = g.int().to(device)
    heads = ([args.num_heads] * (args.num_layers-1)) + [args.num_out_heads]
    # define the model
    model = GAT(g,
                args.num_layers,
                num_feats,
                args.num_hidden,
                n_classes,
                heads,
                F.elu,
                args.in_drop,
                args.attn_drop,
                args.alpha,
                args.residual)
    # define the optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    model = model.to(device)
    for epoch in range(args.epochs):
        model.train()
        loss_list = []
        for batch, subgraph in enumerate(train_dataloader):
            subgraph = subgraph.to(device)
            model.g = subgraph
            for layer in model.gat_layers:
                layer.g = subgraph
            logits = model(subgraph.ndata['feat'].float())
            loss = loss_fcn(logits, subgraph.ndata['label'])
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_list.append(loss.item())
        loss_data = np.array(loss_list).mean()
        print("Epoch {:05d} | Loss: {:.4f}".format(epoch + 1, loss_data))
        if epoch % 5 == 0:
            score_list = []
            val_loss_list = []
            for batch, subgraph in enumerate(valid_dataloader):
                subgraph = subgraph.to(device)
                score, val_loss = evaluate(subgraph.ndata['feat'], model, subgraph, subgraph.ndata['label'], loss_fcn)
                score_list.append(score)
                val_loss_list.append(val_loss)
            mean_score = np.array(score_list).mean()
            mean_val_loss = np.array(val_loss_list).mean()
            print("Val F1-Score: {:.4f} ".format(mean_score))
            # early stop
            if mean_score > best_score or best_loss > mean_val_loss:
                if mean_score > best_score and best_loss > mean_val_loss:
                    val_early_loss = mean_val_loss
                    val_early_score = mean_score
                best_score = np.max((mean_score, best_score))
                best_loss = np.min((best_loss, mean_val_loss))
                cur_step = 0
            else:
                cur_step += 1
                if cur_step == patience:
                    break
    test_score_list = []
    for batch, subgraph in enumerate(test_dataloader):
        subgraph = subgraph.to(device)
        score, test_loss = evaluate(subgraph.ndata['feat'], model, subgraph, subgraph.ndata['label'], loss_fcn)
        test_score_list.append(score)
    print("Test F1-Score: {:.4f}".format(np.array(test_score_list).mean()))
Example #24
if __name__ == "__main__":
    from dgl.dataloading import GraphDataLoader
    import networkx as nx
    import matplotlib.pyplot as plt
    from tqdm import tqdm

    HighD_dataset = HighD_Dataset(X_len=20,
                                  X_step=1,
                                  Y_len=20,
                                  Y_step=2,
                                  diff=5,
                                  name='data_22',
                                  raw_dir='./')
    HighD_dataloader = GraphDataLoader(HighD_dataset,
                                       batch_size=1,
                                       shuffle=True)
    print("Dataset Ready!")

    with tqdm(total=len(HighD_dataloader)) as pbar:
        for i, (graph, X, Y, mask) in enumerate(HighD_dataloader):
            # if i==1:
            nx.draw(graph.to_networkx(), with_labels=True)
            pbar.set_postfix({"mask_shape": mask.shape})
            pbar.update(1)
        # if (i==466):
        #     print(X["feature"][0,10])
        #     print(mask[0,10])
        #     nx.draw(X["graph"][10].to_networkx(), with_labels=True)
        #     plt.show()
Example #25
def main(args):
    # Step 1: Prepare graph data and retrieve train/validation/test index ============================= #
    # Load dataset
    train_dataset = PPIDataset(mode='train')
    valid_dataset = PPIDataset(mode='valid')
    test_dataset = PPIDataset(mode='test')
    train_dataloader = GraphDataLoader(train_dataset,
                                       batch_size=args.batch_size)
    valid_dataloader = GraphDataLoader(valid_dataset,
                                       batch_size=args.batch_size)
    test_dataloader = GraphDataLoader(test_dataset, batch_size=args.batch_size)

    # check cuda
    if args.gpu >= 0 and th.cuda.is_available():
        device = 'cuda:{}'.format(args.gpu)
    else:
        device = 'cpu'

    num_classes = train_dataset.num_labels

    # Extract node features
    graph = train_dataset[0]
    feat = graph.ndata['feat']

    # Step 2: Create model =================================================================== #
    if args.lazy:
        model = GeniePathLazy(in_dim=feat.shape[-1],
                              out_dim=num_classes,
                              hid_dim=args.hid_dim,
                              num_layers=args.num_layers,
                              num_heads=args.num_heads,
                              residual=args.residual)
    else:
        model = GeniePath(in_dim=feat.shape[-1],
                          out_dim=num_classes,
                          hid_dim=args.hid_dim,
                          num_layers=args.num_layers,
                          num_heads=args.num_heads,
                          residual=args.residual)

    model = model.to(device)

    # Step 3: Create training components ===================================================== #
    loss_fn = th.nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # Step 4: training epochs =============================================================== #
    for epoch in range(args.max_epoch):
        model.train()
        tr_loss = 0
        tr_f1 = 0
        num_blocks = 0
        for subgraph in train_dataloader:
            subgraph = subgraph.to(device)
            label = subgraph.ndata['label']
            feat = subgraph.ndata['feat']
            logits = model(subgraph, feat)

            # compute loss
            batch_loss = loss_fn(logits, label)
            tr_loss += batch_loss.item()
            tr_predict = np.where(logits.data.cpu().numpy() >= 0., 1, 0)
            tr_f1 += f1_score(label.cpu(), tr_predict, average='micro')
            num_blocks += 1

            # backward
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        # validation
        model.eval()
        val_f1, val_loss = evaluate(model, loss_fn, valid_dataloader, device)

        print(
            "In epoch {}, Train F1: {:.4f} | Train Loss: {:.4f}; Valid F1: {:.4f} | Valid loss: {:.4f}"
            .format(epoch, tr_f1 / num_blocks, tr_loss / num_blocks, val_f1,
                    val_loss))

    # Test after all epoch
    model.eval()
    test_f1, test_loss = evaluate(model, loss_fn, test_dataloader, device)

    print("Test F1: {:.4f} | Test loss: {:.4f}".format(test_f1, test_loss))
Example #26
###############################################################################
# Setup and training
# ------------------
# Create a synthetic dataset of :math:`400` graphs with :math:`10` ~
# :math:`20` nodes. :math:`320` graphs constitute a training set and
# :math:`80` graphs constitute a test set.

import torch.optim as optim
from dgl.dataloading import GraphDataLoader

# Create training and test sets.
trainset = MiniGCDataset(320, 10, 20)
testset = MiniGCDataset(80, 10, 20)
# Use DGL's GraphDataLoader. It by default handles the
# graph batching operation for every mini-batch.
data_loader = GraphDataLoader(trainset, batch_size=32, shuffle=True)

# Create model
model = Classifier(1, 256, trainset.num_classes)
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
model.train()

epoch_losses = []
for epoch in range(80):
    epoch_loss = 0
    for iter, (bg, label) in enumerate(data_loader):
        prediction = model(bg)
        loss = loss_func(prediction, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.detach().item()
    epoch_loss /= (iter + 1)
    print('Epoch {}, loss {:.4f}'.format(epoch, epoch_loss))
    epoch_losses.append(epoch_loss)
Example #27
    val_idx = all_idx[:val_num]
    test_idx = all_idx[val_num : val_num + test_num]
    train_idx = all_idx[val_num + test_num : val_num + test_num + args.train_num]

    train_data = Subset(dataset, train_idx)
    val_data = Subset(dataset, val_idx)
    test_data = Subset(dataset, test_idx)

    unsup_idx = all_idx[val_num + test_num:]
    unsup_data = Subset(dataset, unsup_idx)

    # generate supervised training dataloader and unsupervised training dataloader
    train_loader = GraphDataLoader(train_data,
                                   batch_size=args.batch_size,
                                   collate_fn=collate,
                                   drop_last=False,
                                   shuffle=True)

    unsup_loader = GraphDataLoader(unsup_data,
                                   batch_size=args.batch_size,
                                   collate_fn=collate,
                                   drop_last=False,
                                   shuffle=True)

    # generate validation & testing dataloader

    val_loader = GraphDataLoader(val_data,
                                 batch_size=args.val_batch_size,
                                 collate_fn=collate,
                                 drop_last=False,
Example #28
def main():
    parser = argparse.ArgumentParser(description='ENZYMES')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--num_workers', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--num_layers', type=int, default=4)
    parser.add_argument('--hidden_size', type=int, default=128)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--eval',
                        action='store_true',
                        help='If not set, we will only do the training part.')
    parser.add_argument('--eval_batch_size', type=int, default=2048)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = LegacyTUDataset('ENZYMES')
    num_samples = len(dataset)
    indices = np.arange(num_samples)
    np.random.seed(42)
    np.random.shuffle(indices)

    train_set = dgl.data.utils.Subset(dataset,
                                      indices[:int(num_samples * 0.8)])
    val_set = dgl.data.utils.Subset(
        dataset, indices[int(num_samples * 0.8):int(num_samples * 0.9)])
    test_set = dgl.data.utils.Subset(
        dataset, indices[int(num_samples * 0.9):int(num_samples)])

    train_loader = GraphDataLoader(train_set,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers)
    val_loader = GraphDataLoader(val_set,
                                 batch_size=args.eval_batch_size,
                                 shuffle=True,
                                 num_workers=0)
    test_loader = GraphDataLoader(test_set,
                                  batch_size=args.eval_batch_size,
                                  shuffle=True,
                                  num_workers=0)

    model = GCN(18,
                args.hidden_size,
                num_classes=int(dataset.num_labels),
                num_layers=args.num_layers,
                dropout=args.dropout).to(device)

    logger = Logger(args.runs, args)
    dur = []
    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        for epoch in range(1, args.epochs + 1):
            t0 = time.time()
            loss = train(model, device, train_loader, optimizer)
            if epoch >= 3:
                dur.append(time.time() - t0)
                print('Training time/epoch {}'.format(np.mean(dur)))

            if not args.eval:
                continue

            val_acc = test(model, device, val_loader)
            test_acc = test(model, device, test_loader)
            logger.add_result(run, (0.0, val_acc, test_acc))

            if epoch % args.log_steps == 0:
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Valid: {val_acc * 100:.4f}% '
                      f'Test: {test_acc * 100:.4f}%')

        if args.eval:
            logger.print_statistics(run)
    if args.eval:
        logger.print_statistics()
Example #29
def main():
    parser = argparse.ArgumentParser(description='OGBN-MolHiv')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--num_layers', type=int, default=5)
    parser.add_argument('--emb_dim', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--eval',
                        action='store_true',
                        help='If not set, we will only do the training part.')
    parser.add_argument('--eval_batch_size', type=int, default=2048)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = DglGraphPropPredDataset(name='ogbg-molhiv')
    split_idx = dataset.get_idx_split()

    evaluator = Evaluator(name='ogbg-molhiv')
    train_loader = GraphDataLoader(dataset[split_idx["train"]],
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers)
    val_loader = GraphDataLoader(dataset[split_idx["valid"]],
                                 batch_size=args.eval_batch_size,
                                 shuffle=True,
                                 num_workers=0)
    test_loader = GraphDataLoader(dataset[split_idx["test"]],
                                  batch_size=args.eval_batch_size,
                                  shuffle=True,
                                  num_workers=0)

    model = GCN(args.emb_dim,
                num_classes=dataset.num_tasks,
                num_layers=args.num_layers,
                dropout=args.dropout).to(device)

    logger = Logger(args.runs, args)
    dur = []
    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        for epoch in range(1, args.epochs + 1):
            t0 = time.time()
            loss = train(model, device, train_loader, optimizer)
            if epoch >= 3:
                dur.append(time.time() - t0)
                print('Training time/epoch {}'.format(np.mean(dur)))

            if not args.eval:
                continue

            val_rocauc = test(model, device, val_loader,
                              evaluator)[dataset.eval_metric]
            test_rocauc = test(model, device, test_loader,
                               evaluator)[dataset.eval_metric]
            logger.add_result(run, (0.0, val_rocauc, test_rocauc))

            if epoch % args.log_steps == 0:
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Valid: {val_rocauc:.4f} '
                      f'Test: {test_rocauc:.4f}')

        if args.eval:
            logger.print_statistics(run)
    if args.eval:
        logger.print_statistics()
Example #30
def main(args):
    data = FB15k237Dataset(reverse=False)
    graph = data[0]
    num_nodes = graph.num_nodes()
    num_rels = data.num_rels

    train_g, test_g = preprocess(graph, num_rels)
    test_nids = th.arange(0, num_nodes)
    test_mask = graph.edata['test_mask']
    subg_iter = SubgraphIterator(train_g, num_rels, args.edge_sampler)
    dataloader = GraphDataLoader(subg_iter,
                                 batch_size=1,
                                 collate_fn=lambda x: x[0])

    # Prepare data for metric computation
    src, dst = graph.edges()
    triplets = th.stack([src, graph.edata['etype'], dst], dim=1)

    model = LinkPredict(num_nodes, num_rels)
    optimizer = th.optim.Adam(model.parameters(), lr=1e-2)

    if args.gpu >= 0 and th.cuda.is_available():
        device = th.device(args.gpu)
    else:
        device = th.device('cpu')
    model = model.to(device)

    best_mrr = 0
    model_state_file = 'model_state.pth'
    for epoch, batch_data in enumerate(dataloader):
        model.train()

        g, train_nids, edges, labels = batch_data
        g = g.to(device)
        train_nids = train_nids.to(device)
        edges = edges.to(device)
        labels = labels.to(device)

        embed = model(g, train_nids)
        loss = model.get_loss(embed, edges, labels)
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(),
                                 max_norm=1.0)  # clip gradients
        optimizer.step()

        print("Epoch {:04d} | Loss {:.4f} | Best MRR {:.4f}".format(
            epoch, loss.item(), best_mrr))

        if (epoch + 1) % 500 == 0:
            # perform validation on CPU because full graph is too large
            model = model.cpu()
            model.eval()
            print("start eval")
            embed = model(test_g, test_nids)
            mrr = calc_mrr(embed,
                           model.w_relation,
                           test_mask,
                           triplets,
                           batch_size=500,
                           eval_p=args.eval_protocol)
            # save best model
            if best_mrr < mrr:
                best_mrr = mrr
                th.save({
                    'state_dict': model.state_dict(),
                    'epoch': epoch
                }, model_state_file)

            model = model.to(device)

    print("Start testing:")
    # use best model checkpoint
    checkpoint = th.load(model_state_file)
    model = model.cpu()  # test on CPU
    model.eval()
    model.load_state_dict(checkpoint['state_dict'])
    print("Using best epoch: {}".format(checkpoint['epoch']))
    embed = model(test_g, test_nids)
    calc_mrr(embed,
             model.w_relation,
             test_mask,
             triplets,
             batch_size=500,
             eval_p=args.eval_protocol)