Example #1
def train(model,
          num_epochs,
          dataset,
          train_loader,
          optimizer,
          criterion,
          device="cpu",
          k=5,
          all_loader=None):
    history = {'train': [], 'val': [], 'test': []}
    for epoch in range(num_epochs):
        model.train()

        for batch_size, n_id, adjs in train_loader:
            # `adjs` holds a list of `(edge_index, e_id, size)` tuples.
            adjs = [adj.to(device) for adj in adjs]

            num_users_per_res, users = dataset.get_visited_users(
                n_id[:batch_size], k)
            unique_users, inverse_idx = np.unique(users, return_inverse=True)
            unique_users = torch.LongTensor(unique_users)

            user_loader = NeighborSampler(dataset.user_pyg_graph.edge_index,
                                          node_idx=unique_users,
                                          sizes=[-1, -1],
                                          batch_size=unique_users.shape[0],
                                          shuffle=False)

            for num_users, u_id, u_adjs in user_loader:
                # this loop runs exactly once; a for loop is used because
                # next(iter(user_loader)) appears to be buggy
                u_adjs = [adj.to(device) for adj in u_adjs]

            del user_loader

            optimizer.zero_grad()
            out = model(dataset.res_x[n_id], adjs, dataset.user_x[u_id],
                        u_adjs, inverse_idx, num_users_per_res)
            # out = model(dataset.res_x[n_id], adjs)
            label = dataset.labels[n_id[:batch_size]]
            loss = criterion(out, label)
            loss.backward()
            optimizer.step()

        if all_loader is not None:
            train_acc, val_acc, test_acc = test(model,
                                                dataset,
                                                all_loader,
                                                device,
                                                k=k)
            print(
                f'epoch: {epoch + 1}, Train: {train_acc:.4f}, Val: {val_acc:.4f}, '
                f'Test: {test_acc:.4f}')
            history['train'].append(train_acc)
            history['val'].append(val_acc)
            history['test'].append(test_acc)

    return history
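The one-pass for loop over user_loader above (repeated in several later examples) is an idiom for pulling the single batch that a NeighborSampler produces when batch_size equals the number of seed nodes. A minimal sketch of a helper that makes the intent explicit; the name take_single_batch is hypothetical and not part of the original code:

def take_single_batch(loader):
    # Iterating triggers the sampler's normal collate path, which the
    # original comments report as more reliable than next(iter(loader)).
    for batch in loader:
        return batch
    raise ValueError("loader yielded no batches")

With it, the inner loop collapses to num_users, u_id, u_adjs = take_single_batch(user_loader), followed by moving each adj to the device.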
Example #2
def main():
    dataset = 'ogb'
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    num_layers = 3
    sizes = [10 for _ in range(num_layers)]
    hidden_channels = 256
    epochs = 200
    lr = 0.01
    log_steps = 1
    batch_size = 1024 * 4

    data = Dataset(root=Path('../../dataset'), name=dataset)
    train_loader = NeighborSampler(data.adj_t,
                                   sizes=sizes,
                                   node_idx=data.split_idx['train'],
                                   batch_size=batch_size,
                                   shuffle=True)
    subgraph_loader = NeighborSampler(data.adj_t,
                                      sizes=[-1, -1, -1],
                                      node_idx=None,
                                      batch_size=batch_size,
                                      shuffle=False)

    model = Graphsage(num_layers, data.x.shape[1], hidden_channels,
                      data.num_classes).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    test_scores = []
    for epoch in range(epochs + 1):
        loss = train(model, data, train_loader, optimizer, device)
        result = test(model, data, subgraph_loader, device)

        if epoch % log_steps == 0:
            train_res, valid_res, test_res = result
            test_scores.append(test_res)
            print(f'Run: {1:02d}, '
                  f'Epoch: {epoch:02d}, '
                  f'Loss: {loss:.4f}, '
                  f'Metric: {data.metric}, '
                  f'Train: {100 * train_res:.2f}%, '
                  f'Valid: {100 * valid_res:.2f}%, '
                  f'Test: {100 * test_res:.2f}%')
    print(f"Best test accuracy: {max(test_scores) * 100:.2f}%")
Example #3
 def test_dataloader(self):
     return NeighborSampler(self.data.adj_t,
                            node_idx=self.data.test_mask,
                            sizes=[25, 10],
                            return_e_id=False,
                            transform=self.convert_batch,
                            batch_size=2048,
                            num_workers=3,
                            persistent_workers=True)
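Several of the Lightning-style examples on this page pass transform=self.convert_batch so the sampler emits ready-to-use batches. The class body is not shown; a plausible sketch of such a transform (the Batch tuple and its fields are assumptions):

from typing import List, NamedTuple
import torch
from torch_sparse import SparseTensor

class Batch(NamedTuple):
    x: torch.Tensor
    y: torch.Tensor
    adjs_t: List[SparseTensor]

def convert_batch(self, batch_size, n_id, adjs):
    # Gather features for every sampled node and labels for the seed
    # nodes; with return_e_id=False each adj is (adj_t, None, size).
    return Batch(
        x=self.data.x[n_id],
        y=self.data.y[n_id[:batch_size]].long(),
        adjs_t=[adj_t for adj_t, _, _ in adjs],
    )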
Example #4
 def test_dataloader(self, batch_size=32, transforms=None):
     return NeighborSampler(
         self.data.adj_t[self.split_idx["test"]],
         node_idx=self.split_idx["test"],
         sizes=[-1, 10],
         batch_size=batch_size,
         shuffle=True,
         num_workers=self._num_workers,
     )
Example #5
 def train_dataloader(self):
     return NeighborSampler(self.adj_t,
                            node_idx=self.train_idx,
                            sizes=self.sizes,
                            return_e_id=False,
                            transform=self.convert_batch,
                            batch_size=self.batch_size,
                            shuffle=True,
                            num_workers=4)
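Examples 3, 5, 8, 10 and 11 are dataloader hooks of a PyTorch Lightning LightningDataModule that wraps one NeighborSampler per split. A condensed sketch of how such hooks fit together (class name and attributes are assumptions):

import pytorch_lightning as pl
from torch_geometric.data import NeighborSampler

class GraphDataModule(pl.LightningDataModule):
    def __init__(self, data, sizes=(25, 10), batch_size=1024):
        super().__init__()
        self.data, self.sizes, self.batch_size = data, list(sizes), batch_size

    def train_dataloader(self):
        # Shuffled, sampled neighborhoods for training batches.
        return NeighborSampler(self.data.adj_t, node_idx=self.data.train_mask,
                               sizes=self.sizes, batch_size=self.batch_size,
                               shuffle=True)

    def val_dataloader(self):
        # Deterministic order for validation.
        return NeighborSampler(self.data.adj_t, node_idx=self.data.val_mask,
                               sizes=self.sizes, batch_size=self.batch_size,
                               shuffle=False)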
Example #6
def train_model_and_save_embeddings(dataset, data, epochs, learning_rate,
                                    device):
    # Define Model
    encoder = EmbeddingEncoder(emb_dim=200,
                               out_channels=64,
                               n_nodes=dataset.num_nodes).to(device)

    decoder = CosineSimDecoder().to(device)

    model = VGAE(encoder=encoder, decoder=decoder).to(device)

    node_features, train_pos_edge_index = data.x.to(
        device), data.edge_index.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # data.edge_index = data.edge_index.long()

    assert data.edge_index.max().item() < dataset.num_nodes

    data_loader = NeighborSampler(data,
                                  size=[25, 10],
                                  num_hops=2,
                                  batch_size=10000,
                                  shuffle=False,
                                  add_self_loops=False)

    model.train()

    for epoch in tqdm(range(epochs)):
        epoch_loss = 0.0
        for data_flow in tqdm(data_loader()):
            optimizer.zero_grad()

            data_flow = data_flow.to(device)
            block = data_flow[0]
            embeddings = model.encode(
                node_features[block.n_id], block.edge_index
            )  # TODO Avoid computation of all node features!

            loss = model.recon_loss(embeddings, block.edge_index)
            loss = loss + (1 / len(block.n_id)) * model.kl_loss()

            epoch_loss += loss.item()

            # Compute gradients
            loss.backward()
            # Perform optimization step
            optimizer.step()

        z = model.encode(node_features, train_pos_edge_index)

        torch.save(z.cpu(), "large_emb.pt")

        print(f"Loss after epoch {epoch} / {epochs}: {epoch_loss}")

    return model
Example #7
    def get_neighbor_sampler(self, train_sizes, b1=50, b2=1000, nworkers=50):
        """Neighbor sampler for this data
        """

        # neighbor sampler for training
        self.train_loader = NeighborSampler(self.edge_index,
                                            node_idx=None,
                                            sizes=train_sizes,
                                            batch_size=b1,
                                            num_workers=nworkers,
                                            shuffle=False)

        # neighbor sampler for testing (single-layer sampler)
        self.test_loader = NeighborSampler(self.edge_index,
                                           node_idx=None,
                                           sizes=[-1],
                                           batch_size=b2,
                                           num_workers=nworkers,
                                           shuffle=False)
Example #8
 def test_dataloader(self):  # Test best validation model once again.
     return NeighborSampler(
         self.adj_t,
         node_idx=self.val_idx,
         sizes=self.sizes,
         return_e_id=False,
         transform=self.convert_batch,
         batch_size=self.batch_size,
         num_workers=2,
     )
Example #9
    def __init__(self,
                 dataset,
                 neighbor_sizes,
                 node_types=None,
                 metapaths=None,
                 head_node_type=None,
                 directed=True,
                 resample_train=None,
                 add_reverse_metapaths=True,
                 inductive=False):
        self.neighbor_sizes = neighbor_sizes
        super(HeteroNeighborSampler,
              self).__init__(dataset, node_types, metapaths, head_node_type,
                             directed, resample_train, add_reverse_metapaths,
                             inductive)

        if self.use_reverse:
            self.add_reverse_edge_index(self.edge_index_dict)

        # Ensure head_node_type is the first entry in num_nodes_dict, since
        # NeighborSampler.sample() indexes only the first node type
        num_nodes_dict = OrderedDict([(node_type,
                                       self.num_nodes_dict[node_type])
                                      for node_type in self.node_types])

        self.edge_index, self.edge_type, self.node_type, self.local_node_idx, self.local2global, self.key2int = \
            group_hetero_graph(self.edge_index_dict, num_nodes_dict)

        self.int2node_type = {
            type_int: node_type
            for node_type, type_int in self.key2int.items()
            if node_type in self.node_types
        }
        self.int2edge_type = {
            type_int: edge_type
            for edge_type, type_int in self.key2int.items()
            if edge_type in self.edge_index_dict
        }

        self.neighbor_sampler = NeighborSampler(self.edge_index,
                                                node_idx=self.training_idx,
                                                sizes=self.neighbor_sizes,
                                                batch_size=128,
                                                shuffle=True)
Example #10
 def hidden_test_dataloader(self):
     return NeighborSampler(
         self.adj_t,
         node_idx=self.test_idx,
         sizes=self.sizes,
         return_e_id=False,
         transform=self.convert_batch,
         batch_size=self.batch_size,
         num_workers=3,
     )
Example #11
 def train_dataloader(self):
     return NeighborSampler(self.data.adj_t,
                            node_idx=self.data.train_mask,
                            sizes=[25, 10],
                            return_e_id=False,
                            transform=self.convert_batch,
                            batch_size=1024,
                            shuffle=True,
                            num_workers=6,
                            persistent_workers=True)
Example #12
 def create_neighbor_sampler(self, batch_size=2, stage=None):
     return NeighborSampler(
         self.data.edge_index,
         # the nodes that should be considered for sampling.
         node_idx=getattr(self.data, f"{stage}_mask"),
         # -1 indicates all neighbors will be selected
         sizes=[self._num_layers, -1],
         num_workers=self._num_workers,
         drop_last=self._drop_last,
         pin_memory=self._pin_memory,
     )
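Note that each entry of sizes is the neighbor fan-out for one hop, and -1 means all neighbors at that hop, so sizes=[self._num_layers, -1] describes a two-hop sampler whose first fan-out happens to equal the layer count. If the intent was one full-neighborhood hop per layer, a per-layer spec would look like this sketch (num_layers is assumed to be the number of hops):

sizes = [-1] * num_layers  # one entry per hop, all neighbors kept at each hop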
Example #13
def forward(model, i, data):
    # reworked to use PyG's Data format
    items, targets, mask, batch, seq = data.x, data.y, data.sequence_mask, data.batch, data.sequence
    seq = seq.view(targets.shape[0], -1)
    mask = mask.view(targets.shape[0], -1)

    A = []
    # datas = data.to_data_list()
    # graphs = [to_networkx(d) for d in datas]
    # A = [nx.convert_matrix.to_pandas_adjacency(g).values for g in graphs]  # undirected adj = in + out
    # A_out = [g for g in graphs]  # a directed graph's adj is just A_out

    # TODO: fix the high CPU usage here
    # global graph
    gg = model.global_data
    gg_edge_index = gg.edge_index
    # run NeighborSampler over all nodes in the batch at once
    batch_nodes = seq.flatten()
    # batch_nodes = torch.unique(batch_nodes)  # keep only unique node ids across the batch sessions
    # batch_nodes = batch_nodes[batch_nodes!=0]  # drop the padding node id
    # sample as a whole batch: look up neighbors of the session-graph node ids in the global graph
    # subgraph_loaders = NeighborSampler(gg_edge_index, node_idx=batch_nodes, sizes=[-1], shuffle=False, num_workers=0, batch_size=batch_nodes.shape[0])  # all neighbors
    # FIXME: currently uses all nodes
    subgraph_loaders = NeighborSampler(
        gg_edge_index,
        node_idx=batch_nodes,
        sizes=[10, 5],
        shuffle=False,
        num_workers=0,
        batch_size=batch_nodes.shape[0])  # 2 hop

    hidden, pad, g_h = model(items, A, data.edge_index,
                             subgraph_loaders)  # session graph node embeddings
    # map the node embeddings back to the original sequences
    sections = torch.bincount(batch).cpu().numpy()
    # split whole x back into graphs G_i
    hidden = torch.split(hidden, tuple(sections))

    # TODO: add an option to ignore padding
    mask_true = True
    if mask_true:
        leng = mask.shape[1]  # session length after padding
        alias_inputs = data.alias_inputs
        s_len = data.sequence_len.cpu().numpy().tolist()
        alias_inputs = torch.split(alias_inputs, s_len)
        seq_hidden = torch.stack([
            get(pad, i, hidden, alias_inputs, leng)
            for i in torch.arange(len(alias_inputs)).long()
        ])
        g_h = g_h.view([len(hidden), leng, -1])
    else:
        seq_hidden = hidden
    seq_hidden += g_h
    return targets, model.compute_scores(seq_hidden, mask, mask_true)
Example #14
    def make_graph_sampler(self):
        graph = Data(
            edge_index=self.edge_index, edge_attr=self.edge_weight, num_nodes=self.num_nodes
        ).to('cpu')

        graph_sampler = NeighborSampler(
            # graph, size=[5, 5], num_hops=2, batch_size=100, shuffle=self.shuffle, add_self_loops=True
            graph, size=[10, 15], num_hops=2, batch_size=250, shuffle=self.shuffle, add_self_loops=True
        )

        return graph_sampler
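This snippet uses the legacy constructor (a Data object plus size= and num_hops=), which predates the sizes= interface used elsewhere on this page; add_self_loops also has no counterpart in the newer sampler and would move into the model or a transform. A rough new-API equivalent, assuming the same intended fan-outs:

graph_sampler = NeighborSampler(self.edge_index,
                                sizes=[10, 15],  # one fan-out per hop, replacing size=[10, 15] with num_hops=2
                                batch_size=250,
                                shuffle=self.shuffle)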
Example #15
    def fit(self, model, data):
        data = data[0]
        self.model = model.to(self.device)
        self.data = data
        self.test_gpu_volume()
        self.subgraph_loader = NeighborSampler(
            data.edge_index,
            sizes=[-1],
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=10,
        )

        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          lr=self.lr,
                                          weight_decay=self.weight_decay)
        self.edge_index, _ = add_remaining_self_loops(
            data.edge_index,
            torch.ones(data.edge_index.shape[1]).to(data.x.device), 1,
            data.x.shape[0])
        self.train_index = torch.where(data.train_mask)[0].tolist()

        epoch_iter = tqdm(range(self.max_epoch))
        patience = 0
        best_score = 0
        best_loss = np.inf
        max_score = 0
        min_loss = np.inf
        for epoch in epoch_iter:
            self._train_step()
            if epoch % 5 == 0:
                val_acc, val_loss = self._test_step(split="val")
                epoch_iter.set_description(
                    f"Epoch: {epoch:03d}, Val: {val_acc:.4f}")
                if val_loss <= min_loss or val_acc >= max_score:
                    if val_acc >= best_score:  # SAINT loss is not accurate
                        best_loss = val_loss
                        best_score = val_acc
                        best_model = copy.deepcopy(self.model)
                    min_loss = np.min((min_loss, val_loss))
                    max_score = np.max((max_score, val_acc))
                    patience = 0
                else:
                    patience += 1
                    if patience == self.patience:
                        self.model = best_model
                        epoch_iter.close()
                        break
        test_acc, _ = self._test_step(split="test")
        val_acc, _ = self._test_step(split="val")
        print(f"Test accuracy = {test_acc}")
        return dict(Acc=test_acc, ValAcc=val_acc)
Example #16
 def create_neighbor_sampler(self,
                             batch_size=2,
                             transforms=None,
                             stage=None):
     return NeighborSampler(
         self.data.edge_index,
         node_idx=getattr(self.data, f"{stage}_mask"),
         sizes=self._sizes,
         num_workers=self._num_workers,
         drop_last=self._drop_last,
         pin_memory=self._pin_memory,
     )
Example #17
 def create_loader_from_cls(self, loader_cls=None, params=None, stage=None):
     if not hasattr(self, "_loaded_dataset"):
         self._loaded_dataset = loader_cls(self.data, **params)
     dataset = Batch.from_data_list([d for d in self._loaded_dataset])
     return NeighborSampler(
         dataset.edge_index,
         node_idx=getattr(dataset, f"{stage}_mask"),
         sizes=self._sizes,
         num_workers=self._num_workers,
         drop_last=self._drop_last,
         pin_memory=self._pin_memory,
     )
Example #18
 def create_neighbor_sampler(self, batch_size=2, stage=None):
     # https://github.com/rusty1s/pytorch_geometric/tree/master/torch_geometric/data/sampler.py#L18
     return NeighborSampler(
         self.data.edge_index,
         # the nodes that should be considered for sampling.
         node_idx=getattr(self.data, f"{stage}_mask"),
         # -1 indicates all neighbors will be selected
         sizes=[self._num_layers, -1],
         num_workers=self._num_workers,
         drop_last=self._drop_last,
         pin_memory=self._pin_memory,
     )
Example #19
def test_sampler():
    num_nodes = 10
    data = Data(edge_index=erdos_renyi_graph(num_nodes, 0.1))
    data.num_nodes = num_nodes

    loader = NeighborSampler(data,
                             size=[4, 0.5],
                             num_hops=2,
                             batch_size=2,
                             shuffle=True)

    for data_flow in loader():
        assert data_flow.__repr__()[:8] == 'DataFlow'
        assert data_flow.n_id.size() == (2, )
        assert data_flow.batch_size == 2
        assert len(data_flow) == 2
        block = data_flow[0]
        assert block.__repr__()[:5] == 'Block'
        for block in data_flow:
            pass
        data_flow = data_flow.to(torch.long)
        break
    for data_flow in loader(torch.tensor([0, 1, 2, 3, 4])):
        pass

    loader = NeighborSampler(data,
                             size=[4, 0.5],
                             num_hops=2,
                             batch_size=3,
                             drop_last=True,
                             shuffle=False,
                             add_self_loops=True)

    for data_flow in loader():
        pass
    for data_flow in loader(torch.tensor([0, 1, 2, 3, 4])):
        pass
    mask = torch.tensor([0, 1, 0, 1, 0, 1, 0, 1, 0, 1], dtype=torch.uint8)
    for data_flow in loader(mask):
        pass
Example #20
def test_sampler():
    torch.manual_seed(12345)
    edge_index = erdos_renyi_graph(num_nodes=10, edge_prob=0.5)
    E = edge_index.size(1)

    loader = NeighborSampler(edge_index, sizes=[2, 4], batch_size=2)
    assert loader.__repr__() == 'NeighborSampler(sizes=[2, 4])'
    assert len(loader) == 5

    for batch_size, n_id, adjs in loader:
        assert batch_size == 2
        assert all(np.isin(n_id, torch.arange(10)).tolist())
        assert n_id.unique().size(0) == n_id.size(0)
        for (edge_index, e_id, size) in adjs:
            assert int(edge_index[0].max() + 1) <= size[0]
            assert int(edge_index[1].max() + 1) <= size[1]
            assert all(np.isin(e_id, torch.arange(E)).tolist())
            assert e_id.unique().size(0) == e_id.size(0)
            assert size[0] >= size[1]

    out = loader.sample([1, 2])
    assert len(out) == 3
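loader.sample() mirrors one step of iteration and returns the same (batch_size, n_id, adjs) triple, which is what the final assertion checks:

batch_size, n_id, adjs = loader.sample(torch.tensor([1, 2]))  # identical structure to iterating the loader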
Example #21
def train(model, num_epochs, dataset, optimizer, criterion, device="cpu", k=5):
    for epoch in range(num_epochs):
        model.train()

        for target_bus, business_ids, distances, business_lens, labels in dataset.get_batch(
                2048, "train"):
            num_users_per_res, users = dataset.get_visited_users(target_bus, k)
            unique_users, inverse_idx = np.unique(users, return_inverse=True)
            unique_users = torch.LongTensor(unique_users)

            user_loader = NeighborSampler(dataset.user_pyg_graph.edge_index,
                                          node_idx=unique_users,
                                          sizes=[-1, -1],
                                          batch_size=unique_users.shape[0],
                                          shuffle=False)

            for _, u_id, u_adjs in user_loader:
                # this loop runs exactly once; a for loop is used because
                # next(iter(user_loader)) appears to be buggy
                u_adjs = [adj.to(device) for adj in u_adjs]

            del user_loader

            business_ids = business_ids.to(device)
            business_lens = business_lens.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            out = model(dataset.res_x,
                        target_bus,
                        business_ids,
                        distances,
                        business_lens,
                        dataset,
                        device,
                        user_x=dataset.user_x[u_id],
                        u_adjs=u_adjs,
                        inverse_idx=inverse_idx,
                        num_users_per_res=num_users_per_res)
            loss = criterion(out, labels)
            loss.backward()
            optimizer.step()

        if epoch % 5 == 0:
            train_acc = test(model, dataset, device, "train", k=k)
            val_acc = test(model, dataset, device, "val", k=k)
            test_acc = test(model, dataset, device, "test", k=k)
            print(
                f'epoch: {epoch}, Train: {train_acc:.4f}, Val: {val_acc:.4f}, '
                f'Test: {test_acc:.4f}')
Example #22
def _run_trainer():
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data',
                    'Reddit')
    print("Load Dataset")
    dataset = Reddit(path)
    data = dataset[0]
    print("Load Train Sampler")
    train_loader = NeighborSampler(data.edge_index,
                                   node_idx=data.train_mask,
                                   sizes=[25, 10],
                                   batch_size=1024,
                                   shuffle=True,
                                   num_workers=12)
    print("Load subgraph sampler")
    subgraph_loader = NeighborSampler(data.edge_index,
                                      node_idx=None,
                                      sizes=[-1],
                                      batch_size=1024,
                                      shuffle=False,
                                      num_workers=12)

    print("Creating SAGE model")
    model = SAGE('ps', dataset.num_features, 256, dataset.num_classes,
                 subgraph_loader)

    optimizer = DistributedOptimizer(torch.optim.Adam,
                                     model.parameter_rrefs(),
                                     lr=0.01)

    print("Start training")
    for epoch in range(1, 11):
        loss, acc = train(model, optimizer, epoch, data, train_loader)
        print(f'Epoch {epoch:02d}, Loss: {loss:.4f}, Approx. Train: {acc:.4f}')
        train_acc, val_acc, test_acc = test(model, data)
        print(f'Train: {train_acc:.4f}, Val: {val_acc:.4f}, '
              f'Test: {test_acc:.4f}')
Example #23
def test(model, dataset, device, mode, k=5):
    outs = []
    model.eval()
    y_true = []
    for target_bus, business_ids, distances, business_lens, labels in dataset.get_batch(
            2048, mode):

        num_users_per_res, users = dataset.get_visited_users(target_bus, k)
        unique_users, inverse_idx = np.unique(users, return_inverse=True)
        unique_users = torch.LongTensor(unique_users)

        user_loader = NeighborSampler(dataset.user_pyg_graph.edge_index,
                                      node_idx=unique_users,
                                      sizes=[-1, -1],
                                      batch_size=unique_users.shape[0],
                                      shuffle=False)

        for _, u_id, u_adjs in user_loader:
            # this loop runs exactly once; a for loop is used because
            # next(iter(user_loader)) appears to be buggy
            u_adjs = [adj.to(device) for adj in u_adjs]

        del user_loader

        business_ids = business_ids.to(device)
        business_lens = business_lens.to(device)

        out = model(dataset.res_x,
                    target_bus,
                    business_ids,
                    distances,
                    business_lens,
                    dataset,
                    device,
                    user_x=dataset.user_x[u_id],
                    u_adjs=u_adjs,
                    inverse_idx=inverse_idx,
                    num_users_per_res=num_users_per_res)
        outs.append(out.cpu())
        y_true.append(labels.cpu())

    outs = torch.cat(outs, dim=0)
    y_true = torch.cat(y_true, dim=0)
    y_pred = outs.argmax(dim=-1, keepdim=False)

    results = int(y_pred.eq(y_true).sum()) / int(y_true.shape[0])

    return results
Example #24
def train_net():
    club = KarateClub()
    data = club.data
    data.num_nodes = data.num_nodes[0]
    print(data)
    data_loader = NeighborSampler(data,
                                  size=[20, 10],
                                  num_hops=2,
                                  batch_size=8,
                                  shuffle=True,
                                  add_self_loops=True)
    net = SAGENet(34, 2)
    criterion = nn.NLLLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    losses = AverageMeter()
    for i in range(epoch):
        result = []
        for data_flow in data_loader():
            label = data.y[data_flow.n_id]
            # print(label)
            optimizer.zero_grad()
            out = net(data.x, data_flow.to(device))

            # print(out)
            # print(label)
            result.append((label, out))
            # Calculate loss
            loss = criterion(out, label)

            # Back prop. (gradients were already zeroed above)
            loss.backward()

            # Clip gradients
            clip_gradient(optimizer, grad_clip)

            # Update weights
            optimizer.step()
            losses.update(loss.item())

            # Print status
        print(validation(result))
        if i % print_freq == 0:
            print('Epoch: {0} Loss {loss.val:.5f} ({loss.avg:.5f})\t'.format(
                i, loss=losses))

    return losses.avg
Example #25
    def make_sampler(self, node_idx, force_default=False):
        if node_idx is not None:
            node_idx = torch.tensor(node_idx, dtype=torch.long)
            sizes = config["sample_sizes"]
        else:
            sizes = [-1]
        
        if config["unsupervised_loss"] and not force_default:
            if config["unsup_sampling_type"] == 'neighbor':
                
                adj_t = SparseTensor(row=self.graph.edge_index[0], col=self.graph.edge_index[1],
                                        value=self.graph.edge_attr,
                                        sparse_sizes=(self.graph.num_nodes, self.graph.num_nodes)).t()
                return PosNegNeighborSampler(edge_index=adj_t, node_idx=node_idx, sizes=sizes, batch_size=config["sample_batch_size"],
                                 shuffle=True, num_doc_nodes=len(self.docs), edge_weights=self.graph.edge_attr)
            else:
                raise NotImplementedError("[dataset] The unsupervised sampling type %s has not been implemented" % config["unsup_sampling_type"])

        return NeighborSampler(self.graph.edge_index, node_idx=node_idx, sizes=sizes, batch_size=config["sample_batch_size"],
                                 shuffle=True, num_workers=config["sampling_num_workers"])
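PosNegNeighborSampler here is a project-specific subclass that is not shown. PyG's unsupervised GraphSAGE example defines a sampler with the same role, which this class likely extends; a sketch of that idiom (the extra num_doc_nodes and edge_weights arguments of the real class are not modeled):

import torch
from torch_cluster import random_walk
from torch_geometric.data import NeighborSampler

class PosNegNeighborSampler(NeighborSampler):
    def sample(self, batch):
        # For each seed node, add one random-walk neighbor as a positive
        # example and one uniformly random node as a negative example.
        batch = torch.tensor(batch)
        row, col, _ = self.adj_t.coo()
        pos_batch = random_walk(row, col, batch, walk_length=1,
                                coalesced=False)[:, 1]
        neg_batch = torch.randint(0, self.adj_t.size(1), (batch.numel(),),
                                  dtype=torch.long)
        return super().sample(torch.cat([batch, pos_batch, neg_batch], dim=0))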
Example #26
def _run_trainer():
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data',
                    'Reddit')
    print("Load Dataset")
    dataset = Reddit(path)
    data = dataset[0]
    print("Load Train Sampler")
    train_loader = NeighborSampler(data.edge_index,
                                   node_idx=data.train_mask,
                                   sizes=[25, 10],
                                   batch_size=1024,
                                   shuffle=True,
                                   num_workers=0)

    print("Creating SAGE model")
    model = SAGE(dataset.num_features, 128, dataset.num_classes)

    print("Start training")
    for epoch in range(1, 11):
        loss, acc = train(model, epoch, data, train_loader)
        print(f'Epoch {epoch:02d}, Loss: {loss:.4f}, Approx. Train: {acc:.4f}')
Example #27
def test(model, dataset, all_loader, device, k=5):
    outs = []
    model.eval()
    for batch_size, n_id, adjs in all_loader:
        adjs = [adj.to(device) for adj in adjs]

        num_users_per_res, users = dataset.get_visited_users(
            n_id[:batch_size], k)
        unique_users, inverse_idx = np.unique(users, return_inverse=True)
        unique_users = torch.LongTensor(unique_users)

        user_loader = NeighborSampler(dataset.user_pyg_graph.edge_index,
                                      node_idx=unique_users,
                                      sizes=[-1, -1],
                                      batch_size=unique_users.shape[0],
                                      shuffle=False)

        for num_users, u_id, u_adjs in user_loader:
            # this loop runs exactly once; a for loop is used because
            # next(iter(user_loader)) appears to be buggy
            u_adjs = [adj.to(device) for adj in u_adjs]

        del user_loader

        out = model(dataset.res_x[n_id], adjs, dataset.user_x[u_id], u_adjs,
                    inverse_idx, num_users_per_res)
        # out = model(dataset.res_x[n_id], adjs)
        outs.append(out.cpu())

    outs = torch.cat(outs, dim=0)

    y_true = dataset.labels.cpu().unsqueeze(-1)
    y_pred = outs.argmax(dim=-1, keepdim=True)

    results = []
    for idx in [dataset.train_index, dataset.val_index, dataset.test_index]:
        results.append(
            int(y_pred[idx].eq(y_true[idx]).sum()) / int(idx.shape[0]))

    return results
Example #28
def detect():
    global model
    global loader
    global data
    global device
    global fp
    global tn
    global id_map_t
    loader = NeighborSampler(data,
                             size=[1.0, 1.0],
                             num_hops=2,
                             batch_size=batch_size,
                             shuffle=True,
                             add_self_loops=True)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    Net = SAGENet

    test_acc = 0
    model = Net(feature_num, label_num).to(device)
    for j in model_list:
        loop_num = 0
        base_model = str(j)
        while True:
            model_path = '../models/' + base_model + '_' + str(loop_num)
            if not osp.exists(model_path): break
            model.load_state_dict(torch.load(model_path))
            fp = []
            tn = []
            loss, test_acc = test(data.test_mask)
            for i in tn:
                data.test_mask[i] = False
                update_benign(i)
            if test_acc == 1: break
            loop_num += 1
        if test_acc == 1: break

    for i in fp:
        raise_alert(i)
Example #29
def test_cora():
    class Net(torch.nn.Module):
        def __init__(self, in_channels, out_channels):
            super(Net, self).__init__()
            self.conv1 = SAGEConv(in_channels, 16)
            self.conv2 = SAGEConv(16, 16)
            self.conv3 = SAGEConv(16, out_channels)

        def forward_data_flow(self, x, edge_weight, data_flow):
            block = data_flow[0]
            weight = edge_weight[block.e_id]
            weight[block.e_id == -1] = 1
            x = relu(self.conv1(x, block.edge_index, weight, block.size))
            block = data_flow[1]
            weight = edge_weight[block.e_id]
            weight[block.e_id == -1] = 1
            x = relu(self.conv2(x, block.edge_index, weight, block.size))
            block = data_flow[2]
            weight = edge_weight[block.e_id]
            weight[block.e_id == -1] = 1
            x = self.conv3(x, block.edge_index, weight, block.size)
            return x

        def forward(self, x, edge_index, edge_weight):
            x = relu(self.conv1(x, edge_index, edge_weight))
            x = relu(self.conv2(x, edge_index, edge_weight))
            return self.conv3(x, edge_index, edge_weight)

    root = osp.join('/', 'tmp', str(random.randrange(sys.maxsize)))
    dataset = Planetoid(root, 'Cora')
    model = Net(dataset.num_features, dataset.num_classes)

    data1 = dataset[0]
    data1.edge_weight = torch.rand(data1.num_edges)

    data2 = T.AddSelfLoops()(dataset[0])
    data2.edge_weight = torch.rand(data2.num_edges)

    data3 = dataset[0]
    loop = torch.stack([torch.arange(100, 200), torch.arange(100, 200)], dim=0)
    data3.edge_index = torch.cat([data3.edge_index, loop], dim=1)
    data3.edge_weight = torch.rand(data3.num_edges)

    for data in [data1, data2, data3]:
        out_all = model(data.x, data.edge_index, data.edge_weight)

        loader = NeighborSampler(data, size=1.0, num_hops=3, batch_size=64,
                                 shuffle=False, drop_last=False,
                                 bipartite=True, add_self_loops=True)

        for data_flow in loader(data.train_mask):
            out = model.forward_data_flow(data.x[data_flow[0].n_id],
                                          data.edge_weight, data_flow)
            assert torch.allclose(out_all[data_flow.n_id], out)

        loader = NeighborSampler(data, size=1.0, num_hops=3, batch_size=64,
                                 shuffle=False, drop_last=False,
                                 bipartite=False)

        for subdata in loader(data.train_mask):
            out = model(data.x[subdata.n_id], subdata.edge_index,
                        data.edge_weight[subdata.e_id])
            out = out[subdata.sub_b_id]
            assert torch.allclose(out_all[subdata.b_id], out)

    shutil.rmtree(root)
Example #30
import os.path as osp

import torch
from torch.nn import Linear as Lin
from tqdm import tqdm
from ogb.nodeproppred import PygNodePropPredDataset, Evaluator
from torch_geometric.data import NeighborSampler
from torch_geometric.nn import GATConv

root = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'products')
dataset = PygNodePropPredDataset('ogbn-products', root)
split_idx = dataset.get_idx_split()
evaluator = Evaluator(name='ogbn-products')
data = dataset[0]

train_idx = split_idx['train']
train_loader = NeighborSampler(data.edge_index,
                               node_idx=train_idx,
                               sizes=[10, 10, 10],
                               batch_size=512,
                               shuffle=True,
                               num_workers=12)
subgraph_loader = NeighborSampler(data.edge_index,
                                  node_idx=None,
                                  sizes=[-1],
                                  batch_size=1024,
                                  shuffle=False,
                                  num_workers=12)


class GAT(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_layers,
                 heads):
        super(GAT, self).__init__()