def test_dataset_property(self):
        _, x, y, edge_x, edge_y, edge_index, graph_x, graph_y = (
            simple_networkx_graph())
        G = Graph(node_feature=x,
                  node_label=y,
                  edge_index=edge_index,
                  edge_feature=edge_x,
                  edge_label=edge_y,
                  graph_feature=graph_x,
                  graph_label=graph_y,
                  directed=True)

        H = deepcopy(G)

        H.graph_label = torch.tensor([1])

        graphs = [G, H]
        dataset = GraphDataset(graphs)
        self.assertEqual(dataset.num_node_labels, 5)
        self.assertEqual(dataset.num_node_features, 2)
        self.assertEqual(dataset.num_edge_labels, 4)
        self.assertEqual(dataset.num_edge_features, 2)
        self.assertEqual(dataset.num_graph_labels, 1)
        self.assertEqual(dataset.num_graph_features, 2)
        self.assertEqual(dataset.num_labels, 5)  # node task
        dataset = GraphDataset(graphs, task="edge")
        self.assertEqual(dataset.num_labels, 4)
        dataset = GraphDataset(graphs, task="link_pred")
        self.assertEqual(dataset.num_labels, 5)
        dataset = GraphDataset(graphs, task="graph")
        self.assertEqual(dataset.num_labels, 1)
Example #2
 def test_torch_dataloader_collate(self):
     # graph classification example
     pyg_dataset = TUDataset('./enzymes', 'ENZYMES')
     graphs = GraphDataset.pyg_to_graphs(pyg_dataset)
     dataset = GraphDataset(graphs, task="graph")
     train_batch_num = math.ceil(len(dataset) * 0.8 / 32)
     test_batch_num = math.ceil(len(dataset) * 0.1 / 32)
     val_batch_num = math.ceil(len(dataset) * 0.1 / 32)
     datasets = {}
     datasets['train'], datasets['val'], datasets['test'] = \
         dataset.split(transductive=False, split_ratio=[0.8, 0.1, 0.1])
     dataloaders = {
         split: DataLoader(dataset,
                           collate_fn=Batch.collate(),
                           batch_size=32,
                           shuffle=True)
         for split, dataset in datasets.items()
     }
     self.assertEqual(len(dataloaders['train']), train_batch_num)
     self.assertEqual(len(dataloaders['val']), val_batch_num)
     self.assertEqual(len(dataloaders['test']), test_batch_num)
     for i, data in enumerate(dataloaders['train']):
         if i != len(dataloaders['train']) - 1:
             self.assertEqual(data.num_graphs, 32)
     for i, data in enumerate(dataloaders['val']):
         if i != len(dataloaders['val']) - 1:
             self.assertEqual(data.num_graphs, 32)
     for i, data in enumerate(dataloaders['test']):
         if i != len(dataloaders['test']) - 1:
             self.assertEqual(data.num_graphs, 32)
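Note: `Batch.collate()` returns the `collate_fn` that a plain `torch.utils.data.DataLoader` needs to merge deepsnap graphs into a `Batch`; it optionally takes a `follow_batch` list naming attributes whose per-graph assignment should also be tracked (used in the link-prediction examples below). A minimal sketch of the pattern, assuming the `datasets` splits from the test above:

from torch.utils.data import DataLoader
from deepsnap.batch import Batch

loader = DataLoader(datasets['train'],
                    collate_fn=Batch.collate(),  # or Batch.collate(['edge_index'])
                    batch_size=32,
                    shuffle=True)
for batch in loader:
    assert batch.num_graphs <= 32  # only the last batch may be smaller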
Example #3
    def test_dataset_property(self):
        G, x, y, edge_x, edge_y, edge_index, graph_x, graph_y = (
            simple_networkx_graph()
        )
        Graph.add_edge_attr(G, "edge_feature", edge_x)
        Graph.add_edge_attr(G, "edge_label", edge_y)
        Graph.add_node_attr(G, "node_feature", x)
        Graph.add_node_attr(G, "node_label", y)
        Graph.add_graph_attr(G, "graph_feature", graph_x)
        Graph.add_graph_attr(G, "graph_label", graph_y)
        H = G.copy()
        Graph.add_graph_attr(H, "graph_label", torch.tensor([1]))

        graphs = GraphDataset.list_to_graphs([G, H])
        dataset = GraphDataset(graphs)
        self.assertEqual(dataset.num_node_labels, 5)
        self.assertEqual(dataset.num_node_features, 2)
        self.assertEqual(dataset.num_edge_labels, 4)
        self.assertEqual(dataset.num_edge_features, 2)
        self.assertEqual(dataset.num_graph_labels, 2)
        self.assertEqual(dataset.num_graph_features, 2)
        self.assertEqual(dataset.num_labels, 5)  # node task
        dataset = GraphDataset(graphs, task="edge")
        self.assertEqual(dataset.num_labels, 4)
        dataset = GraphDataset(graphs, task="link_pred")
        self.assertEqual(dataset.num_labels, 4)
        dataset = GraphDataset(graphs, task="graph")
        self.assertEqual(dataset.num_labels, 2)
Example #4
    def test_resample_disjoint_heterogeneous(self):
        G = generate_dense_hete_dataset()
        hete = HeteroGraph(G)
        hete = HeteroGraph(node_feature=hete.node_feature,
                           node_label=hete.node_label,
                           edge_feature=hete.edge_feature,
                           edge_label=hete.edge_label,
                           edge_index=hete.edge_index,
                           directed=True)
        graphs = [hete]
        dataset = GraphDataset(graphs,
                               task="link_pred",
                               edge_train_mode="disjoint",
                               edge_message_ratio=0.8,
                               resample_disjoint=True,
                               resample_disjoint_period=1)
        dataset_train, _, _ = dataset.split(split_ratio=[0.5, 0.2, 0.3])
        graph_train_first = dataset_train[0]
        graph_train_second = dataset_train[0]

        for message_type in graph_train_first.edge_index:
            self.assertEqual(
                graph_train_first.edge_label_index[message_type].shape[1],
                graph_train_second.edge_label_index[message_type].shape[1])
            self.assertEqual(graph_train_first.edge_label[message_type].shape,
                             graph_train_second.edge_label[message_type].shape)
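Note: in disjoint training the supervision edges are drawn separately from the message-passing edges, and `resample_disjoint=True` redraws them every `resample_disjoint_period` accesses, so only the tensor shapes are stable across repeated indexing, which is exactly what the assertions above check. A sketch of reading the per-message-type tensors (attribute names as in the test; the label tensor also contains sampled negatives):

graph = dataset_train[0]  # a HeteroGraph split for link_pred
for message_type in graph.edge_label_index:
    sup_edges = graph.edge_label_index[message_type]  # shape [2, num_supervision_edges]
    labels = graph.edge_label[message_type]           # positives plus sampled negatives
    print(message_type, sup_edges.shape[1], labels.shape[0])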
Example #5
def train(rank, pyg_dataset, args, num_node_features, num_classes):
    if args.skip is not None:
        model_cls = skip_models.SkipLastGNN
    elif args.model == "GIN":
        model_cls = GIN
    else:
        model_cls = GNN

    # DISTRIBUTED TRAINING - can be replaced with 1 call to model_parallelize()
    world_size = torch.cuda.device_count()
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    # the process group must be initialized before constructing DDP
    # (assumes torch.distributed is available)
    torch.distributed.init_process_group('nccl', rank=rank, world_size=world_size)
    # each rank works on its own data shard and GPU
    pyg_dataset[0] = pyg_dataset[0].split(pyg_dataset[0].size(0) // world_size)[rank]
    device = rank

    model = model_cls(num_node_features, args.hidden_dim, num_classes,
                      args).to(device)
    opt = build_optimizer(args, model.parameters())
    # wrap for DDP (device_ids assumes one GPU per process)
    model = DistributedDataParallel(model, device_ids=[rank])

    graphs = GraphDataset.pyg_to_graphs(pyg_dataset)

    dataset = GraphDataset(graphs, task="graph")
    datasets = {}
    datasets['train'], datasets['val'], datasets['test'] = dataset.split(
            transductive=False, split_ratio=[0.8, 0.1, 0.1])
Example #6
    def test_resample_disjoint(self):
        pyg_dataset = Planetoid("./cora", "Cora")
        graphs = GraphDataset.pyg_to_graphs(pyg_dataset)
        graph = graphs[0]
        graph = Graph(node_label=graph.node_label,
                      node_feature=graph.node_feature,
                      edge_index=graph.edge_index,
                      edge_feature=graph.edge_feature,
                      directed=False)
        graphs = [graph]
        dataset = GraphDataset(graphs,
                               task="link_pred",
                               edge_train_mode="disjoint",
                               edge_message_ratio=0.8,
                               resample_disjoint=True,
                               resample_disjoint_period=1)
        dataset_train, _, _ = dataset.split(split_ratio=[0.5, 0.2, 0.3])
        graph_train_first = dataset_train[0]
        graph_train_second = dataset_train[0]

        self.assertEqual(graph_train_first.edge_label_index.shape[1],
                         graph_train_second.edge_label_index.shape[1])
        self.assertTrue(
            torch.equal(graph_train_first.edge_label,
                        graph_train_second.edge_label))
Example #7
def load_dataset(format, name, dataset_dir):
    if format != 'NetlistOmitted':
        return None

    dataset_dir = '{}/{}'.format(dataset_dir, name)
    netlists = find_netlists(dataset_dir)
    if cfg.dataset.mean:
        mean = np.load(cfg.dataset.mean)
        stddev = np.load(cfg.dataset.stddev)
        dataset = datasets.omitted(netlists,
                                   min_edge_count=5,
                                   resample=cfg.dataset.resample,
                                   mean=mean,
                                   std=stddev)
    else:
        dataset = datasets.omitted(netlists,
                                   min_edge_count=5,
                                   resample=cfg.dataset.resample)

    graphs = h.to_deepsnap(dataset)

    dataset = GraphDataset(
        graphs,
        task=cfg.dataset.task,
        edge_train_mode=cfg.dataset.edge_train_mode,
        edge_message_ratio=cfg.dataset.edge_message_ratio,
        edge_negative_sampling_ratio=cfg.dataset.edge_negative_sampling_ratio,
        resample_disjoint=cfg.dataset.resample_disjoint,
        minimum_node_per_graph=0)
    dataset._num_graph_labels = len(datasets.helpers.component_types)
    return dataset
Example #8
def deepsnap_ego(args, pyg_dataset):
    avg_time = 0
    task = "graph"
    for i in range(args.num_runs):
        if args.print_run:
            print("Run {}".format(i + 1))
        graphs = GraphDataset.pyg_to_graphs(pyg_dataset,
                                            verbose=True,
                                            netlib=netlib)
        dataset = GraphDataset(graphs, task=task)
        datasets = {}
        datasets['train'], datasets['val'], datasets['test'] = dataset.split(
            transductive=False, split_ratio=[0.8, 0.1, 0.1], shuffle=False)
        dataloaders = {
            split: DataLoader(dataset,
                              collate_fn=Batch.collate(),
                              batch_size=1,
                              shuffle=False)
            for split, dataset in datasets.items()
        }
        s = time.time()
        for batch in dataloaders['train']:
            batch = batch.apply_transform(ego_nets, update_tensor=True)
        avg_time += (time.time() - s)
    print("DeepSNAP has average time: {}".format(avg_time / args.num_runs))
Example #9
def main():
    args = arg_parse()

    edge_train_mode = args.mode
    print('edge train mode: {}'.format(edge_train_mode))

    G = nx.read_gpickle(args.data_path)
    print(G.number_of_edges())
    print('Each node has node ID (n_id). Example: ', G.nodes[0])
    print(
        'Each edge has edge ID (id) and categorical label (e_label). Example: ',
        G[0][5871])

    # find the number of distinct edge types
    labels = []
    for u, v, edge_key in G.edges:
        label = G[u][v][edge_key]['e_label']
        if label not in labels:
            labels.append(label)
    # labels are consecutive (0-17)
    num_edge_types = len(labels)

    H = WN_transform(G, num_edge_types)
    # The nodes in the graph have the features: node_feature and node_type (just one node type "n1" here)
    for node in H.nodes(data=True):
        print(node)
        break
    # The edges in the graph have the features: edge_feature and edge_type ("0" - "17" here)
    for edge in H.edges(data=True):
        print(edge)
        break

    hete = HeteroGraph(H)

    dataset = GraphDataset([hete], task='link_pred')
    dataset_train, dataset_val, dataset_test = dataset.split(
        transductive=True, split_ratio=[0.8, 0.1, 0.1])
    train_loader = DataLoader(dataset_train,
                              collate_fn=Batch.collate(),
                              batch_size=1)
    val_loader = DataLoader(dataset_val,
                            collate_fn=Batch.collate(),
                            batch_size=1)
    test_loader = DataLoader(dataset_test,
                             collate_fn=Batch.collate(),
                             batch_size=1)
    dataloaders = {
        'train': train_loader,
        'val': val_loader,
        'test': test_loader
    }

    hidden_size = 32
    model = HeteroNet(hete, hidden_size, 0.2).to(args.device)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=0.001,
                                 weight_decay=5e-4)

    train(model, dataloaders, optimizer, args)
Example #10
def main():
    args = arg_parse()

    pyg_dataset = Planetoid('./cora', 'Cora', transform=T.TargetIndegree())
    
    # the input that we assume users have
    edge_train_mode = args.mode
    print('edge train mode: {}'.format(edge_train_mode))

    graphs = GraphDataset.pyg_to_graphs(pyg_dataset, tensor_backend=True)
    if args.multigraph:
        graphs = [copy.deepcopy(graphs[0]) for _ in range(10)]

    dataset = GraphDataset(graphs, 
                           task='link_pred', 
                           edge_message_ratio=args.edge_message_ratio, 
                           edge_train_mode=edge_train_mode)
    print('Initial dataset: {}'.format(dataset))

    # split dataset
    datasets = {}
    datasets['train'], datasets['val'], datasets['test'] = dataset.split(
            transductive=not args.multigraph, split_ratio=[0.85, 0.05, 0.1])

    print('after split')
    print('Train message-passing graph: {} nodes; {} edges.'.format(
            datasets['train'][0].num_nodes,
            datasets['train'][0].num_edges))
    print('Val message-passing graph: {} nodes; {} edges.'.format(
            datasets['val'][0].num_nodes,
            datasets['val'][0].num_edges))
    print('Test message-passing graph: {} nodes; {} edges.'.format(
            datasets['test'][0].num_nodes,
            datasets['test'][0].num_edges))


    # node feature dimension
    input_dim = datasets['train'].num_node_features
    # link prediction needs 2 classes (0, 1)
    num_classes = datasets['train'].num_edge_labels

    model = Net(input_dim, num_classes, args).to(args.device)
    #optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-3)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.epochs)
    follow_batch = [] # e.g., follow_batch = ['edge_index']

    dataloaders = {split: DataLoader(
            ds, collate_fn=Batch.collate(follow_batch), 
            batch_size=args.batch_size, shuffle=(split=='train'))
            for split, ds in datasets.items()}
    print('Graphs after split: ')
    for key, dataloader in dataloaders.items():
        for batch in dataloader:
            print(key, ': ', batch)

    train(model, dataloaders, optimizer, args, scheduler=scheduler)
Example #11
def create_dataset():
    ## Load dataset
    time1 = time.time()
    if cfg.dataset.format == 'OGB':
        graphs, splits = load_dataset()
    else:
        graphs = load_dataset()

    ## Filter graphs
    time2 = time.time()
    min_node = filter_graphs()

    ## Create whole dataset
    if type(graphs) is GraphDataset:
        dataset = graphs
    else:
        dataset = GraphDataset(
            graphs,
            task=cfg.dataset.task,
            edge_train_mode=cfg.dataset.edge_train_mode,
            edge_message_ratio=cfg.dataset.edge_message_ratio,
            edge_negative_sampling_ratio=cfg.dataset.edge_negative_sampling_ratio,
            resample_disjoint=cfg.dataset.resample_disjoint,
            minimum_node_per_graph=min_node)

    ## Transform the whole dataset
    dataset = transform_before_split(dataset)

    ## Split dataset
    time3 = time.time()
    # Use custom data splits
    if cfg.dataset.format == 'OGB':
        datasets = []
        datasets.append(dataset[splits['train']])
        datasets.append(dataset[splits['valid']])
        datasets.append(dataset[splits['test']])
    # Use random split, supported by DeepSNAP
    else:
        datasets = dataset.split(transductive=cfg.dataset.transductive,
                                 split_ratio=cfg.dataset.split)
    # Only the training split keeps the configured negative sampling ratio;
    # the val/test splits use ratio 1
    for i in range(1, len(datasets)):
        datasets[i].edge_negative_sampling_ratio = 1

    ## Transform each split dataset
    time4 = time.time()
    datasets = transform_after_split(datasets)

    time5 = time.time()
    logging.info('Load: {:.4}s, Before split: {:.4}s, '
                 'Split: {:.4}s, After split: {:.4}s'.format(
                     time2 - time1, time3 - time2, time4 - time3,
                     time5 - time4))

    return datasets
Example #12
 def test_dataset_basic(self):
     G, x, y, edge_x, edge_y, edge_index, graph_x, graph_y = \
         simple_networkx_graph()
     Graph.add_edge_attr(G, "edge_feature", edge_x)
     Graph.add_edge_attr(G, "edge_label", edge_y)
     Graph.add_node_attr(G, "node_feature", x)
     Graph.add_node_attr(G, "node_label", y)
     Graph.add_graph_attr(G, "graph_feature", graph_x)
     Graph.add_graph_attr(G, "graph_label", graph_y)
     H = deepcopy(G)
     graphs = GraphDataset.list_to_graphs([G, H])
     dataset = GraphDataset(graphs)
     self.assertEqual(len(dataset), 2)
Example #13
def deepsnap_pagerank(args, pyg_dataset):
    avg_time = 0
    task = 'graph'
    for i in range(args.num_runs):
        if args.print_run:
            print("Run {}".format(i + 1))
        graphs = GraphDataset.pyg_to_graphs(pyg_dataset,
                                            verbose=True,
                                            fixed_split=False,
                                            netlib=netlib)
        dataset = GraphDataset(graphs, task=task)
        s = time.time()
        dataset.apply_transform(page_fun, update_tensor=False, lib=args.netlib)
        avg_time += (time.time() - s)
    print("DeepSNAP has average time: {}".format(avg_time / args.num_runs))
Example #14
def load_dataset():
    '''
    load raw datasets.
    :return: a list of networkx/deepsnap graphs, plus additional info if needed
    '''
    format = cfg.dataset.format
    name = cfg.dataset.name
    # dataset_dir = '{}/{}'.format(cfg.dataset.dir, name)
    dataset_dir = cfg.dataset.dir
    # Try to load customized data format
    for func in register.loader_dict.values():
        graphs = func(format, name, dataset_dir)
        if graphs is not None:
            return graphs
    # Load from Pytorch Geometric dataset
    if format == 'PyG':
        graphs = load_pyg(name, dataset_dir)
    # Load from networkx formatted data
    # todo: clean nx dataloader
    elif format == 'nx':
        graphs = load_nx(name, dataset_dir)
    # Load from OGB formatted data
    elif format == 'OGB':
        if name == 'ogbg-molhiv':
            dataset = PygGraphPropPredDataset(name=name)
            graphs = GraphDataset.pyg_to_graphs(dataset)
        else:
            raise ValueError('Unsupported OGB dataset: {}'.format(name))
        # Note this is only used for custom splits from OGB
        split_idx = dataset.get_idx_split()
        return graphs, split_idx
    else:
        raise ValueError('Unknown data format: {}'.format(cfg.dataset.format))
    return graphs
Example #15
    def test_ensemble_generator(self):
        pyg_dataset = Planetoid("./cora", "Cora")
        dg = Graph.pyg_to_graph(pyg_dataset[0])

        num_nodes = 500
        sizes = [2, 3]

        class NeighborGenerator1(Generator):
            def __len__(self):
                return len(sizes)

            def generate(self):
                graph = Graph(gen_graph(num_nodes, dg.G))
                return graph

        class NeighborGenerator2(Generator):
            def __len__(self):
                return len(sizes)

            def generate(self):
                graph = Graph(gen_graph(num_nodes, dg.G))
                return graph

        ensemble_generator = (
            EnsembleGenerator(
                [
                    NeighborGenerator1(sizes),
                    NeighborGenerator2(sizes),
                ]
            )
        )
        dataset = GraphDataset(None, generator=ensemble_generator)
        self.assertTrue(dataset[0].node_feature.shape[0] == num_nodes)
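Note: `GraphDataset(None, generator=...)` produces graphs lazily through the generator's `generate()`, and `EnsembleGenerator` samples among its child generators; `gen_graph` is a test helper not shown here. A minimal self-contained generator following the same contract (a sketch):

import networkx as nx
import torch
from deepsnap.dataset import Generator, GraphDataset
from deepsnap.graph import Graph

class CycleGenerator(Generator):
    def generate(self):
        # build a small featured graph on the fly
        G = nx.cycle_graph(10)
        for v in G.nodes:
            G.nodes[v]["node_feature"] = torch.ones(1)
        return Graph(G)

dataset = GraphDataset(None, generator=CycleGenerator([10]))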
Example #16
def batch_nx_graphs(graphs, anchors=None):
    #motifs_batch = [pyg_utils.from_networkx(
    #    nx.convert_node_labels_to_integers(graph)) for graph in graphs]
    #loader = DataLoader(motifs_batch, batch_size=len(motifs_batch))
    #for b in loader: batch = b
    augmenter = feature_preprocess.FeatureAugment()

    if anchors is not None:
        for anchor, g in zip(anchors, graphs):
            for v in g.nodes:
                g.nodes[v]["node_feature"] = torch.tensor([float(v == anchor)])
    # Datasets such as aifb / wn18 carry categorical edge types (90 types);
    # the original condition here compared string constants and was always true.
    for g in graphs:
        for e in g.edges:
            # tmp = torch.zeros(90)
            # tmp[g.edges[e]['edge_type']] = 1.

            g.edges[e]["edge_feature"] = torch.tensor(
                [g.edges[e]['edge_type']], dtype=torch.long)

    batch = Batch.from_data_list(GraphDataset.list_to_graphs(graphs))
    batch = augmenter.augment(batch)
    batch = batch.to(get_device())
    return batch
Example #17
def load_dataset(name):
    task = "graph"
    if name == "enzymes":
        dataset = TUDataset(root="/tmp/ENZYMES", name="ENZYMES")
    elif name == "cox2":
        dataset = TUDataset(root="/tmp/cox2", name="COX2")
    elif name == "imdb-binary":
        dataset = TUDataset(root="/tmp/IMDB-BINARY", name="IMDB-BINARY")

    if task == "graph":
        dataset = GraphDataset(GraphDataset.pyg_to_graphs(dataset))
        dataset = dataset.apply_transform(
            lambda g: g.G.subgraph(max(nx.connected_components(g.G), key=len)))
        dataset = dataset.filter(lambda g: len(g.G) >= 6)
        train, test = dataset.split(split_ratio=[0.8, 0.2])
    return train, test, task
Example #18
def load_dataset_example(format, name, dataset_dir):
    dataset_dir = '{}/{}'.format(dataset_dir, name)
    if format == 'PyG':
        if name == 'QM7b':
            dataset_raw = QM7b(dataset_dir)
            graphs = GraphDataset.pyg_to_graphs(dataset_raw)
            return graphs
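Note: for GraphGym to dispatch to a loader like this, it has to be registered so that the `register.loader_dict` loop (see the `load_dataset` example above) can try it. A sketch, assuming GraphGym's `register` module:

from graphgym.register import register_loader

register_loader('example', load_dataset_example)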
Example #19
    def test_filter(self):
        pyg_dataset = TUDataset('./enzymes', 'ENZYMES')
        graphs = GraphDataset.pyg_to_graphs(pyg_dataset)
        dataset = GraphDataset(graphs, task="graph")
        thresh = 90

        orig_dataset_size = len(dataset)
        num_graphs_large = 0
        for graph in dataset:
            if len(graph.G) >= thresh:
                num_graphs_large += 1

        dataset = dataset.filter(
            lambda graph: len(graph.G) < thresh, deep_copy=False)
        filtered_dataset_size = len(dataset)

        self.assertEqual(
            orig_dataset_size - filtered_dataset_size, num_graphs_large)
Example #20
    def test_filter(self):
        pyg_dataset = TUDataset("./enzymes", "ENZYMES")
        ds = pyg_to_dicts(pyg_dataset)
        graphs = [Graph(**item) for item in ds]
        dataset = GraphDataset(graphs, task="graph")
        thresh = 90

        orig_dataset_size = len(dataset)
        num_graphs_large = 0
        for graph in dataset:
            if graph.num_nodes >= thresh:
                num_graphs_large += 1

        dataset = dataset.filter(lambda graph: graph.num_nodes < thresh,
                                 deep_copy=False)
        filtered_dataset_size = len(dataset)

        self.assertEqual(
            orig_dataset_size - filtered_dataset_size,
            num_graphs_large,
        )
Example #21
def load_pyg(name, dataset_dir):
    '''
    load pyg format dataset
    :param name: dataset name
    :param dataset_dir: data directory
    :return: a list of networkx/deepsnap graphs
    '''
    dataset_dir = '{}/{}'.format(dataset_dir, name)
    if name in ['Cora', 'CiteSeer', 'PubMed']:
        dataset_raw = Planetoid(dataset_dir, name)
    elif name[:3] == 'TU_':
        # TU_IMDB doesn't have node features
        if name[3:] == 'IMDB':
            name = 'IMDB-MULTI'
            dataset_raw = TUDataset(dataset_dir, name, transform=T.Constant())
        else:
            dataset_raw = TUDataset(dataset_dir, name[3:])
        # TU_dataset only has graph-level label
        # The goal is to have synthetic tasks
        # that select smallest 100 graphs that have more than 200 edges
        if cfg.dataset.tu_simple and cfg.dataset.task != 'graph':
            size = []
            for data in dataset_raw:
                edge_num = data.edge_index.shape[1]
                edge_num = 9999 if edge_num < 200 else edge_num
                size.append(edge_num)
            size = torch.tensor(size)
            order = torch.argsort(size)[:100]
            dataset_raw = dataset_raw[order]
    elif name == 'Karate':
        dataset_raw = KarateClub()
    elif 'Coauthor' in name:
        if 'CS' in name:
            dataset_raw = Coauthor(dataset_dir, name='CS')
        else:
            dataset_raw = Coauthor(dataset_dir, name='Physics')
    elif 'Amazon' in name:
        if 'Computers' in name:
            dataset_raw = Amazon(dataset_dir, name='Computers')
        else:
            dataset_raw = Amazon(dataset_dir, name='Photo')
    elif name == 'MNIST':
        dataset_raw = MNISTSuperpixels(dataset_dir)
    elif name == 'PPI':
        dataset_raw = PPI(dataset_dir)
    elif name == 'QM7b':
        dataset_raw = QM7b(dataset_dir)
    else:
        raise ValueError('{} not supported'.format(name))
    graphs = GraphDataset.pyg_to_graphs(dataset_raw)
    return graphs
Example #22
 def gen_data_loaders(self,
                      size,
                      batch_size,
                      train=True,
                      use_distributed_sampling=False):
     loaders = []
     for i in range(2):
         neighs = []
         for j in range(size // 2):
             graph, neigh = utils.sample_neigh(
                 self.train_set if train else self.test_set,
                 random.randint(self.min_size, self.max_size))
             neighs.append(graph.subgraph(neigh))
         dataset = GraphDataset(GraphDataset.list_to_graphs(neighs))
         loaders.append(
             TorchDataLoader(dataset,
                             collate_fn=Batch.collate([]),
                             # both branches of the original conditional were
                             # identical: each of the two loaders gets half
                             # the batch
                             batch_size=batch_size // 2,
                             sampler=None,
                             shuffle=False))
     loaders.append([None] * (size // batch_size))
     return loaders
Example #23
    def test_pyg_to_graphs_global(self):
        import deepsnap
        deepsnap.use(nx)

        pyg_dataset = Planetoid('./planetoid', "Cora")
        graphs = GraphDataset.pyg_to_graphs(pyg_dataset)
        self.assertTrue(isinstance(graphs[0].G, nx.Graph))
        dataset = GraphDataset(graphs, task='node')
        num_nodes = dataset.num_nodes[0]
        node_0 = int(0.8 * num_nodes)
        node_1 = int(0.1 * num_nodes)
        node_2 = num_nodes - node_0 - node_1
        train, val, test = dataset.split()
        self.assertTrue(isinstance(train[0].G, nx.Graph))
        self.assertTrue(isinstance(val[0].G, nx.Graph))
        self.assertTrue(isinstance(test[0].G, nx.Graph))
        self.assertEqual(train[0].node_label_index.shape[0], node_0)
        self.assertEqual(val[0].node_label_index.shape[0], node_1)
        self.assertEqual(test[0].node_label_index.shape[0], node_2)

        train_loader = DataLoader(train,
                                  collate_fn=Batch.collate(),
                                  batch_size=1)
        for batch in train_loader:
            self.assertTrue(isinstance(batch.G[0], nx.Graph))

        deepsnap.use(sx)
        graphs = GraphDataset.pyg_to_graphs(pyg_dataset)
        self.assertTrue(isinstance(graphs[0].G, sx.Graph))
        dataset = GraphDataset(graphs, task='node')
        num_nodes = dataset.num_nodes[0]
        node_0 = int(0.8 * num_nodes)
        node_1 = int(0.1 * num_nodes)
        node_2 = num_nodes - node_0 - node_1
        train, val, test = dataset.split()
        self.assertTrue(isinstance(train[0].G, sx.Graph))
        self.assertTrue(isinstance(val[0].G, sx.classes.graph.Graph))
        self.assertTrue(isinstance(test[0].G, sx.classes.graph.Graph))
        self.assertEqual(train[0].node_label_index.shape[0], node_0)
        self.assertEqual(val[0].node_label_index.shape[0], node_1)
        self.assertEqual(test[0].node_label_index.shape[0], node_2)

        train_loader = DataLoader(train,
                                  collate_fn=Batch.collate(),
                                  batch_size=1)
        for batch in train_loader:
            self.assertTrue(isinstance(batch.G[0], sx.Graph))
Example #24
 def test_batch_basic(self):
     G, x, y, edge_x, edge_y, edge_index, graph_x, graph_y = \
         simple_networkx_graph()
     Graph.add_edge_attr(G, "edge_feature", edge_x)
     Graph.add_edge_attr(G, "edge_label", edge_y)
     Graph.add_node_attr(G, "node_feature", x)
     Graph.add_node_attr(G, "node_label", y)
     Graph.add_graph_attr(G, "graph_feature", graph_x)
     Graph.add_graph_attr(G, "graph_label", graph_y)
     H = deepcopy(G)
     graphs = GraphDataset.list_to_graphs([G, H])
     batch = Batch.from_data_list(graphs)
     self.assertEqual(batch.num_graphs, 2)
     self.assertEqual(len(batch.node_feature),
                      2 * len(graphs[0].node_feature))
Example #25
def load_dataset(name):
    def add_feats(graph):
        for v in graph.G.nodes:
            graph.G.nodes[v]["node_feature"] = torch.ones(1)
        return graph

    task = "graph"
    if name == "enzymes":
        dataset = TUDataset(root="/tmp/ENZYMES", name="ENZYMES")
    elif name == "cox2":
        dataset = TUDataset(root="/tmp/cox2", name="COX2")
    elif name == "imdb-binary":
        dataset = TUDataset(root="/tmp/IMDB-BINARY", name="IMDB-BINARY")

    if task == "graph":
        dataset = GraphDataset(GraphDataset.pyg_to_graphs(dataset))
        # add blank features for imdb-binary, which doesn't have node features
        if name == "imdb-binary":
            dataset = dataset.apply_transform(add_feats)
        dataset = dataset.apply_transform(
            lambda g: g.G.subgraph(max(nx.connected_components(g.G), key=len)))
        dataset = dataset.filter(lambda g: len(g.G) >= 6)
        train, test = dataset.split(split_ratio=[0.8, 0.2])
    return train, test, task
Example #26
def batch_nx_graphs_multi(graphs, anchors=None):
    # motifs_batch = [pyg_utils.from_networkx(
    #    nx.convert_node_labels_to_integers(graph)) for graph in graphs]
    # loader = DataLoader(motifs_batch, batch_size=len(motifs_batch))
    # for b in loader: batch = b
    augmenter = feature_preprocess.FeatureAugment()

    if anchors is not None:
        for anchor, g in zip(anchors, graphs):
            for v in g.nodes:
                g.nodes[v]["node_feature"] = torch.tensor([float(v == anchor)])

    batch = Batch.from_data_list(GraphDataset.list_to_graphs(graphs))
    batch = augmenter.augment(batch)
    batch = batch.to(get_device())
    return batch
Example #27
    def test_dataset_basic(self):
        _, x, y, edge_x, edge_y, edge_index, graph_x, graph_y = (
            simple_networkx_graph())

        G = Graph(node_feature=x,
                  node_label=y,
                  edge_index=edge_index,
                  edge_feature=edge_x,
                  edge_label=edge_y,
                  graph_feature=graph_x,
                  graph_label=graph_y,
                  directed=True)

        H = deepcopy(G)

        dataset = GraphDataset([G, H])
        self.assertEqual(len(dataset), 2)
Example #28
    def test_generator(self):
        pyg_dataset = Planetoid('./cora', 'Cora')
        dg = Graph.pyg_to_graph(pyg_dataset[0])

        num_nodes = 500
        sizes = [2, 3]

        class NeighborGenerator(Generator):
            def __len__(self):
                return len(sizes)

            def generate(self):
                graph = Graph(gen_graph(num_nodes, dg.G))
                return graph

        dataset = GraphDataset(None, generator=NeighborGenerator(sizes))
        self.assertTrue(dataset[0].node_feature.shape[0] == num_nodes)
Example #29
def main():
    args = arg_parse()

    edge_train_mode = args.mode
    print('edge train mode: {}'.format(edge_train_mode))

    WN_graph = nx.read_gpickle(args.data_path)
    print('Each node has node ID (n_id). Example: ', WN_graph.nodes[0])
    print(
        'Each edge has edge ID (id) and categorical label (e_label). Example: ',
        WN_graph[0][5871])

    # Since both the features and the labels are relation types,
    # only the disjoint mode makes sense here
    dataset = GraphDataset(
        [WN_graph],
        task='link_pred',
        edge_train_mode=edge_train_mode,
        edge_message_ratio=args.edge_message_ratio,
        edge_negative_sampling_ratio=args.neg_sampling_ratio)

    # find the number of distinct edge types
    labels = []
    for u, v, edge_key in WN_graph.edges:
        label = WN_graph[u][v][edge_key]['e_label']
        if label not in labels:
            labels.append(label)
    # labels are consecutive (0-17)
    num_edge_types = len(labels)

    print('Pre-transform: ', dataset[0])
    dataset = dataset.apply_transform(WN_transform,
                                      num_edge_types=num_edge_types,
                                      deep_copy=False)
    print('Post-transform: ', dataset[0])
    print('Initial data: {} nodes; {} edges.'.format(
        dataset[0].G.number_of_nodes(), dataset[0].G.number_of_edges()))
    print('Number of node features: {}'.format(dataset.num_node_features))

    # split dataset
    datasets = {}
    datasets['train'], datasets['val'], datasets['test'] = dataset.split(
        transductive=True, split_ratio=[0.8, 0.1, 0.1])

    print('After split:')
    print('Train message-passing graph: {} nodes; {} edges.'.format(
        datasets['train'][0].G.number_of_nodes(),
        datasets['train'][0].G.number_of_edges()))
    print('Val message-passing graph: {} nodes; {} edges.'.format(
        datasets['val'][0].G.number_of_nodes(),
        datasets['val'][0].G.number_of_edges()))
    print('Test message-passing graph: {} nodes; {} edges.'.format(
        datasets['test'][0].G.number_of_nodes(),
        datasets['test'][0].G.number_of_edges()))

    # node feature dimension
    input_dim = datasets['train'].num_node_features
    edge_feat_dim = datasets['train'].num_edge_features
    num_classes = datasets['train'].num_edge_labels
    print(
        'Node feature dim: {}; edge feature dim: {}; num classes: {}.'.format(
            input_dim, edge_feat_dim, num_classes))

    # relation type is both used for edge features and edge labels
    model = Net(input_dim, edge_feat_dim, num_classes, args).to(args.device)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=0.001,
                                 weight_decay=5e-3)
    follow_batch = []  # e.g., follow_batch = ['edge_index']

    dataloaders = {
        split: DataLoader(ds,
                          collate_fn=Batch.collate(follow_batch),
                          batch_size=1,
                          shuffle=(split == 'train'))
        for split, ds in datasets.items()
    }
    print('Graphs after split: ')
    for key, dataloader in dataloaders.items():
        for batch in dataloader:
            print(key, ': ', batch)

    train(model, dataloaders, optimizer, args)
Example #30
        raise ValueError("Unsupported dataset.")

    if args.netlib == "nx":
        import networkx as netlib
        print("Use NetworkX as the backend network library.")
    elif args.netlib == "sx":
        import snap
        import snapx as netlib
        print("Use SnapX as the backend network library.")
    else:
        raise ValueError("{} network library is not supported.".format(
            args.netlib))

    if args.split == 'random':
        graphs = GraphDataset.pyg_to_graphs(pyg_dataset,
                                            verbose=True,
                                            fixed_split=False,
                                            netlib=netlib)
        dataset = GraphDataset(graphs,
                               task='node')  # node, edge, link_pred, graph
        dataset_train, dataset_val, dataset_test = dataset.split(
            transductive=True,
            split_ratio=[0.8, 0.1, 0.1])  # transductive split, inductive split
    else:
        graphs_train, graphs_val, graphs_test = GraphDataset.pyg_to_graphs(
            pyg_dataset, verbose=True, fixed_split=True, netlib=netlib)

        dataset_train, dataset_val, dataset_test = (
            GraphDataset(graphs_train, task='node'),
            GraphDataset(graphs_val, task='node'),
            GraphDataset(graphs_test, task='node'),
        )