Example #1
def sub_data_maker(data_name):

    name = 'Sub_{}'.format(data_name)
    path = osp.join(osp.dirname(osp.realpath(__file__)), 'data', data_name)
    if data_name == "Flickr":
        dataset = Flickr(path, data_name)
    else:
        dataset = Yelp(path, data_name)

    print(data_name)
    start = time.perf_counter()
    f_data = []
    for i in range(0, 80000, 100):
        # k_hop_subgraph returns (subset, edge_index, mapping, edge_mask);
        # call it once and reuse both the node subset and the relabeled edges.
        subset, sub_edge_index, _, _ = k_hop_subgraph(
            i, 1, dataset.data.edge_index, relabel_nodes=True)
        index = subset.numpy()
        feature = dataset.data.x[index]
        label = dataset.data.y[index]
        data = Data(x=feature, edge_index=sub_edge_index, y=label)
        f_data.append(data)

    os.makedirs('./data/{}/processed'.format(name), exist_ok=True)
    torch.save(f_data, './data/{}/processed/data.pt'.format(name))

    end = time.perf_counter()
    print("time consuming {:.2f}".format(end - start))

    print(f_data[:10])
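
A minimal reload sketch (assuming PyG >= 2.0 and the Sub_Flickr directory produced above; the batch size is an arbitrary illustration):

import torch
from torch_geometric.loader import DataLoader

# Hypothetical reload of the subgraph list saved by sub_data_maker('Flickr').
subgraphs = torch.load('./data/Sub_Flickr/processed/data.pt')
print(len(subgraphs), subgraphs[0])

# Each element is a Data object, so a PyG DataLoader can batch them directly.
loader = DataLoader(subgraphs, batch_size=32, shuffle=True)
for batch in loader:
    print(batch)
    break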
Example #2
def load_dataset(dataset='flickr'):
    """

    Args:
        dataset: str, name of dataset, assuming the raw dataset path is ./data/your_dataset/raw.
                 torch_geometric.dataset will automatically preprocess the raw files and save preprocess dataset into
                 ./data/your_dataset/preprocess

    Returns:
        dataset
    """
    path = osp.join(osp.dirname(osp.realpath(__file__)), 'data', dataset)
    if dataset == 'flickr':
        dataset = Flickr(path)

    elif dataset == 'reddit':
        dataset = Reddit(path)

    elif dataset == 'ppi':
        dataset = PPI(path)

    elif dataset == 'ppi-large':
        dataset = PPI(path)

    elif dataset == 'yelp':
        dataset = Yelp(path)

    else:
        raise KeyError('Dataset name error')

    return dataset
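
A short usage sketch (the ./data/flickr/raw layout is assumed, as described in the docstring):

# Hypothetical call; PyG processes the raw files on first use.
dataset = load_dataset('flickr')
data = dataset[0]
print(dataset.num_classes, data.num_nodes, data.num_edges)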
Example #3
    def __init__(self, path: str):
        pyg_dataset = Flickr(os.path.join(path, '_pyg'))
        # Drop PyG's cached per-index data list (if present) before extracting
        # the single collated graph below.
        if hasattr(pyg_dataset, "__data_list__"):
            delattr(pyg_dataset, "__data_list__")
        if hasattr(pyg_dataset, "_data_list"):
            delattr(pyg_dataset, "_data_list")
        pyg_data = pyg_dataset[0]

        static_graph = GeneralStaticGraphGenerator.create_homogeneous_static_graph(
            {
                'x': pyg_data.x,
                'y': pyg_data.y,
                'train_mask': pyg_data.train_mask,
                'val_mask': pyg_data.val_mask,
                'test_mask': pyg_data.test_mask
            }, pyg_data.edge_index)
        super(FlickrDataset, self).__init__([static_graph])
Example #4
def get_dataset(dataset_name):
    """
    Retrieves the dataset corresponding to the given name.
    """
    path = join('dataset', dataset_name)
    if dataset_name == 'reddit':
        dataset = Reddit(path)
    elif dataset_name == 'flickr':
        dataset = Flickr(path)
    elif dataset_name == 'zinc':
        dataset = ZINC(root='dataset', subset=True, split='train')
    elif dataset_name == 'QM9':
        dataset = QM9(root='dataset')
    elif dataset_name == 'github':
        dataset = GitHub(path)
    elif dataset_name == 'ppi':
        dataset = PPI(path)
    elif dataset_name in ['amazon_comp', 'amazon_photo']:
        name = 'Computers' if dataset_name == 'amazon_comp' else 'Photo'
        dataset = Amazon(path, name, T.NormalizeFeatures())
        data = dataset.data
        # The Amazon datasets ship without a public split, so build a random
        # 60/20/20 train/val/test split over the node indices.
        idx_train, idx_test = train_test_split(list(range(data.x.shape[0])),
                                               test_size=0.4,
                                               random_state=42)
        idx_val, idx_test = train_test_split(idx_test,
                                             test_size=0.5,
                                             random_state=42)
        data.train_mask = torch.tensor(idx_train)
        data.val_mask = torch.tensor(idx_val)
        data.test_mask = torch.tensor(idx_test)
        dataset.data = data
    elif dataset_name in ["Cora", "CiteSeer", "PubMed"]:
        dataset = Planetoid(path,
                            name=dataset_name,
                            split="public",
                            transform=T.NormalizeFeatures())
    else:
        raise NotImplementedError

    return dataset
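
A short usage sketch (names follow the branches above; the dataset/ root is created on first download):

# Hypothetical call for a Planetoid graph with the public split.
dataset = get_dataset('Cora')
data = dataset[0]
print(dataset.num_classes, int(data.train_mask.sum()))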
Example #5
parser.add_argument('--ratio_graph', type=int, default=90)
parser.add_argument("--draw", type=int, default=100)
parser.add_argument('--use_gdc', action='store_true')
parser.add_argument('--save_file', type=str, default="model.pth.tar")
parser.add_argument('--lookback', type=int, default=3)
parser.add_argument("--thres", type=float, default=0.0)
parser.add_argument("--dataset", type=str, default="CiteSeer")
parser.add_argument("--log", type=str, default="{:05d}")
args = parser.parse_args()

dataset = args.dataset
logging.basicConfig(filename=f"test_{dataset}_mask_change.txt",
                    level=logging.DEBUG)
path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', dataset)
if dataset == "F":
    dataset = Flickr(path, transform=T.NormalizeFeatures())
    print(len(dataset))
else:
    dataset = Planetoid(path, dataset, transform=T.NormalizeFeatures())
# print(f"Number of graphs in {dataset} dataset:", len(dataset))
data = dataset[0]
model, data = Net(dataset, data, args).to(device), data.to(device)
checkpoint = torch.load(f"./pretrain_pytorch/{args.dataset}_model.pth.tar")
model.load_state_dict(checkpoint)

loss = lambda m: F.nll_loss(m()[data.train_mask], data.y[data.train_mask])
# print("construct admm training")
support1 = model.adj1  # sparse
support2 = model.adj2  # sparse
partial_adj_mask = support1.clone()
adj_variables = [support1, support2]
Example #6
import os.path as osp

import torch
import torch.nn.functional as F
from torch.nn import Linear
from torch.utils.data import DataLoader
from torch_geometric.datasets import Flickr
import torch_geometric.transforms as T

K = 2
path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'Flickr')
# T.SIGN(K) precomputes K hops of propagated features (stored as data.x1 ... data.xK).
transform = T.Compose([T.NormalizeFeatures(), T.SIGN(K)])
dataset = Flickr(path, transform=transform)
data = dataset[0]

train_idx = data.train_mask.nonzero(as_tuple=False).view(-1)
val_idx = data.val_mask.nonzero(as_tuple=False).view(-1)
test_idx = data.test_mask.nonzero(as_tuple=False).view(-1)

train_loader = DataLoader(train_idx, batch_size=16 * 1024, shuffle=True)
val_loader = DataLoader(val_idx, batch_size=32 * 1024)
test_loader = DataLoader(test_idx, batch_size=32 * 1024)


class Net(torch.nn.Module):
    def __init__(self):
        super().__init__()

        self.lins = torch.nn.ModuleList()
        for _ in range(K + 1):
            self.lins.append(Linear(dataset.num_node_features, 1024))
Example #7
import os.path as osp

import argparse
import torch
import torch.nn.functional as F
from torch_geometric.datasets import Flickr
from torch_geometric.loader import GraphSAINTRandomWalkSampler
from torch_geometric.nn import GraphConv
from torch_geometric.utils import degree

path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'Flickr')
dataset = Flickr(path)
data = dataset[0]
row, col = data.edge_index
data.edge_weight = 1. / degree(col, data.num_nodes)[col]  # Norm by in-degree.

parser = argparse.ArgumentParser()
parser.add_argument('--use_normalization', action='store_true')
args = parser.parse_args()

loader = GraphSAINTRandomWalkSampler(data,
                                     batch_size=6000,
                                     walk_length=2,
                                     num_steps=5,
                                     sample_coverage=100,
                                     save_dir=dataset.processed_dir,
                                     num_workers=4)


class Net(torch.nn.Module):
    def __init__(self, hidden_channels):
Example #8
    def __init__(self, path):
        # Instantiate Flickr once so the raw data is downloaded and processed
        # under `path` before the parent dataset class reads it.
        Flickr(path)
        super(FlickrDataset, self).__init__(path)
Example #9
def load_dataset(device, args):
    """
    Load dataset and move graph and features to device
    """
    if args.dataset in [
            "reddit", "cora", "ppi", "ppi_large", "yelp", "flickr"
    ]:
        if args.dataset == "reddit":
            from dgl.data import RedditDataset
            data = RedditDataset(self_loop=True)
            g = data[0]
            g = dgl.add_self_loop(g)
            n_classes = data.num_classes
        elif args.dataset == "cora":
            from dgl.data import CitationGraphDataset
            data = CitationGraphDataset('cora',
                                        raw_dir=os.path.join(
                                            args.data_dir, 'cora'))
            g = data[0]
            g = dgl.remove_self_loop(g)
            g = dgl.add_self_loop(g)
            n_classes = data.num_classes
        elif args.dataset == "ppi":
            data = load_ppi_data(args.data_dir)
            g = data.g
            n_classes = data.num_classes
        elif args.dataset == "ppi_large":
            data = load_ppi_large_data()
            g = data.g
            n_classes = data.num_classes
        elif args.dataset == "yelp":
            from torch_geometric.datasets import Yelp
            pyg_data = Yelp(os.path.join(args.data_dir, 'yelp'))[0]
            feat = pyg_data.x
            labels = pyg_data.y
            u, v = pyg_data.edge_index
            g = dgl.graph((u, v))
            g.ndata['feat'] = feat
            g.ndata['label'] = labels
            g.ndata['train_mask'] = pyg_data.train_mask
            g.ndata['val_mask'] = pyg_data.val_mask
            g.ndata['test_mask'] = pyg_data.test_mask
            n_classes = labels.size(1)
        elif args.dataset == "flickr":
            from torch_geometric.datasets import Flickr
            pyg_data = Flickr(os.path.join(args.data_dir, "flickr"))[0]
            feat = pyg_data.x
            labels = pyg_data.y
            # labels = torch.argmax(labels, dim=1)
            u, v = pyg_data.edge_index
            g = dgl.graph((u, v))
            g.ndata['feat'] = feat
            g.ndata['label'] = labels
            g.ndata['train_mask'] = pyg_data.train_mask
            g.ndata['val_mask'] = pyg_data.val_mask
            g.ndata['test_mask'] = pyg_data.test_mask
            n_classes = labels.max().item() + 1

        train_mask = g.ndata['train_mask']
        val_mask = g.ndata['val_mask']
        test_mask = g.ndata['test_mask']
        train_nid = train_mask.nonzero().squeeze().long()
        val_nid = val_mask.nonzero().squeeze().long()
        test_nid = test_mask.nonzero().squeeze().long()
        g = g.to(device)
        labels = g.ndata['label']

    else:
        dataset = DglNodePropPredDataset(name=args.dataset, root=args.data_dir)
        splitted_idx = dataset.get_idx_split()
        train_nid = splitted_idx["train"]
        val_nid = splitted_idx["valid"]
        test_nid = splitted_idx["test"]
        g, labels = dataset[0]
        n_classes = dataset.num_classes
        g = g.to(device)

        if args.dataset == "ogbn-arxiv":
            g = dgl.add_reverse_edges(g, copy_ndata=True)
            g = dgl.add_self_loop(g)
            g.ndata['feat'] = g.ndata['feat'].float()

        elif args.dataset == "ogbn-papers100M":
            g = dgl.add_reverse_edges(g, copy_ndata=True)
            g.ndata['feat'] = g.ndata['feat'].float()
            labels = labels.long()

        elif args.dataset == "ogbn-mag":
            # MAG is a heterogeneous graph. The task is to make predictions
            # for paper nodes.
            path = os.path.join(args.emb_path, f"{args.pretrain_model}_mag")
            labels = labels["paper"]
            train_nid = train_nid["paper"]
            val_nid = val_nid["paper"]
            test_nid = test_nid["paper"]
            features = g.nodes['paper'].data['feat']
            author_emb = torch.load(os.path.join(path, "author.pt"),
                                    map_location=torch.device("cpu")).float()
            topic_emb = torch.load(os.path.join(path, "field_of_study.pt"),
                                   map_location=torch.device("cpu")).float()
            institution_emb = torch.load(
                os.path.join(path, "institution.pt"),
                map_location=torch.device("cpu")).float()

            g.nodes["author"].data["feat"] = author_emb.to(device)
            g.nodes["institution"].data["feat"] = institution_emb.to(device)
            g.nodes["field_of_study"].data["feat"] = topic_emb.to(device)
            g.nodes["paper"].data["feat"] = features.to(device)
            paper_dim = g.nodes["paper"].data["feat"].shape[1]
            author_dim = g.nodes["author"].data["feat"].shape[1]
            if paper_dim != author_dim:
                paper_feat = g.nodes["paper"].data.pop("feat")
                rand_weight = torch.Tensor(paper_dim,
                                           author_dim).uniform_(-0.5, 0.5)
                g.nodes["paper"].data["feat"] = torch.matmul(
                    paper_feat, rand_weight.to(device))
                print(
                    f"Randomly project paper feature from dimension {paper_dim} to {author_dim}"
                )

            labels = labels.to(device).squeeze()
            n_classes = int(labels.max() - labels.min()) + 1

        else:
            g.ndata['feat'] = g.ndata['feat'].float()

        labels = labels.squeeze()

    evaluator = get_evaluator(args.dataset)

    print(f"# Nodes: {g.number_of_nodes()}\n"
          f"# Edges: {g.number_of_edges()}\n"
          f"# Train: {len(train_nid)}\n"
          f"# Val: {len(val_nid)}\n"
          f"# Test: {len(test_nid)}\n"
          f"# Classes: {n_classes}")

    return g, labels, n_classes, train_nid, val_nid, test_nid, evaluator
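
A minimal calling sketch (the args fields shown are assumptions based on the attributes read by the "flickr" branch above; get_evaluator and load_dataset are the functions from this codebase):

import argparse
import torch

# Hypothetical argument namespace; only the fields used by the flickr branch are set.
args = argparse.Namespace(dataset="flickr", data_dir="./data")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

g, labels, n_classes, train_nid, val_nid, test_nid, evaluator = load_dataset(
    device, args)
print(n_classes, len(train_nid))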
Example #10
elif DATASET == "Reddit":
    real_data = Reddit(root=input_path)
elif DATASET == "Amazon Computers":
    real_data = Amazon(root=input_path, name="Computers")
elif DATASET == "Amazon Photos":
    real_data = Amazon(root=input_path, name="Photo")
elif DATASET == "CLUSTER":
    real_data = GNNBenchmarkDataset(root=input_path,
                                    name="CLUSTER",
                                    split="test")
elif DATASET == "PATTERN":
    real_data = GNNBenchmarkDataset(root=input_path,
                                    name="PATTERN",
                                    split="test")
elif DATASET == "Flickr":
    real_data = Flickr(root=input_path)
elif DATASET == "OGB Products":
    real_data = PygNodePropPredDataset(name='ogbn-products')
    split_idx = real_data.get_idx_split()
elif DATASET == "GitHub Network":
    gitGraph = from_networkx(load_graph(input_path + '/musae_git_edges.csv'))
    gitGraph.x = torch.tensor(
        load_features(input_path + '/musae_git_features.json'))
    gitGraph.y = torch.tensor(
        load_targets(input_path + '/musae_git_target.csv'))
elif DATASET == "SBM":

    # Size of blocks
    COMMUNITY_SIZE = 400

    # Number of clusters
Example #11
    def prepare_data(self):
        path = osp.join(osp.dirname(osp.realpath(__file__)), "..", "..",
                        "data", self.NAME)
        self.dataset = Flickr(path)
        self.data = self.dataset[0]
Example #12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', type=str, default='0')
    parser.add_argument('--model', type=str,
                        default='indGCN')  # indGCN, GraphSAGE
    parser.add_argument('--dataset', type=str,
                        default='Reddit')  # Reddit; Flickr
    parser.add_argument('--batch', type=int, default=512)  # 512; 1024
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--hidden', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--binarize', action='store_true')
    args = parser.parse_args()
    print(args)

    assert args.dataset in [
        'Flickr', 'Reddit'
    ], 'For dataset, only Flickr and Reddit are available'

    path = '/home/wangjunfu/dataset/graph/' + str(args.dataset)
    if args.dataset == 'Flickr':
        dataset = Flickr(path)
    else:
        dataset = Reddit(path)

    data = dataset[0]

    train_loader = NeighborSampler(data.edge_index,
                                   node_idx=data.train_mask,
                                   sizes=[25, 10],
                                   batch_size=args.batch,
                                   shuffle=True,
                                   num_workers=12)
    subgraph_loader = NeighborSampler(data.edge_index,
                                      node_idx=None,
                                      sizes=[-1],
                                      batch_size=args.batch,
                                      shuffle=False,
                                      num_workers=12)

    device = torch.device(
        f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu')
    assert args.model in ['indGCN', 'GraphSAGE'
                          ], 'Only indGCN and GraphSAGE are available.'
    model = NeighborSamplingGCN(args.model, dataset.num_features, args.hidden,
                                dataset.num_classes, args.binarize,
                                args.dropout).to(device)

    test_accs = []
    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        best_test = 0.0
        best_val = 0.0
        for epoch in range(1, args.epochs + 1):
            loss, acc = train(model, data, train_loader, optimizer, device)
            train_f1, val_f1, test_f1 = test(model, data, subgraph_loader,
                                             device)
            if val_f1 > best_val:
                best_val = val_f1
                best_test = test_f1

            if args.runs == 1:
                print(
                    "Epoch: {:d}, Loss:{:.4f}, Train f1: {:.4f}, Val f1: {:.4f}, Test f1: {:.4f}"
                    .format(epoch, loss, train_f1, val_f1, test_f1))

        test_accs.append(best_test)
        print("Run: {:d}, best_test: {:.4f}".format(run, best_test))

    test_accs = torch.tensor(test_accs)
    print("Average test f1 score:{:.4f} ± {:.4f}".format(
        test_accs.mean(), test_accs.std()))
Example #13
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--device", type=str, default='0')
    parser.add_argument('--model', type=str, default='GraphSAINT')
    parser.add_argument('--dataset', type=str,
                        default='Reddit')  # Reddit or Flickr
    parser.add_argument('--batch', type=int,
                        default=2000)  # Reddit:2000, Flickr:6000
    parser.add_argument('--walk_length', type=int,
                        default=4)  # Reddit:4, Flickr:2
    parser.add_argument('--sample_coverage', type=int,
                        default=50)  # Reddit:50, Flickr:100
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--epochs', type=int, default=100)  # 100, 50
    parser.add_argument('--lr', type=float, default=0.01)  # 0.01, 0.001
    parser.add_argument('--weight_decay', type=float, default=0.0005)
    parser.add_argument('--hidden', type=int, default=256)  # 128, 256
    parser.add_argument('--dropout', type=float, default=0.1)  # 0.1, 0.2
    parser.add_argument('--use_normalization', action='store_true')
    parser.add_argument('--binarize', action='store_true')
    args = parser.parse_args()

    assert args.model in ['GraphSAINT']
    assert args.dataset in ['Flickr', 'Reddit']
    path = '/home/wangjunfu/dataset/graph/' + str(args.dataset)
    if args.dataset == 'Flickr':
        dataset = Flickr(path)
    else:
        dataset = Reddit(path)
    data = dataset[0]
    row, col = data.edge_index
    data.edge_weight = 1. / degree(col,
                                   data.num_nodes)[col]  # Norm by in-degree.
    loader = GraphSAINTRandomWalkSampler(data,
                                         batch_size=args.batch,
                                         walk_length=args.walk_length,
                                         num_steps=5,
                                         sample_coverage=args.sample_coverage,
                                         save_dir=dataset.processed_dir,
                                         num_workers=0)

    device = torch.device(
        f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu')
    model = SAINT(data.num_node_features, args.hidden, dataset.num_classes,
                  args.dropout, args.binarize).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    val_f1s, test_f1s = [], []
    for run in range(1, args.runs + 1):
        best_val, best_test = 0, 0
        model.reset_parameters()
        start_time = time.time()
        for epoch in range(1, args.epochs + 1):
            loss = train(model, loader, optimizer, device,
                         args.use_normalization)
            accs = test(model, data, device, args.use_normalization)
            if accs[1] > best_val:
                best_val = accs[1]
                best_test = accs[2]
            if args.runs == 1:
                print(
                    f'Epoch: {epoch:02d}, Loss: {loss:.4f}, Train: {accs[0]:.4f}, '
                    f'Val: {accs[1]:.4f}, Test: {accs[2]:.4f}')
        test_f1s.append(best_test)
        print(
            "Run: {:d}, best val: {:.4f}, best test: {:.4f}, time cost: {:d}s".
            format(run, best_val, best_test, int(time.time() - start_time)))

    test_f1s = torch.tensor(test_f1s)
    print("{:.4f} ± {:.4f}".format(test_f1s.mean(), test_f1s.std()))