def main():
    """Train a heterogeneous GNN for link prediction on the WordNet graph.

    Reads the pickled graph from ``args.data_path``, counts the distinct
    edge labels to size the transform, builds a transductive link-prediction
    split, and runs training.
    """
    args = arg_parse()
    edge_train_mode = args.mode
    print('edge train mode: {}'.format(edge_train_mode))
    G = nx.read_gpickle(args.data_path)
    print(G.number_of_edges())
    print('Each node has node ID (n_id). Example: ', G.nodes[0])
    print(
        'Each edge has edge ID (id) and categorical label (e_label). Example: ',
        G[0][5871])

    # Find the number of edge types. Labels are consecutive (0-17), so the
    # count of distinct labels equals the number of edge types. A set gives
    # O(1) membership instead of the original O(n) list scan.
    labels = set()
    for u, v, edge_key in G.edges:
        labels.add(G[u][v][edge_key]['e_label'])
    num_edge_types = len(labels)

    H = WN_transform(G, num_edge_types)
    # The nodes in the graph have the features: node_feature and node_type
    # (just one node type "n1" here).
    for node in H.nodes(data=True):
        print(node)
        break
    # The edges in the graph have the features: edge_feature and edge_type
    # ("0" - "17" here).
    for edge in H.edges(data=True):
        print(edge)
        break

    hete = HeteroGraph(H)
    dataset = GraphDataset([hete], task='link_pred')
    dataset_train, dataset_val, dataset_test = dataset.split(
        transductive=True, split_ratio=[0.8, 0.1, 0.1])
    train_loader = DataLoader(dataset_train, collate_fn=Batch.collate(),
                              batch_size=1)
    val_loader = DataLoader(dataset_val, collate_fn=Batch.collate(),
                            batch_size=1)
    test_loader = DataLoader(dataset_test, collate_fn=Batch.collate(),
                             batch_size=1)
    dataloaders = {
        'train': train_loader,
        'val': val_loader,
        'test': test_loader,
    }

    hidden_size = 32
    model = HeteroNet(hete, hidden_size, 0.2).to(args.device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001,
                                 weight_decay=5e-4)
    train(model, dataloaders, optimizer, args)
def run(proc_id, n_gpus, devices):
    """Per-process entry point for (optionally distributed) training.

    NOTE(review): relies on module-level globals (dataset_train, dataset_val,
    dataset_test, Net, train, test) defined elsewhere in the file — confirm
    they exist at call time.

    Args:
        proc_id: index of this worker process.
        n_gpus: total number of GPUs/processes.
        devices: list of device ids, indexed by proc_id.
    """
    # Initialize the process group BEFORE wrapping the model in DDP:
    # DistributedDataParallel requires an initialized default group.
    # (The original called init_process_group after the DDP wrap.)
    if n_gpus > 1:
        dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
            master_ip='127.0.0.1', master_port='12346')
        torch.distributed.init_process_group(
            backend="nccl",
            init_method=dist_init_method,
            world_size=n_gpus,
            rank=proc_id)

    # Bind this worker's shard to a NEW local name: the original assigned
    # back to `dataset_train`, which makes the name local to the function
    # and raises UnboundLocalError when read on the right-hand side.
    train_shard = torch.split(dataset_train, len(dataset_train) // n_gpus)[proc_id]
    train_loader = DataLoader(train_shard, collate_fn=Batch.collate(),
                              batch_size=16)  # basic data loader
    val_loader = DataLoader(dataset_val, collate_fn=Batch.collate(),
                            batch_size=16)  # basic data loader
    test_loader = DataLoader(dataset_test, collate_fn=Batch.collate(),
                             batch_size=16)  # basic data loader

    dev_id = devices[proc_id]
    torch.cuda.set_device(dev_id)
    model = Net().to(dev_id)
    model.reset_parameters()
    if n_gpus > 1:
        model = DistributedDataParallel(model, device_ids=[dev_id],
                                        output_device=dev_id)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01,
                                 weight_decay=5e-3)

    val_max = -math.inf
    best_model = model
    for epoch in range(1, 201):
        train()
        log = 'Epoch: {:03d}, Train: {:.4f}, Val: {:.4f}, Test: {:.4f}'
        train_acc, val_acc, test_acc = test()
        print(log.format(epoch, train_acc, val_acc, test_acc))
        if val_max < val_acc:
            val_max = val_acc
    log = 'Best, Train: {:.4f}, Val: {:.4f}, Test: {:.4f}'
    train_acc, val_acc, test_acc = test()
    print(log.format(train_acc, val_acc, test_acc))
def test_torch_dataloader_collate(self):
    """Graph classification: split ENZYMES 80/10/10 and check batch counts.

    Fixed: the 'val' and 'test' assertions previously compared against each
    other's expected counts (swapped variables). Both ratios are 0.1 so the
    numeric result was unchanged, but the intent is now explicit.
    """
    pyg_dataset = TUDataset('./enzymes', 'ENZYMES')
    graphs = GraphDataset.pyg_to_graphs(pyg_dataset)
    dataset = GraphDataset(graphs, task="graph")
    train_batch_num = math.ceil(len(dataset) * 0.8 / 32)
    val_batch_num = math.ceil(len(dataset) * 0.1 / 32)
    test_batch_num = math.ceil(len(dataset) * 0.1 / 32)
    datasets = {}
    datasets['train'], datasets['val'], datasets['test'] = \
        dataset.split(transductive=False, split_ratio=[0.8, 0.1, 0.1])
    dataloaders = {
        split: DataLoader(ds, collate_fn=Batch.collate(), batch_size=32,
                          shuffle=True)
        for split, ds in datasets.items()
    }
    self.assertEqual(len(dataloaders['train']), train_batch_num)
    self.assertEqual(len(dataloaders['val']), val_batch_num)
    self.assertEqual(len(dataloaders['test']), test_batch_num)
    # Every batch except possibly the last must be full (32 graphs).
    for split in ('train', 'val', 'test'):
        loader = dataloaders[split]
        for i, data in enumerate(loader):
            if i != len(loader) - 1:
                self.assertEqual(data.num_graphs, 32)
def test_hetero_graph_batch(self):
    """Batch 30 clones of a tensor-backed HeteroGraph in groups of 3."""
    G = generate_simple_hete_graph()
    base = HeteroGraph(G)
    # Rebuild from raw tensors to exercise the tensor-backend constructor.
    base = HeteroGraph(
        node_feature=base.node_feature,
        node_label=base.node_label,
        edge_feature=base.edge_feature,
        edge_label=base.edge_label,
        edge_index=base.edge_index,
        directed=True,
    )
    clones = [base.clone() for _ in range(30)]
    loader = DataLoader(
        clones,
        collate_fn=Batch.collate(),
        batch_size=3,
        shuffle=True,
    )
    self.assertEqual(len(loader), math.ceil(30 / 3))
    for batch in loader:
        self.assertEqual(batch.num_graphs, 3)
def gen_data_loaders(self, batch_size, train=True):
    """Return three shuffled loaders over the train (or test) set.

    Each loader uses half of ``batch_size`` per batch.
    """
    source = self.train if train else self.test
    loaders = []
    for _ in range(3):
        loaders.append(
            TorchDataLoader(
                source,
                collate_fn=Batch.collate([]),
                batch_size=batch_size // 2,
                shuffle=True,
            )
        )
    return loaders
def deepsnap_ego(args, pyg_dataset):
    """Benchmark the DeepSNAP ego-net transform over ``args.num_runs`` runs.

    Each run rebuilds the dataset, splits it inductively, and times how long
    it takes to apply ``ego_nets`` to every training batch; the accumulated
    time divided by the run count is reported at the end.
    """
    total_time = 0
    task = "graph"
    for run_idx in range(args.num_runs):
        if args.print_run:
            print("Run {}".format(run_idx + 1))
        graphs = GraphDataset.pyg_to_graphs(
            pyg_dataset, verbose=True, netlib=netlib)
        dataset = GraphDataset(graphs, task=task)
        splits = {}
        splits['train'], splits['val'], splits['test'] = dataset.split(
            transductive=False, split_ratio=[0.8, 0.1, 0.1], shuffle=False)
        dataloaders = {
            name: DataLoader(split, collate_fn=Batch.collate(),
                             batch_size=1, shuffle=False)
            for name, split in splits.items()
        }
        start = time.time()
        for batch in dataloaders['train']:
            batch = batch.apply_transform(ego_nets, update_tensor=True)
        total_time += time.time() - start
    print("DeepSNAP has average time: {}".format(total_time / args.num_runs))
def test_pyg_to_graphs_global(self):
    """Verify node splits under both the networkx and snapx global backends."""
    import deepsnap

    # --- networkx backend ---
    deepsnap.use(nx)
    pyg_dataset = Planetoid('./planetoid', "Cora")
    graphs = GraphDataset.pyg_to_graphs(pyg_dataset)
    self.assertTrue(isinstance(graphs[0].G, nx.Graph))
    dataset = GraphDataset(graphs, task='node')
    total = dataset.num_nodes[0]
    expected_train = int(0.8 * total)
    expected_val = int(0.1 * total)
    expected_test = total - expected_train - expected_val
    train, val, test = dataset.split()
    self.assertTrue(isinstance(train[0].G, nx.Graph))
    self.assertTrue(isinstance(val[0].G, nx.Graph))
    self.assertTrue(isinstance(test[0].G, nx.Graph))
    self.assertEqual(train[0].node_label_index.shape[0], expected_train)
    self.assertEqual(val[0].node_label_index.shape[0], expected_val)
    self.assertEqual(test[0].node_label_index.shape[0], expected_test)
    loader = DataLoader(train, collate_fn=Batch.collate(), batch_size=1)
    for batch in loader:
        self.assertTrue(isinstance(batch.G[0], nx.Graph))

    # --- snapx backend ---
    deepsnap.use(sx)
    graphs = GraphDataset.pyg_to_graphs(pyg_dataset)
    self.assertTrue(isinstance(graphs[0].G, sx.Graph))
    dataset = GraphDataset(graphs, task='node')
    total = dataset.num_nodes[0]
    expected_train = int(0.8 * total)
    expected_val = int(0.1 * total)
    expected_test = total - expected_train - expected_val
    train, val, test = dataset.split()
    self.assertTrue(isinstance(train[0].G, sx.Graph))
    self.assertTrue(isinstance(val[0].G, sx.classes.graph.Graph))
    self.assertTrue(isinstance(test[0].G, sx.classes.graph.Graph))
    self.assertEqual(train[0].node_label_index.shape[0], expected_train)
    self.assertEqual(val[0].node_label_index.shape[0], expected_val)
    self.assertEqual(test[0].node_label_index.shape[0], expected_test)
    loader = DataLoader(train, collate_fn=Batch.collate(), batch_size=1)
    for batch in loader:
        self.assertTrue(isinstance(batch.G[0], sx.Graph))
def main():
    """Link prediction on Cora, optionally replicated into a multigraph.

    Builds tensor-backend graphs from Planetoid Cora, splits them for
    link prediction, and trains with SGD + cosine annealing.
    (Removed: commented-out Adam optimizer dead code.)
    """
    args = arg_parse()
    # the input that we assume users have
    pyg_dataset = Planetoid('./cora', 'Cora', transform=T.TargetIndegree())
    edge_train_mode = args.mode
    print('edge train mode: {}'.format(edge_train_mode))

    graphs = GraphDataset.pyg_to_graphs(pyg_dataset, tensor_backend=True)
    if args.multigraph:
        # Replicate the single graph 10x to exercise the inductive path.
        graphs = [copy.deepcopy(graphs[0]) for _ in range(10)]

    dataset = GraphDataset(
        graphs,
        task='link_pred',
        edge_message_ratio=args.edge_message_ratio,
        edge_train_mode=edge_train_mode)
    print('Initial dataset: {}'.format(dataset))

    # split dataset (transductive for the single-graph case)
    datasets = {}
    datasets['train'], datasets['val'], datasets['test'] = dataset.split(
        transductive=not args.multigraph, split_ratio=[0.85, 0.05, 0.1])
    print('after split')
    print('Train message-passing graph: {} nodes; {} edges.'.format(
        datasets['train'][0].num_nodes, datasets['train'][0].num_edges))
    print('Val message-passing graph: {} nodes; {} edges.'.format(
        datasets['val'][0].num_nodes, datasets['val'][0].num_edges))
    print('Test message-passing graph: {} nodes; {} edges.'.format(
        datasets['test'][0].num_nodes, datasets['test'][0].num_edges))

    # node feature dimension
    input_dim = datasets['train'].num_node_features
    # link prediction needs 2 classes (0, 1)
    num_classes = datasets['train'].num_edge_labels

    model = Net(input_dim, num_classes, args).to(args.device)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9,
                                weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=args.epochs)

    follow_batch = []  # e.g., follow_batch = ['edge_index']
    dataloaders = {
        split: DataLoader(
            ds, collate_fn=Batch.collate(follow_batch),
            batch_size=args.batch_size, shuffle=(split == 'train'))
        for split, ds in datasets.items()
    }
    print('Graphs after split: ')
    for key, dataloader in dataloaders.items():
        for batch in dataloader:
            print(key, ': ', batch)
    train(model, dataloaders, optimizer, args, scheduler=scheduler)
def create_loader(datasets):
    """Build one DataLoader per dataset; only the first (train) is shuffled."""
    train_loader = DataLoader(
        datasets[0],
        collate_fn=Batch.collate(),
        batch_size=cfg.train.batch_size,
        shuffle=True,
        num_workers=cfg.num_workers,
        pin_memory=False,
    )
    eval_loaders = [
        DataLoader(
            ds,
            collate_fn=Batch.collate(),
            batch_size=cfg.train.batch_size,
            shuffle=False,
            num_workers=cfg.num_workers,
            pin_memory=False,
        )
        for ds in datasets[1:]
    ]
    return [train_loader] + eval_loaders
def test_hetero_graph_batch(self):
    """Clone a heterogeneous graph 30 times and batch it in groups of 3."""
    G = generate_simple_hete_graph()
    hete = HeteroGraph(G)
    dataset = [hete.clone() for _ in range(30)]
    loader = DataLoader(
        dataset,
        collate_fn=Batch.collate(),
        batch_size=3,
        shuffle=True,
    )
    self.assertEqual(len(loader), math.ceil(30 / 3))
    for batch in loader:
        self.assertEqual(batch.num_graphs, 3)
def gen_data_loaders(self, size, batch_size, train=True,
                     use_distributed_sampling=False):
    """Build two synthetic-graph loaders plus a placeholder list.

    Fixed: the original batch-size expression
    ``batch_size // 2 if i == 0 else batch_size // 2`` had identical
    branches — simplified to ``batch_size // 2``.

    Args:
        size: total number of graphs across both loaders.
        batch_size: full batch size; each loader uses half.
        train: unused here (kept for interface compatibility).
        use_distributed_sampling: if True, shard via Horovod rank/size.
    """
    loaders = []
    for _ in range(2):
        dataset = combined_syn.get_dataset(
            "graph", size // 2,
            np.arange(self.min_size + 1, self.max_size + 1))
        if use_distributed_sampling:
            sampler = torch.utils.data.distributed.DistributedSampler(
                dataset, num_replicas=hvd.size(), rank=hvd.rank())
        else:
            sampler = None
        loaders.append(
            TorchDataLoader(
                dataset,
                collate_fn=Batch.collate([]),
                batch_size=batch_size // 2,
                sampler=sampler,
                shuffle=False))
    # Placeholder entries consumed alongside the real loaders.
    loaders.append([None] * (size // batch_size))
    return loaders
def gen_data_loaders(self, size, batch_size, train=True,
                     use_distributed_sampling=False):
    """Build two loaders of sampled neighborhoods plus a placeholder list.

    Fixed: the original batch-size expression
    ``batch_size // 2 if i == 0 else batch_size // 2`` had identical
    branches — simplified to ``batch_size // 2``.

    Args:
        size: total number of sampled neighborhoods across both loaders.
        batch_size: full batch size; each loader uses half.
        train: sample from the train set if True, else the test set.
        use_distributed_sampling: unused here (kept for interface parity).
    """
    loaders = []
    for _ in range(2):
        neighs = []
        for _ in range(size // 2):
            # Sample a random-size neighborhood and keep its induced subgraph.
            graph, neigh = utils.sample_neigh(
                self.train_set if train else self.test_set,
                random.randint(self.min_size, self.max_size))
            neighs.append(graph.subgraph(neigh))
        dataset = GraphDataset(GraphDataset.list_to_graphs(neighs))
        loaders.append(
            TorchDataLoader(
                dataset,
                collate_fn=Batch.collate([]),
                batch_size=batch_size // 2,
                sampler=None,
                shuffle=False))
    # Placeholder entries consumed alongside the real loaders.
    loaders.append([None] * (size // batch_size))
    return loaders
# Build a node-classification dataset from the BioSNAP function-function
# network and prepare train/val/test loaders.
name = 'BioSNAP-Function-Function'
f = datadir + 'minerff.tsv'
f2 = datadir + 'minerf.tsv'
d = readFilePD(f, ['relation'])
d2 = readFilePD(f2, ['namespace'])
# label node feature as 'node feature'
nxg = pdToNx2(d, d2, 'GO_id0', 'GO_id2', 'relation', 'GO_id1', 'namespace')
dg = deepsnap.graph.Graph(nxg)
# NOTE(review): a single Graph (not a list) is passed to GraphDataset here,
# while other call sites pass a list — confirm GraphDataset accepts this.
graphs = dg
dataset = GraphDataset(graphs, task='node')  # node, edge, link_pred, graph
dataset_train, dataset_val, dataset_test = dataset.split(
    transductive=True,
    split_ratio=[0.8, 0.1, 0.1])  # transductive split, inductive split
train_loader = DataLoader(dataset_train, collate_fn=Batch.collate(),
                          batch_size=16)  # basic data loader
val_loader = DataLoader(dataset_val, collate_fn=Batch.collate(),
                        batch_size=16)  # basic data loader
test_loader = DataLoader(dataset_test, collate_fn=Batch.collate(),
                         batch_size=16)  # basic data loader


class Net(torch.nn.Module):
    # Two-layer SplineConv network: 1 input channel -> 16 hidden -> 4 outputs.
    def __init__(self):
        super(Net, self).__init__()
        # self.conv1 = GCNConv(dataset.num_node_features, 1)
        # self.conv2 = GCNConv(16, dataset.num_node_labels)
        self.conv1 = SplineConv(1, 16, dim=1, kernel_size=2)
        self.conv2 = SplineConv(16, 4, dim=1, kernel_size=2)
if accs[1] > best_val: best_val = accs[1] best_model = copy.deepcopy(model) return accs if __name__ == "__main__": cora_pyg = Planetoid('./cora', 'Cora') citeseer_pyg = Planetoid('./citeseer', 'CiteSeer') G = concatenate_citeseer_cora(cora_pyg[0], citeseer_pyg[0]) hete = HeteroGraph(G) print("Heterogeneous graph {} nodes, {} edges".format(hete.num_nodes, hete.num_edges)) dataset = GraphDataset([hete], task='node') dataset_train, dataset_val, dataset_test = dataset.split(transductive=True, split_ratio=[0.8, 0.1, 0.1]) train_loader = DataLoader(dataset_train, collate_fn=Batch.collate(), batch_size=16) val_loader = DataLoader(dataset_val, collate_fn=Batch.collate(), batch_size=16) test_loader = DataLoader(dataset_test, collate_fn=Batch.collate(), batch_size=16) loaders = [train_loader, val_loader, test_loader] hidden_size = 32 model = HeteroNet(hete, hidden_size, 0.5).to(device) optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-3) num_epochs = 100 train_accs, valid_accs, test_accs = [], [], [] for epoch in range(num_epochs):
def main():
    """Link prediction on WordNet with a tensor-backend HeteroGraph.

    Reads the pickled graph, counts the distinct edge labels, applies
    ``WN_transform``, rebuilds the HeteroGraph from raw tensors, splits
    transductively, and trains a two-layer HeteroGNN.
    """
    args = arg_parse()
    edge_train_mode = args.mode
    print('edge train mode: {}'.format(edge_train_mode))
    G = nx.read_gpickle(args.data_path)
    print(G.number_of_edges())
    print('Each node has node ID (n_id). Example: ', G.nodes[0])
    print(
        'Each edge has edge ID (id) and categorical label (e_label). Example: ',
        G[0][5871])

    # Find the number of edge types. Labels are consecutive (0-17), so the
    # count of distinct labels equals the number of edge types. A set gives
    # O(1) membership instead of the original O(n) list scan.
    labels = set()
    for u, v, edge_key in G.edges:
        labels.add(G[u][v][edge_key]['e_label'])
    num_edge_types = len(labels)

    H = WN_transform(G, num_edge_types)
    # The nodes in the graph have the features: node_feature and node_type
    # (just one node type "n1" here).
    for node in H.nodes(data=True):
        print(node)
        break
    # The edges in the graph have the features: edge_feature and edge_type
    # ("0" - "17" here).
    for edge in H.edges(data=True):
        print(edge)
        break

    hetero = HeteroGraph(H)
    # Rebuild with explicit tensors to drop the networkx backing graph.
    hetero = HeteroGraph(
        edge_index=hetero.edge_index,
        edge_feature=hetero.edge_feature,
        node_feature=hetero.node_feature,
        directed=hetero.is_directed())

    if edge_train_mode == "disjoint":
        dataset = GraphDataset(
            [hetero],
            task='link_pred',
            edge_train_mode=edge_train_mode,
            edge_message_ratio=args.edge_message_ratio)
    else:
        dataset = GraphDataset(
            [hetero],
            task='link_pred',
            edge_train_mode=edge_train_mode,
        )
    dataset_train, dataset_val, dataset_test = dataset.split(
        transductive=True, split_ratio=[0.8, 0.1, 0.1])
    train_loader = DataLoader(dataset_train, collate_fn=Batch.collate(),
                              batch_size=1)
    val_loader = DataLoader(dataset_val, collate_fn=Batch.collate(),
                            batch_size=1)
    test_loader = DataLoader(dataset_test, collate_fn=Batch.collate(),
                             batch_size=1)
    dataloaders = {
        'train': train_loader,
        'val': val_loader,
        'test': test_loader,
    }

    hidden_size = args.hidden_dim
    conv1, conv2 = generate_2convs_link_pred_layers(hetero, HeteroSAGEConv,
                                                    hidden_size)
    model = HeteroGNN(conv1, conv2, hetero, hidden_size).to(args.device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                 weight_decay=args.weight_decay)
    t_accu, v_accu, e_accu = train(model, dataloaders, optimizer, args)
# The edges in the graph have the features: edge_type ("cora_edge" or "citeseer_edge") print("The edges in the concatenated heterogeneous graph have the following features:") for edge in G.edges(data=True): print(edge[2]) break hete = HeteroGraph(G) print(f"Heterogeneous graph {hete.num_nodes()} nodes, {hete.num_edges()} edges") dataset = GraphDataset([hete], task='node') dataset_train, dataset_val, dataset_test = dataset.split( transductive=True, split_ratio=[0.8, 0.1, 0.1] ) train_loader = DataLoader( dataset_train, collate_fn=Batch.collate(), batch_size=16 ) val_loader = DataLoader( dataset_val, collate_fn=Batch.collate(), batch_size=16 ) test_loader = DataLoader( dataset_test, collate_fn=Batch.collate(), batch_size=16 ) loaders = [train_loader, val_loader, test_loader] hidden_size = 32 model = HeteroNet(hete, hidden_size, 0.5).to(device) optimizer = torch.optim.Adam( model.parameters(), lr=0.01, weight_decay=5e-3 ) num_epochs = 100
netlib=netlib) dataset = GraphDataset(graphs, task='node') # node, edge, link_pred, graph dataset_train, dataset_val, dataset_test = dataset.split( transductive=True, split_ratio=[0.8, 0.1, 0.1]) # transductive split, inductive split else: graphs_train, graphs_val, graphs_test = \ GraphDataset.pyg_to_graphs(pyg_dataset, verbose=True, fixed_split=True, netlib=netlib) dataset_train, dataset_val, dataset_test = \ GraphDataset(graphs_train, task='node'), GraphDataset(graphs_val,task='node'), \ GraphDataset(graphs_test, task='node') train_loader = DataLoader(dataset_train, collate_fn=Batch.collate(), batch_size=16) # basic data loader val_loader = DataLoader(dataset_val, collate_fn=Batch.collate(), batch_size=16) # basic data loader test_loader = DataLoader(dataset_test, collate_fn=Batch.collate(), batch_size=16) # basic data loader num_node_features = dataset_train.num_node_features num_classes = dataset_train.num_node_labels train(train_loader, val_loader, test_loader, args, num_node_features, num_classes, args.device)
def main():
    """Build a CMAP/PPI heterogeneous link-prediction dataset and train.

    NOTE(review): the unconditional ``exit()`` below stops the run right
    after the dataset sanity checks, so the model/optimizer/training code
    after it never executes — presumably leftover debugging; confirm
    before relying on this script to train.
    """
    writer = SummaryWriter()
    args = arg_parse()
    edge_train_mode = args.mode
    print('edge train mode: {}'.format(edge_train_mode))
    ppi_graph = read_ppi_data(args.ppi_path)
    # Hard-coded source selection: 'mixed' merges CMAP data into the PPI graph.
    mode = 'mixed'
    if mode == 'ppi':
        message_passing_graph = ppi_graph
        cmap_graph, knockout_nodes = read_cmap_data(args.data_path)
    elif mode == 'mixed':
        message_passing_graph, knockout_nodes = (
            read_cmap_data(args.data_path, ppi_graph)
        )
    print('Each node has gene ID. Example: ',
          message_passing_graph.nodes['ADPGK'])
    print('Each edge has de direction. Example',
          message_passing_graph['ADPGK']['IL1B'])
    print('Total num edges: ', message_passing_graph.number_of_edges())

    # disjoint edge label: per knockout node, randomly route its outgoing
    # edges with edge_type == 1 into supervision / val / train buckets.
    disjoint_split_ratio = 0.1
    val_ratio = 0.1
    disjoint_edge_label_index = []
    val_edges = []
    # newly edited
    train_edges = []
    for u in knockout_nodes:
        rand_num = np.random.rand()
        if rand_num < disjoint_split_ratio:
            # add all edges (cmap only) into edge label index
            # cmap is not a multigraph
            disjoint_edge_label_index.extend(
                [
                    (u, v, edge_key)
                    for v in message_passing_graph.successors(u)
                    for edge_key in message_passing_graph[u][v]
                    if message_passing_graph[u][v][edge_key]['edge_type'] == 1
                ]
            )
            # supervision edges are also kept in the train message graph
            train_edges.extend(
                [
                    (u, v, edge_key)
                    for v in message_passing_graph.successors(u)
                    for edge_key in message_passing_graph[u][v]
                    if message_passing_graph[u][v][edge_key]['edge_type'] == 1
                ]
            )
        elif rand_num < disjoint_split_ratio + val_ratio:
            val_edges.extend(
                [
                    (u, v, edge_key)
                    for v in message_passing_graph.successors(u)
                    for edge_key in message_passing_graph[u][v]
                    if message_passing_graph[u][v][edge_key]['edge_type'] == 1
                ]
            )
        else:
            train_edges.extend(
                [
                    (u, v, edge_key)
                    for v in message_passing_graph.successors(u)
                    for edge_key in message_passing_graph[u][v]
                    if message_passing_graph[u][v][edge_key]['edge_type'] == 1
                ]
            )
    # add default node types for message_passing_graph
    for node in message_passing_graph.nodes:
        message_passing_graph.nodes[node]['node_type'] = 0
    print('Num edges to predict: ', len(disjoint_edge_label_index))
    print('Num edges in val: ', len(val_edges))
    print('Num edges in train: ', len(train_edges))

    # Custom splits: hand the pre-computed edge buckets to HeteroGraph.
    graph = HeteroGraph(
        message_passing_graph,
        custom={
            "general_splits": [
                train_edges,
                val_edges
            ],
            "disjoint_split": disjoint_edge_label_index,
            "task": "link_pred"
        }
    )
    graphs = [graph]
    graphDataset = GraphDataset(
        graphs,
        task="link_pred",
        edge_train_mode="disjoint"
    )

    # Transform dataset
    # de direction (currently using homogeneous graph)
    num_edge_types = 2
    graphDataset = graphDataset.apply_transform(
        cmap_transform, num_edge_types=num_edge_types, deep_copy=False
    )
    print('Number of node features: ', graphDataset.num_node_features())

    # split dataset
    dataset = {}
    dataset['train'], dataset['val'] = graphDataset.split(transductive=True)

    # sanity check
    print(f"dataset['train'][0].edge_label_index.keys(): {dataset['train'][0].edge_label_index.keys()}")
    print(f"dataset['train'][0].edge_label_index[(0, 1, 0)].shape[1]: {dataset['train'][0].edge_label_index[(0, 1, 0)].shape[1]}")
    print(f"dataset['val'][0].edge_label_index.keys(): {dataset['val'][0].edge_label_index.keys()}")
    print(f"dataset['val'][0].edge_label_index[(0, 1, 0)].shape[1]: {dataset['val'][0].edge_label_index[(0, 1, 0)].shape[1]}")
    print(f"len(list(dataset['train'][0].G.edges)): {len(list(dataset['train'][0].G.edges))}")
    print(f"len(list(dataset['val'][0].G.edges)): {len(list(dataset['val'][0].G.edges))}")
    print(f"list(dataset['train'][0].G.edges)[:10]: {list(dataset['train'][0].G.edges)[:10]}")
    print(f"list(dataset['val'][0].G.edges)[:10]: {list(dataset['val'][0].G.edges)[:10]}")

    # node feature dimension
    input_dim = dataset['train'].num_node_features()
    edge_feat_dim = dataset['train'].num_edge_features()
    num_classes = dataset['train'].num_edge_labels()
    print(
        'Node feature dim: {}; edge feature dim: {}; num classes: {}.'.format(
            input_dim, edge_feat_dim, num_classes
        )
    )
    exit()  # NOTE(review): unconditional early exit — everything below is unreachable

    # relation type is both used for edge features and edge labels
    model = Net(input_dim, edge_feat_dim, num_classes, args).to(args.device)
    optimizer = torch.optim.Adam(
        model.parameters(), lr=0.001, weight_decay=5e-3
    )
    follow_batch = []  # e.g., follow_batch = ['edge_index']
    dataloaders = {
        split: DataLoader(
            ds, collate_fn=Batch.collate(follow_batch),
            batch_size=1, shuffle=(split == 'train')
        )
        for split, ds in dataset.items()
    }
    print('Graphs after split: ')
    for key, dataloader in dataloaders.items():
        for batch in dataloader:
            print(key, ': ', batch)
    train(model, dataloaders, optimizer, args, writer=writer)
def main():
    """Link prediction on the BioSNAP function-function network.

    Builds a graph from the MINER TSV files, splits it for link prediction,
    and trains with SGD + cosine annealing.
    (Removed: commented-out dead code — unused GraphDataset wrap, Adam
    optimizer, and a debug print.)
    """
    args = arg_parse()
    name = 'BioSNAP-FF'
    f = 'minerff.tsv'
    f2 = 'minerf.tsv'
    d = readFilePD(f, ['relation'])
    d2 = readFilePD(f2, ['namespace'])
    nxg = pdToNx3(d, d2, 'GO_id0', 'GO_id2', 'relation', 'GO_id1', 'namespace')
    dg = Graph(nxg)
    # the input that we assume users have
    graphs = [dg]

    edge_train_mode = args.mode
    print('edge train mode: {}'.format(edge_train_mode))
    if args.multigraph:
        # Replicate the single graph 10x to exercise the inductive path.
        graphs = [copy.deepcopy(graphs[0]) for _ in range(10)]

    dataset = GraphDataset(
        graphs,
        task='link_pred',
        edge_message_ratio=args.edge_message_ratio,
        edge_train_mode=edge_train_mode)
    print('Initial dataset: {}'.format(dataset))

    # split dataset
    datasets = {}
    datasets['train'], datasets['val'], datasets['test'] = dataset.split(
        transductive=not args.multigraph, split_ratio=[0.85, 0.05, 0.1])
    print('after split')
    print('Train message-passing graph: {} nodes; {} edges.'.format(
        datasets['train'][0].G.number_of_nodes(),
        datasets['train'][0].G.number_of_edges()))
    print('Val message-passing graph: {} nodes; {} edges.'.format(
        datasets['val'][0].G.number_of_nodes(),
        datasets['val'][0].G.number_of_edges()))
    print('Test message-passing graph: {} nodes; {} edges.'.format(
        datasets['test'][0].G.number_of_nodes(),
        datasets['test'][0].G.number_of_edges()))

    # node feature dimension
    # NOTE(review): hard-coded instead of datasets['train'].num_node_features
    # — confirm 47410 matches the actual data.
    input_dim = 47410
    # link prediction needs 2 classes (0, 1)
    num_classes = datasets['train'].num_edge_labels

    model = Net(input_dim, num_classes, args).to(args.device)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9,
                                weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=args.epochs)

    follow_batch = []  # e.g., follow_batch = ['edge_index']
    dataloaders = {
        split: DataLoader(
            ds, collate_fn=Batch.collate(follow_batch),
            batch_size=args.batch_size, shuffle=(split == 'train'))
        for split, ds in datasets.items()
    }
    print('Graphs after split: ')
    for key, dataloader in dataloaders.items():
        for batch in dataloader:
            print(key, ': ', batch)
    train(model, dataloaders, optimizer, args, scheduler=scheduler)
def main():
    """Disjoint-mode link prediction on the WordNet relation graph.

    Since both feature and label are relation types, only the disjoint
    edge-train mode would make sense.
    """
    args = arg_parse()
    edge_train_mode = args.mode
    print('edge train mode: {}'.format(edge_train_mode))
    WN_graph = nx.read_gpickle(args.data_path)
    print('Each node has node ID (n_id). Example: ', WN_graph.nodes[0])
    print(
        'Each edge has edge ID (id) and categorical label (e_label). Example: ',
        WN_graph[0][5871])

    dataset = GraphDataset(
        [WN_graph],
        task='link_pred',
        edge_train_mode=edge_train_mode,
        edge_message_ratio=args.edge_message_ratio,
        edge_negative_sampling_ratio=args.neg_sampling_ratio)

    # Find the number of edge types. Labels are consecutive (0-17), so the
    # count of distinct labels equals the number of edge types. A set gives
    # O(1) membership instead of the original O(n) list scan.
    labels = set()
    for u, v, edge_key in WN_graph.edges:
        labels.add(WN_graph[u][v][edge_key]['e_label'])
    num_edge_types = len(labels)

    print('Pre-transform: ', dataset[0])
    dataset = dataset.apply_transform(
        WN_transform, num_edge_types=num_edge_types, deep_copy=False)
    print('Post-transform: ', dataset[0])
    print('Initial data: {} nodes; {} edges.'.format(
        dataset[0].G.number_of_nodes(), dataset[0].G.number_of_edges()))
    print('Number of node features: {}'.format(dataset.num_node_features))

    # split dataset
    datasets = {}
    datasets['train'], datasets['val'], datasets['test'] = dataset.split(
        transductive=True, split_ratio=[0.8, 0.1, 0.1])
    print('After split:')
    print('Train message-passing graph: {} nodes; {} edges.'.format(
        datasets['train'][0].G.number_of_nodes(),
        datasets['train'][0].G.number_of_edges()))
    print('Val message-passing graph: {} nodes; {} edges.'.format(
        datasets['val'][0].G.number_of_nodes(),
        datasets['val'][0].G.number_of_edges()))
    print('Test message-passing graph: {} nodes; {} edges.'.format(
        datasets['test'][0].G.number_of_nodes(),
        datasets['test'][0].G.number_of_edges()))

    # node feature dimension
    input_dim = datasets['train'].num_node_features
    edge_feat_dim = datasets['train'].num_edge_features
    num_classes = datasets['train'].num_edge_labels
    print(
        'Node feature dim: {}; edge feature dim: {}; num classes: {}.'.format(
            input_dim, edge_feat_dim, num_classes))

    # relation type is both used for edge features and edge labels
    model = Net(input_dim, edge_feat_dim, num_classes, args).to(args.device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001,
                                 weight_decay=5e-3)
    follow_batch = []  # e.g., follow_batch = ['edge_index']
    dataloaders = {
        split: DataLoader(
            ds, collate_fn=Batch.collate(follow_batch),
            batch_size=1, shuffle=(split == 'train'))
        for split, ds in datasets.items()
    }
    print('Graphs after split: ')
    for key, dataloader in dataloaders.items():
        for batch in dataloader:
            print(key, ': ', batch)
    train(model, dataloaders, optimizer, args)
print("Use SnapX as the backend network library.") else: raise ValueError("{} network library is not supported.".format(args.netlib)) args.netlib = netlib graphs = GraphDataset.pyg_to_graphs(pyg_dataset, netlib=args.netlib) dataset = GraphDataset(graphs, task="graph") datasets = {} datasets['train'], datasets['val'], datasets['test'] = dataset.split( transductive=False, split_ratio = [0.8, 0.1, 0.1]) if args.transform_dataset is not None: trans_func = get_transform(args.transform_dataset) for _, dataset in datasets.items(): dataset.apply_transform(trans_func, radius=args.radius, netlib=args.netlib) dataloaders = { split: DataLoader( dataset, collate_fn=Batch.collate(), batch_size=args.batch_size, shuffle=True ) for split, dataset in datasets.items() } num_classes = datasets['train'].num_graph_labels num_node_features = datasets['train'].num_node_features train(dataloaders['train'], dataloaders['val'], dataloaders['test'], args, num_node_features, num_classes, args.device)