def __init__(self, args=None):
    """Load the pickled jknet Cora graph and build a random 80/20 node split.

    Reads a pre-processed pickle (node count, features, labels, edge list),
    assembles a Data object, and derives the class count from the labels.
    """
    dataset = "jknet_cora"
    path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset)
    if not osp.exists(path):
        os.makedirs(path)
    super(CoraDataset, self).__init__(path)
    with open(self.processed_paths[0], 'rb') as fin:
        load_data = pickle.load(fin)
    self.num_nodes = load_data['node_num']
    data = Data()
    data.x = load_data['xs']
    data.y = load_data['ys']
    # Random 80% of the nodes train; the remaining 20% serve as test.
    train_size = int(self.num_nodes * 0.8)
    train_mask = np.zeros((self.num_nodes, ), dtype=bool)
    train_idx = np.random.choice(np.arange(self.num_nodes), size=train_size, replace=False)
    train_mask[train_idx] = True
    test_mask = np.ones((self.num_nodes, ), dtype=bool)
    test_mask[train_idx] = False
    # NOTE(review): val_mask aliases test_mask (same ndarray, so the two torch
    # tensors below share storage) — validation equals test here; confirm intended.
    val_mask = test_mask
    edges = load_data['edges']
    # Edge list arrives as (num_edges, 2); transpose to the (2, num_edges) layout.
    edges = np.array(edges, dtype=int).transpose((1, 0))
    data.edge_index = torch.from_numpy(edges)
    data.train_mask = torch.from_numpy(train_mask)
    data.test_mask = torch.from_numpy(test_mask)
    data.val_mask = torch.from_numpy(val_mask)
    data.x = torch.Tensor(data.x)
    data.y = torch.LongTensor(data.y)
    self.data = data
    # Labels are assumed to be 0..C-1, so max + 1 is the class count.
    self.num_classes = torch.max(self.data.y).item() + 1
def split_dataset(cls, dataset, args):
    """Split *dataset* into (train, valid, test) DataLoaders.

    For ModelNet datasets the provided train/test splits are used and the
    test loader doubles as validation. Otherwise the dataset is shuffled in
    place and split by ``args.train_ratio`` / ``args.test_ratio``; the middle
    remainder becomes the validation set, falling back to the test loader
    when the two ratios already cover everything.
    """
    if "ModelNet" in args.dataset:
        # Point-cloud graphs: positions serve as node features.
        train_data = [Data(x=d.pos, y=d.y) for d in dataset["train"]]
        test_data = [Data(x=d.pos, y=d.y) for d in dataset["test"]]
        train_loader = DataLoader(train_data, batch_size=args.batch_size, num_workers=6)
        test_loader = DataLoader(test_data, batch_size=args.batch_size, num_workers=6, shuffle=False)
        return train_loader, test_loader, test_loader
    else:
        random.shuffle(dataset)  # NOTE: shuffles the caller's list in place
        train_size = int(len(dataset) * args.train_ratio)
        test_size = int(len(dataset) * args.test_ratio)
        bs = args.batch_size
        train_loader = DataLoader(dataset[:train_size], batch_size=bs)
        # Bug fix: `dataset[-test_size:]` returns the WHOLE list when
        # test_size == 0; slice from an explicit start index instead.
        test_loader = DataLoader(dataset[len(dataset) - test_size:], batch_size=bs)
        if args.train_ratio + args.test_ratio < 1:
            valid_loader = DataLoader(dataset[train_size:len(dataset) - test_size], batch_size=bs)
        else:
            valid_loader = test_loader
        return train_loader, valid_loader, test_loader
def read_gatne_data(folder):
    """Read GATNE-style edge files (train.txt, valid.txt, test.txt) from *folder*.

    Each line is ``edge_type head tail [label]``. Training edges are grouped
    by edge type; valid/test edges carry a 0/1 label and are grouped into
    two buckets per type (index 0 = positive, index 1 = negative, via
    ``1 - label``). Returns a Data object with ``train_data``, ``valid_data``
    and ``test_data`` dicts attached.
    """

    def _read_train(path):
        # edge_type -> list of [head, tail] pairs
        edges = {}
        with open(path, 'r') as f:
            for line in f:
                items = line.strip().split()
                edges.setdefault(items[0], []).append([int(items[1]), int(items[2])])
        return edges

    def _read_eval(path):
        # edge_type -> [positive_pairs, negative_pairs]; items[3] is the 0/1 label
        edges = {}
        with open(path, 'r') as f:
            for line in f:
                items = line.strip().split()
                edges.setdefault(items[0], [[], []])[1 - int(items[3])].append(
                    [int(items[1]), int(items[2])])
        return edges

    data = Data()
    data.train_data = _read_train(osp.join(folder, 'train.txt'))
    data.valid_data = _read_eval(osp.join(folder, 'valid.txt'))
    data.test_data = _read_eval(osp.join(folder, 'test.txt'))
    return data
def __init__(self, root, name1, name2):
    """Preprocess the two named graphs under *root* and store them as a pair."""
    first_edges, first_labels, self.node2id_1 = self._preprocess(root, name1)
    second_edges, second_labels, self.node2id_2 = self._preprocess(root, name2)
    self.data = [
        Data(x=None, edge_index=first_edges, y=first_labels),
        Data(x=None, edge_index=second_edges, y=second_labels),
    ]
    self.transform = None
def read_planetoid_data(folder, prefix):
    """Load a Planetoid-format citation dataset (cora/citeseer/pubmed).

    Reads the pickled parts (x, tx, allx, y, ty, ally, graph) plus the
    test-index file, stitches the full feature/label matrices together and
    returns a Data object with train/val/test masks following the standard
    Planetoid split convention.
    """
    prefix = prefix.lower()
    names = ["x", "tx", "allx", "y", "ty", "ally", "graph", "test.index"]
    objects = []
    # All parts except test.index are pickles; latin1 decodes py2-era pickles.
    for item in names[:-1]:
        with open(f"{folder}/ind.{prefix}.{item}", "rb") as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding="latin1"))
            else:
                objects.append(pkl.load(f))
    test_index = parse_index_file(f"{folder}/ind.{prefix}.{names[-1]}")
    test_index = torch.Tensor(test_index).long()
    test_index_reorder = test_index.sort()[0]
    x, tx, allx, y, ty, ally, graph = tuple(objects)
    x, tx, allx = tuple(
        [torch.from_numpy(item.todense()).float() for item in [x, tx, allx]])
    y, ty, ally = tuple(
        [torch.from_numpy(item).float() for item in [y, ty, ally]])
    # Planetoid convention: first |y| nodes train, the next 500 validate.
    train_index = torch.arange(y.size(0), dtype=torch.long)
    val_index = torch.arange(y.size(0), y.size(0) + 500, dtype=torch.long)
    if prefix.lower() == "citeseer":
        # There are some isolated nodes in the Citeseer graph, resulting in
        # none consecutive test indices. We need to identify them and add them
        # as zero vectors to `tx` and `ty`.
        len_test_indices = (test_index.max() - test_index.min()).item() + 1
        tx_ext = torch.zeros(len_test_indices, tx.size(1))
        tx_ext[test_index_reorder - test_index.min(), :] = tx
        ty_ext = torch.zeros(len_test_indices, ty.size(1))
        ty_ext[test_index_reorder - test_index.min(), :] = ty
        tx, ty = tx_ext, ty_ext
    x = torch.cat([allx, tx], dim=0).float()
    y = torch.cat([ally, ty], dim=0).max(dim=1)[1].long()
    # test.index permutes the test nodes; restore their original positions.
    x[test_index] = x[test_index_reorder]
    y[test_index] = y[test_index_reorder]
    train_mask = index_to_mask(train_index, size=y.size(0))
    val_mask = index_to_mask(val_index, size=y.size(0))
    test_mask = index_to_mask(test_index, size=y.size(0))
    edge_index = edge_index_from_dict(graph, num_nodes=y.size(0))
    data = Data(x=x, edge_index=edge_index, y=y)
    data.train_mask = train_mask
    data.val_mask = val_mask
    data.test_mask = test_mask
    return data
def preprocessing(self, data, gdc_type="ppr"):
    """Apply Graph Diffusion Convolution (GDC) preprocessing to *data*.

    Builds a dense adjacency matrix, diffuses it with PPR or heat-kernel
    filters (or leaves it untouched for "none"), sparsifies the result by
    top-k per node or an epsilon threshold, and returns a new Data object
    whose weighted edges come from the processed matrix.

    Raises ValueError for an unknown *gdc_type*, or when neither ``self.k``
    nor ``self.eps`` is configured for sparsification.
    """
    # generate adjacency matrix from sparse representation
    adj_matrix = self._get_adj_matrix(data.x, data.edge_index)

    if gdc_type == "none":
        print("No GDC filters chosen")
        processed_matrix = adj_matrix
    elif gdc_type == "ppr":
        print("PPR filters chosen")
        processed_matrix = self._get_ppr_matrix(adj_matrix, alpha=self.alpha)
    elif gdc_type == "heat":
        print("Heat filters chosen")
        processed_matrix = self._get_heat_matrix(adj_matrix, t=self.t)
    else:
        raise ValueError

    if gdc_type == "ppr" or gdc_type == "heat":
        # The diffusion matrix is dense; sparsify it before building edges.
        if self.k:
            print(f"Selecting top {self.k} edges per node.")
            processed_matrix = self._get_top_k_matrix(processed_matrix, k=self.k)
        elif self.eps:
            print(f"Selecting edges with weight greater than {self.eps}.")
            processed_matrix = self._get_clipped_matrix(processed_matrix, eps=self.eps)
        else:
            # Neither sparsification parameter configured — cannot proceed.
            raise ValueError

    # create PyG Data object: one weighted edge per positive matrix entry
    edges_i = []
    edges_j = []
    edge_attr = []
    for i, row in enumerate(processed_matrix):
        for j in np.where(row > 0)[0]:
            edges_i.append(i)
            edges_j.append(j)
            edge_attr.append(processed_matrix[i, j])
    edge_index = [edges_i, edges_j]

    data = Data(
        x=data.x,
        edge_index=torch.LongTensor(edge_index),
        edge_attr=torch.FloatTensor(edge_attr),
        y=data.y,
        train_mask=data.train_mask,
        test_mask=data.test_mask,
        val_mask=data.val_mask,
    )
    # Move every tensor field to the configured device.
    data.apply(lambda x: x.to(self.device))
    return data
def __init__(self, root, name):
    """Initialize the GCC dataset; *name* has the form "<graph1>_<graph2>"."""
    self.name = name
    super(GCCDataset, self).__init__(root)
    first, second = name.split("_")[0], name.split("_")[1]
    edges_a, labels_a, self.node2id_1 = self.preprocess(root, first)
    edges_b, labels_b, self.node2id_2 = self.preprocess(root, second)
    self.data = [
        Data(x=None, edge_index=edges_a, y=labels_a),
        Data(x=None, edge_index=edges_b, y=labels_b),
    ]
    self.transform = None
def read_gtn_data(self, folder):
    """Load a HAN-style heterogeneous dataset from ``data.mat`` in *folder*.

    Builds one (edge_index, edge_value) adjacency per meta-path, appends an
    identity adjacency, and stores node features plus per-split node/target
    tensors on ``self.data``.

    Raises ValueError when ``self.name`` is not one of the supported
    han-acm / han-dblp / han-imdb datasets (previously this surfaced as an
    opaque NameError).
    """
    # Renamed from `data` to avoid shadowing the Data() object built below.
    mat = sio.loadmat(osp.join(folder, 'data.mat'))
    if self.name == 'han-acm' or self.name == 'han-imdb':
        truelabels, truefeatures = mat['label'], mat['feature'].astype(float)
    elif self.name == 'han-dblp':
        truelabels, truefeatures = mat['label'], mat['features'].astype(float)
    else:
        raise ValueError(f"Unsupported dataset name: {self.name}")
    num_nodes = truefeatures.shape[0]
    # Meta-path adjacencies; subtract I to drop the raw self-loops.
    if self.name == 'han-acm':
        rownetworks = [mat['PAP'] - np.eye(num_nodes), mat['PLP'] - np.eye(num_nodes)]
    elif self.name == 'han-dblp':
        rownetworks = [mat['net_APA'] - np.eye(num_nodes), mat['net_APCPA'] - np.eye(num_nodes),
                       mat['net_APTPA'] - np.eye(num_nodes)]
    else:  # han-imdb (validated above)
        rownetworks = [mat['MAM'] - np.eye(num_nodes), mat['MDM'] - np.eye(num_nodes),
                       mat['MYM'] - np.eye(num_nodes)]

    y = truelabels
    train_idx = mat['train_idx']
    val_idx = mat['val_idx']
    test_idx = mat['test_idx']

    train_mask = sample_mask(train_idx, y.shape[0])
    val_mask = sample_mask(val_idx, y.shape[0])
    test_mask = sample_mask(test_idx, y.shape[0])

    # One-hot labels -> class indices for each split.
    y_train = np.argmax(y[train_mask, :], axis=1)
    y_val = np.argmax(y[val_mask, :], axis=1)
    y_test = np.argmax(y[test_mask, :], axis=1)

    data = Data()
    A = []
    for edge in rownetworks:
        edge_tmp = torch.from_numpy(np.vstack((edge.nonzero()[0], edge.nonzero()[1]))).type(torch.LongTensor)
        value_tmp = torch.ones(edge_tmp.shape[1]).type(torch.FloatTensor)
        A.append((edge_tmp, value_tmp))
    # Append an identity (self-loop) adjacency as the last relation.
    edge_tmp = torch.stack((torch.arange(0, num_nodes), torch.arange(0, num_nodes))).type(torch.LongTensor)
    value_tmp = torch.ones(num_nodes).type(torch.FloatTensor)
    A.append((edge_tmp, value_tmp))
    data.adj = A

    data.x = torch.from_numpy(truefeatures).type(torch.FloatTensor)
    data.train_node = torch.from_numpy(train_idx[0]).type(torch.LongTensor)
    data.train_target = torch.from_numpy(y_train).type(torch.LongTensor)
    data.valid_node = torch.from_numpy(val_idx[0]).type(torch.LongTensor)
    data.valid_target = torch.from_numpy(y_val).type(torch.LongTensor)
    data.test_node = torch.from_numpy(test_idx[0]).type(torch.LongTensor)
    data.test_target = torch.from_numpy(y_test).type(torch.LongTensor)
    self.data = data
def setup_class(self):
    """Load the Cora benchmark once and cache its basic statistics."""
    self.dataset = build_dataset_from_name("cora")
    self.data = Data.from_pyg_data(self.dataset[0])
    graph = self.data
    self.num_nodes = graph.num_nodes
    self.num_edges = graph.num_edges
    self.num_features = graph.num_features
    print("Call Setup")
def __init__(self, args):
    """Set up the graph-classification task: data, model, loaders, optimizer.

    Builds the dataset, moves every graph to the GPU, optionally replaces
    node features with degree features, then constructs the model, the
    train/val/test loaders, an Adam optimizer and a StepLR scheduler.
    Requires CUDA (graphs and model are moved with .cuda()).
    """
    super(GraphClassification, self).__init__(args)
    dataset = build_dataset(args)
    # Copy each graph into a fresh Data object and move all tensors to GPU.
    self.data = [
        Data(x=data.x, y=data.y, edge_index=data.edge_index, edge_attr=data.edge_attr,
             pos=data.pos).apply(lambda x: x.cuda())
        for data in dataset
    ]
    args.num_features = dataset.num_features
    args.num_classes = dataset.num_classes
    args.use_unsup = False

    if args.degree_feature:
        # Replace node features with degree features; feature dim changes.
        self.data = node_degree_as_feature(self.data)
        args.num_features = self.data[0].num_features

    model = build_model(args)
    self.model = model.cuda()
    self.patience = args.patience
    self.max_epoch = args.max_epoch
    # The model class decides how to split the dataset into loaders.
    self.train_loader, self.val_loader, self.test_loader = self.model.split_dataset(dataset, args)
    self.optimizer = torch.optim.Adam(
        self.model.parameters(), lr=args.lr, weight_decay=args.weight_decay
    )
    self.scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer=self.optimizer, step_size=50, gamma=0.5
    )
def process(self):
    """Parse raw edge-list and community files into a multi-label Data object.

    Reads tab-separated edges from the first raw file, symmetrizes them,
    drops self-loops, reads community memberships from the second file,
    encodes them as a multi-hot (num_nodes, num_classes) label matrix, and
    saves the resulting Data to ``self.processed_paths[0]``.
    """
    filenames = self.raw_paths
    with open(f"{filenames[0]}", "r") as f:
        edge_index = f.read().strip().split("\n")
    edge_index = [[int(i) for i in x.split("\t")] for x in edge_index]
    edge_index = np.array(edge_index, dtype=np.int64).transpose()
    edge_index = torch.from_numpy(edge_index)
    # Symmetrize: append the reversed copy of every edge.
    rev_edge_index = torch.stack([edge_index[1], edge_index[0]])
    edge_index = torch.cat((edge_index, rev_edge_index), dim=1)
    # Drop self-loops.
    self_loop_mask = edge_index[0] != edge_index[1]
    edge_index = edge_index[:, self_loop_mask]

    with open(f"{filenames[1]}", "r") as f:
        cmty = f.read().strip().split("\n")
    cmty = [[int(i) for i in x.split("\t")] for x in cmty]

    num_classes = len(cmty)
    num_nodes = torch.max(edge_index).item() + 1
    # Bug fix: `np.float` was removed in NumPy 1.20+; builtin float == float64.
    labels = np.zeros((num_nodes, num_classes), dtype=float)
    for i, cls in enumerate(cmty):
        labels[cls, i] = 1.0
    labels = torch.from_numpy(labels)

    data = Data(x=None, y=labels, edge_index=edge_index)
    torch.save(data, self.processed_paths[0])
def __init__(self, data_type="unsupervised", root="testchem", transform=None, pre_transform=None, pre_filter=None):
    """Build a synthetic chemistry-style test dataset of 100 ring graphs.

    Each graph is a 10-node directed cycle; node and edge attributes are
    random integers. In "supervised" mode each graph additionally gets a
    10-task target vector with one random positive task.
    """
    super(TestChemDataset, self).__init__(root, transform, pre_transform, pre_filter)
    num_nodes = 10
    num_edges = 10
    num_graphs = 100

    def cycle_index(num, shift):
        # Ring successor indices: [shift, ..., num-1, 0, ..., shift-1].
        arr = torch.arange(num) + shift
        arr[-shift:] = torch.arange(shift)
        return arr

    # Every graph shares the same cycle topology i -> (i+1) mod num_nodes.
    upp = torch.cat([torch.arange(0, num_nodes)] * num_graphs)
    dwn = torch.cat([cycle_index(num_nodes, 1)] * num_graphs)
    edge_index = torch.stack([upp, dwn])
    edge_attr = torch.zeros(num_edges * num_graphs, 2)
    x = torch.zeros(num_graphs * num_nodes, 2)
    for idx, val in enumerate(
            torch.randint(0, 6, size=(num_edges * num_graphs, ))):
        edge_attr[idx][0] = val
    for idx, val in enumerate(
            torch.randint(0, 3, size=(num_edges * num_graphs, ))):
        edge_attr[idx][1] = val
    # NOTE(review): the two loops below fill `x` but draw num_edges * num_graphs
    # values — this matches x's row count only because num_nodes == num_edges.
    for idx, val in enumerate(
            torch.randint(0, 120, size=(num_edges * num_graphs, ))):
        x[idx][0] = val
    for idx, val in enumerate(
            torch.randint(0, 3, size=(num_edges * num_graphs, ))):
        x[idx][1] = val
    self.data = Data(
        x=x.to(torch.long),
        edge_index=edge_index.to(torch.long),
        edge_attr=edge_attr.to(torch.long),
    )
    # Slice offsets mark per-graph boundaries in the concatenated tensors.
    self.slices = {
        "x": torch.arange(0, (num_graphs + 1) * num_nodes, num_nodes),
        "edge_index": torch.arange(0, (num_graphs + 1) * num_edges, num_edges),
        "edge_attr": torch.arange(0, (num_graphs + 1) * num_edges, num_edges),
    }

    if data_type == "supervised":
        # One random positive task per graph; every other task stays -1.
        pretrain_tasks = 10
        go_target_pretrain = torch.zeros(pretrain_tasks * num_graphs) - 1
        for i in range(num_graphs):
            val = np.random.randint(0, pretrain_tasks)
            go_target_pretrain[i * pretrain_tasks + val] = 1
        self.data.y = go_target_pretrain
        self.slices["y"] = torch.arange(0, (num_graphs + 1) * pretrain_tasks, pretrain_tasks)
def process(self):
    """Build one processed file per split (train/valid/test).

    Each split consists of a node-link JSON graph, feature/label .npy
    arrays and a graph-id array assigning every node to a subgraph; each
    subgraph becomes one Data object, filtered/transformed as configured.
    """
    for s, split in enumerate(['train', 'valid', 'test']):
        path = osp.join(self.raw_dir, '{}_graph.json').format(split)
        with open(path, 'r') as f:
            G = nx.DiGraph(json_graph.node_link_graph(json.load(f)))

        x = np.load(osp.join(self.raw_dir, '{}_feats.npy').format(split))
        x = torch.from_numpy(x).to(torch.float)

        y = np.load(osp.join(self.raw_dir, '{}_labels.npy').format(split))
        y = torch.from_numpy(y).to(torch.float)

        data_list = []
        path = osp.join(self.raw_dir, '{}_graph_id.npy').format(split)
        idx = torch.from_numpy(np.load(path)).to(torch.long)
        idx = idx - idx.min()  # re-base graph ids to start at 0

        for i in range(idx.max().item() + 1):
            mask = idx == i  # nodes belonging to subgraph i
            G_s = G.subgraph(mask.nonzero().view(-1).tolist())
            edge_index = torch.tensor(list(G_s.edges)).t().contiguous()
            # Relabel nodes within the subgraph to a contiguous 0-based range.
            edge_index = edge_index - edge_index.min()
            edge_index, _ = remove_self_loops(edge_index)

            data = Data(edge_index=edge_index, x=x[mask], y=y[mask])

            if self.pre_filter is not None and not self.pre_filter(data):
                continue
            if self.pre_transform is not None:
                data = self.pre_transform(data)
            data_list.append(data)

        torch.save(self.collate(data_list), self.processed_paths[s])
def __init__(self):
    """A hand-built 12-node, 4-class toy graph with fixed split masks."""
    super(TestSmallDataset, self).__init__("test")
    x = torch.FloatTensor([[-2, -1], [-2, 1], [-1, 0], [0, 0], [0, 1], [1, 0],
                           [2, 1], [3, 0], [2, -1], [4, 0], [4, 1], [5, 0]])
    edge_index = torch.LongTensor([
        [
            0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 7, 7, 7, 7, 8,
            8, 9, 9, 9, 10, 10, 11, 11
        ],
        [
            1, 2, 0, 2, 0, 1, 3, 2, 4, 5, 3, 3, 6, 7, 8, 5, 7, 5, 6, 8, 9, 5,
            7, 7, 10, 11, 9, 11, 9, 10
        ],
    ])
    y = torch.LongTensor([0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3])
    self.data = Data(x, edge_index, None, y, None)

    def _pick(indices):
        # Boolean mask over the 12 nodes with the given indices set.
        flags = torch.zeros(12, dtype=torch.bool)
        flags[list(indices)] = True
        return flags

    self.data.train_mask = _pick([0, 3, 5, 9])
    self.data.val_mask = _pick([1, 6, 11])
    self.data.test_mask = _pick([2, 4, 7, 8, 10])
    self.transform = None
def __init__(self, root, name):
    """Load an OGB node-property-prediction dataset as a single Data object.

    Coalesces the edges, removes self-loops, symmetrizes the graph by
    appending reversed edges, and converts OGB's official index split into
    boolean train/val/test masks.
    """
    dataset = NodePropPredDataset(name, root)
    graph, y = dataset[0]
    x = torch.tensor(graph["node_feat"])
    y = torch.tensor(y.squeeze())

    # Sort/deduplicate the raw edge list (and its features, if present).
    row, col, edge_attr = coalesce(graph["edge_index"][0], graph["edge_index"][1], graph["edge_feat"])
    edge_index = torch.stack([row, col], dim=0)
    edge_index, edge_attr = remove_self_loops(edge_index, edge_attr)
    # Symmetrize: append reversed edges (and duplicate their attributes).
    row = torch.cat([edge_index[0], edge_index[1]])
    col = torch.cat([edge_index[1], edge_index[0]])
    edge_index = torch.stack([row, col], dim=0)
    if edge_attr is not None:
        edge_attr = torch.cat([edge_attr, edge_attr], dim=0)

    self.data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
    self.data.num_nodes = graph["num_nodes"]
    assert self.data.num_nodes == self.data.x.shape[0]

    # Convert OGB's index-based split into boolean node masks.
    split_index = dataset.get_idx_split()
    self.data.train_mask = torch.zeros(self.data.num_nodes, dtype=torch.bool)
    self.data.test_mask = torch.zeros(self.data.num_nodes, dtype=torch.bool)
    self.data.val_mask = torch.zeros(self.data.num_nodes, dtype=torch.bool)
    self.data.train_mask[split_index["train"]] = True
    self.data.test_mask[split_index["test"]] = True
    self.data.val_mask[split_index["valid"]] = True
    self.transform = None
def read_edgelist_label_data(folder, prefix):
    """Read an edge-list graph plus a multi-label community file.

    Expects ``<prefix>.ungraph`` (tab-separated node pairs, one edge per
    line) and ``<prefix>.cmty`` (one community per line, listing member
    nodes). Returns a Data object with the edge index and a multi-hot
    (num_nodes, num_communities) label matrix.
    """
    graph_path = osp.join(folder, '{}.ungraph'.format(prefix))
    cmty_path = osp.join(folder, '{}.cmty'.format(prefix))

    G = nx.read_edgelist(graph_path, nodetype=int, create_using=nx.Graph())
    num_node = G.number_of_nodes()
    # Bug fix: this is the node count — the old message said "edge number".
    print('node number: ', num_node)
    with open(graph_path) as f:
        context = f.readlines()
        print('edge number: ', len(context))

    edge_index = np.zeros((2, len(context)))
    for i, line in enumerate(context):
        edge_index[:, i] = list(map(int, line.strip().split('\t')))
    # NOTE(review): torch.int (int32) is unusual for an edge index; most
    # consumers expect int64 — confirm before changing the dtype.
    edge_index = torch.from_numpy(edge_index).to(torch.int)

    with open(cmty_path) as f:
        context = f.readlines()
        print('class number: ', len(context))
        label = np.zeros((num_node, len(context)))
        for i, line in enumerate(context):
            line = map(int, line.strip().split('\t'))
            for node in line:
                label[node, i] = 1

    y = torch.from_numpy(label).to(torch.float)
    data = Data(x=None, edge_index=edge_index, y=y)
    return data
def __init__(self, root, name):
    """Load the graph named *name* from its edgelist and nodelabel files."""
    self.name = name
    edge_index, y, self.node2id = self._preprocess(
        os.path.join(root, name + ".edgelist"),
        os.path.join(root, name + ".nodelabel"),
    )
    self.data = Data(x=None, edge_index=edge_index, y=y)
    self.transform = None
def process(self):
    """Generate a random toy graph with a 30/40/30 train/val/test node split,
    save it to ``mydata.pt`` and return the Data object."""
    num_nodes = 100
    num_edges = 300
    feat_dim = 30

    # Randomly generated graph, features and binary labels.
    edge_index = torch.randint(0, num_nodes, (2, num_edges))
    x = torch.randn(num_nodes, feat_dim)
    y = torch.randint(0, 2, (num_nodes, ))

    def _range_mask(lo, hi):
        # Mask selecting the contiguous node range [lo, hi).
        mask = torch.zeros(num_nodes).bool()
        mask[lo:hi] = True
        return mask

    # Node-classification split masks over contiguous index ranges.
    first_cut = int(0.3 * num_nodes)
    second_cut = int(0.7 * num_nodes)
    train_mask = _range_mask(0, first_cut)
    val_mask = _range_mask(first_cut, second_cut)
    test_mask = _range_mask(second_cut, num_nodes)

    data = Data(x=x, edge_index=edge_index, y=y,
                train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)
    torch.save(data, "mydata.pt")
    return data
def get(self, idx):
    """Return the *idx*-th graph by slicing the collated per-key storage.

    For every key, the collated tensor is sliced only along its
    concatenation dimension (all other dimensions taken in full) using the
    stored slice boundaries.
    """
    data = Data()
    for key in self.data.keys:
        item, slices = self.data[key], self.slices[key]
        # Full slice in every dimension, then narrow the concat dimension
        # to [slices[idx], slices[idx + 1]).
        s = list(repeat(slice(None), item.dim()))
        s[self.data.cat_dim(key, item)] = slice(slices[idx], slices[idx + 1])
        data[key] = item[s]
    return data
def process(self):
    """Parse the raw edge file, split endpoints from labels, and save a Data
    object to the processed path."""
    raw = self.read_txt_label(
        osp.join(self.raw_dir, '{}.txt'.format(self.name)), dtype=torch.int)
    # The last row carries the edge labels; the rows above are the endpoints.
    data = Data(edge_index=raw[:-1, :], edge_attr=raw[-1:, :], x=None, y=None)
    if self.pre_transform is not None:
        data = self.pre_transform(data)
    torch.save(data, self.processed_paths[0])
def read_triplet_data(folder):
    """Read OpenKE-style triplet files (``head tail relation`` per line).

    Concatenates train/valid/test triples into one edge list and returns a
    Data object with ``edge_index`` (2, E), ``edge_attr`` (E,) relation ids,
    and boolean train/val/test masks over the concatenated edges.
    """
    filenames = ["train2id.txt", "valid2id.txt", "test2id.txt"]
    count = 0
    edge_index = []
    edge_attr = []
    count_list = []
    for filename in filenames:
        with open(osp.join(folder, filename), "r") as f:
            # The first line holds the declared triple count; skip it — we
            # count the triples ourselves while reading (was an unused local).
            f.readline()
            for line in f:
                items = line.strip().split()
                edge_index.append([int(items[0]), int(items[1])])
                edge_attr.append(int(items[2]))
                count += 1
        count_list.append(count)

    edge_index = torch.LongTensor(edge_index).t()
    edge_attr = torch.LongTensor(edge_attr)
    data = Data()
    data.edge_index = edge_index
    data.edge_attr = edge_attr

    def generate_mask(start, end):
        # zeros(...) replaces the uninitialized BoolTensor + manual clearing.
        mask = torch.zeros(count, dtype=torch.bool)
        mask[start:end] = True
        return mask

    data.train_mask = generate_mask(0, count_list[0])
    data.val_mask = generate_mask(count_list[0], count_list[1])
    data.test_mask = generate_mask(count_list[1], count_list[2])
    return data
def process(self):
    """Convert the raw Reddit npz files into a collated Data object and save it."""
    data = np.load(osp.join(self.raw_dir, "reddit_data.npz"))
    x = torch.from_numpy(data["feature"]).to(torch.float)
    y = torch.from_numpy(data["label"]).to(torch.long)
    # node_types encodes the split: 1 = train, 2 = val, 3 = test.
    split = torch.from_numpy(data["node_types"])

    adj = sp.load_npz(osp.join(self.raw_dir, "reddit_graph.npz"))
    row = torch.from_numpy(adj.row).to(torch.long)
    col = torch.from_numpy(adj.col).to(torch.long)
    edge_index = torch.stack([row, col], dim=0)
    # Sort and deduplicate the edge list.
    edge_index, _ = coalesce(edge_index, None, x.size(0), x.size(0))

    data = Data(x=x, edge_index=edge_index, y=y)
    data.train_mask = split == 1
    data.val_mask = split == 2
    data.test_mask = split == 3
    torch.save(self.collate([data]), self.processed_paths[0])
def top_k(self, x: torch.Tensor, edge_index: torch.Tensor,
          scores: torch.Tensor) -> Tuple[Data, torch.Tensor]:
    """Pool the graph down to its highest-scoring nodes.

    Keeps the top ``pooling_rate`` fraction of nodes (at least 2),
    optionally augmenting connectivity with the squared adjacency first,
    and returns the induced subgraph plus the kept node indices.
    """
    org_n_nodes = x.shape[0]
    num = int(self.pooling_rate * x.shape[0])
    # Keep at least 2 nodes so the pooled graph can still carry edges.
    values, indices = torch.topk(scores, max(2, num))

    if self.aug_adj:
        # Augment connectivity with 2-hop edges (A @ A) before pooling.
        edge_attr = torch.ones(edge_index.shape[1]).to(x.device)
        edge_index, _ = spspmm(edge_index, edge_attr, edge_index, edge_attr,
                               org_n_nodes, org_n_nodes, org_n_nodes)

    batch = Data(x=x, edge_index=edge_index)
    new_batch = batch.subgraph(indices)
    num_nodes = new_batch.x.shape[0]
    # Row-normalized edge weights for the pooled subgraph.
    new_batch.edge_attr = row_normalization(num_nodes, new_batch.edge_index)
    return new_batch, indices
def get_subgraph(self, phase, require_norm=True):
    """
    Generate one minibatch for the model.

    In the 'train' phase, one minibatch corresponds to one sampled subgraph
    of the training graph. In the 'val' or 'test' phase, one batch
    corresponds to the full graph (i.e., full-batch rather than minibatch
    evaluation for validation / test sets).

    Inputs:
        phase           str, can be 'train', 'val', 'test'
        require_norm    boolean, whether to attach aggregation / loss
                        normalization coefficients

    Outputs:
        data            Data object, modeling the sampled subgraph
        data.norm_aggr  aggregation normalization
        data.norm_loss  loss normalization
    """
    if phase in ['val', 'test']:
        # Full-batch evaluation: use the whole stored graph.
        node_subgraph = np.arange(self.data.num_nodes)
        data = self.data
        if require_norm:
            data.norm_aggr = torch.ones(self.num_edges)
            data.norm_loss = self.norm_loss_test
    else:
        # Refill the pre-sampled subgraph pool when it runs dry.
        if len(self.subgraphs_nodes) == 0:
            self.gen_subgraph()

        node_subgraph = self.subgraphs_nodes.pop()
        edge_subgraph = self.subgraphs_edge_index.pop()
        num_nodes_subgraph = node_subgraph.size
        # Rebuild the subgraph adjacency from stored CSR components.
        adj = sp.csr_matrix(
            (self.subgraphs_data.pop(), self.subgraphs_indices.pop(),
             self.subgraphs_indptr.pop()),
            shape=(num_nodes_subgraph, num_nodes_subgraph))

        if require_norm:
            # Replace edge values with the precomputed aggregation norms.
            adj.data[:] = self.norm_aggr_train[edge_subgraph][:]
            # normalization: row-normalize by inverse degree (D^-1 A)
            D = adj.sum(1).flatten()
            norm_diag = sp.dia_matrix((1 / D, 0), shape=adj.shape)
            adj = norm_diag.dot(adj)
            adj.sort_indices()

        adj = adj.tocoo()
        data = Data(
            self.data.x[node_subgraph],
            torch.LongTensor(np.vstack((adj.row, adj.col))),
            None if self.data.edge_attr is None else self.data.edge_attr[edge_subgraph],
            self.data.y[node_subgraph],
            None if self.data.pos is None else self.data.pos[node_subgraph])

        if require_norm:
            data.norm_aggr = torch.FloatTensor(adj.data)
            data.norm_loss = self.norm_loss_train[node_subgraph]
        # Restrict the split masks to the sampled nodes.
        data.train_mask = self.data.train_mask[node_subgraph]
        data.val_mask = self.data.val_mask[node_subgraph]
        data.test_mask = self.data.test_mask[node_subgraph]

    return data
def __init__(self, args=None):
    """A hand-built 9-node, 3-class toy graph with fixed split masks."""
    x = torch.FloatTensor([[-2, -1], [-2, 1], [-1, 0], [0, 0], [0, 1],
                           [1, 0], [2, 1], [3, 0], [2, -1]])
    edge_index = torch.LongTensor([
        [0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 7, 7, 7, 8, 8],
        [1, 2, 0, 2, 0, 1, 3, 2, 4, 5, 3, 3, 6, 7, 8, 5, 7, 5, 6, 8, 5, 7],
    ])
    y = torch.LongTensor([0, 0, 0, 1, 1, 2, 2, 2, 2])
    self.data = Data(x, edge_index, None, y, None)

    def _mask_of(active):
        # Boolean mask over the 9 nodes with the given indices set.
        flags = torch.zeros(9, dtype=torch.bool)
        flags[list(active)] = True
        return flags

    self.data.train_mask = _mask_of([0, 3, 5])
    self.data.val_mask = _mask_of([1, 6])
    self.data.test_mask = _mask_of([2, 4, 7, 8])
    self.num_classes = 3
    self.transform = None
def read_saint_data(folder):
    """Read a GraphSAINT-format dataset from *folder*.

    Expects adj_full.npz / adj_train.npz (scipy sparse), class_map.json
    (node id -> label or multi-hot list), feats.npy and role.json
    (tr/va/te index lists). Returns a Data object carrying both the full
    and the train-only edge structures plus split masks.
    """
    names = [
        "adj_full.npz", "adj_train.npz", "class_map.json", "feats.npy",
        "role.json"
    ]
    names = [osp.join(folder, name) for name in names]
    adj_full = sp.load_npz(names[0])
    adj_train = sp.load_npz(names[1])
    # Bug fix: close the JSON files (json.load(open(...)) leaked handles).
    with open(names[2]) as f:
        class_map = json.load(f)
    feats = np.load(names[3])
    with open(names[4]) as f:
        role = json.load(f)

    train_mask = index_to_mask(role["tr"], size=feats.shape[0])
    val_mask = index_to_mask(role["va"], size=feats.shape[0])
    test_mask = index_to_mask(role["te"], size=feats.shape[0])

    feats = torch.from_numpy(feats).float()
    item = class_map["0"]
    if isinstance(item, list):
        # Multi-label: each node maps to a multi-hot vector.
        labels = np.zeros((feats.shape[0], len(item)), dtype=float)
        for key, val in class_map.items():
            labels[int(key)] = np.array(val)
    else:
        # Bug fix: `np.long` was removed in NumPy 1.20+/1.24; use np.int64.
        labels = np.zeros(feats.shape[0], dtype=np.int64)
        for key, val in class_map.items():
            labels[int(key)] = val
    labels = torch.from_numpy(labels)

    def get_adj(adj):
        # Convert a scipy sparse matrix to (edge_index, edge_attr) tensors.
        row, col = adj.nonzero()
        data = adj.data
        row = torch.tensor(row, dtype=torch.long)
        col = torch.tensor(col, dtype=torch.long)
        edge_index = torch.stack([row, col], dim=0)
        edge_attr = torch.tensor(data, dtype=torch.float)
        return edge_index, edge_attr

    edge_index_full, edge_attr_full = get_adj(adj_full)
    edge_index_train, edge_attr_train = get_adj(adj_train)

    data = Data(
        x=feats,
        y=labels,
        edge_index=edge_index_full,
        edge_attr=edge_attr_full,
        edge_index_train=edge_index_train,
        edge_attr_train=edge_attr_train,
        train_mask=train_mask,
        val_mask=val_mask,
        test_mask=test_mask,
    )
    return data
def read_gtn_data(self, folder):
    """Load a GTN heterogeneous dataset from pickled edges/labels/features.

    Builds one (edge_index, edge_value) adjacency per edge type plus a
    trailing identity adjacency, and stores node features and per-split
    node/target tensors on ``self.data``.
    """
    edges = pickle.load(open(osp.join(folder, 'edges.pkl'), 'rb'))
    labels = pickle.load(open(osp.join(folder, 'labels.pkl'), 'rb'))
    node_features = pickle.load(
        open(osp.join(folder, 'node_features.pkl'), 'rb'))

    data = Data()
    data.x = torch.from_numpy(node_features).type(torch.FloatTensor)

    num_nodes = edges[0].shape[0]

    # One sparse adjacency (indices, unit values) per edge type...
    A = []
    for i, edge in enumerate(edges):
        edge_tmp = torch.from_numpy(
            np.vstack((edge.nonzero()[0], edge.nonzero()[1]))).type(torch.LongTensor)
        value_tmp = torch.ones(edge_tmp.shape[1]).type(torch.FloatTensor)
        A.append((edge_tmp, value_tmp))
    # ...plus an identity (self-loop) adjacency as the last relation.
    edge_tmp = torch.stack(
        (torch.arange(0, num_nodes), torch.arange(0, num_nodes))).type(torch.LongTensor)
    value_tmp = torch.ones(num_nodes).type(torch.FloatTensor)
    A.append((edge_tmp, value_tmp))
    data.adj = A

    # labels[k] holds (node, class) rows: k = 0 train, 1 valid, 2 test.
    data.train_node = torch.from_numpy(np.array(labels[0])[:, 0]).type(
        torch.LongTensor)
    data.train_target = torch.from_numpy(np.array(labels[0])[:, 1]).type(
        torch.LongTensor)
    data.valid_node = torch.from_numpy(np.array(labels[1])[:, 0]).type(
        torch.LongTensor)
    data.valid_target = torch.from_numpy(np.array(labels[1])[:, 1]).type(
        torch.LongTensor)
    data.test_node = torch.from_numpy(np.array(labels[2])[:, 0]).type(
        torch.LongTensor)
    data.test_target = torch.from_numpy(np.array(labels[2])[:, 1]).type(
        torch.LongTensor)
    self.data = data
def read_planetoid_data(folder, prefix):
    """Load a Planetoid-format citation dataset via ``read_file`` helpers.

    Stitches the partial feature/label matrices together, handles
    Citeseer's isolated test nodes, and returns a Data object with
    standard Planetoid train/val/test masks.
    """
    names = ['x', 'tx', 'allx', 'y', 'ty', 'ally', 'graph', 'test.index']
    items = [read_file(folder, prefix, name) for name in names]
    x, tx, allx, y, ty, ally, graph, test_index = items
    # Planetoid convention: first |y| nodes train, the next 500 validate.
    train_index = torch.arange(y.size(0), dtype=torch.long)
    val_index = torch.arange(y.size(0), y.size(0) + 500, dtype=torch.long)
    sorted_test_index = test_index.sort()[0]

    if prefix.lower() == 'citeseer':
        # There are some isolated nodes in the Citeseer graph, resulting in
        # none consecutive test indices. We need to identify them and add them
        # as zero vectors to `tx` and `ty`.
        len_test_indices = (test_index.max() - test_index.min()).item() + 1

        tx_ext = torch.zeros(len_test_indices, tx.size(1))
        tx_ext[sorted_test_index - test_index.min(), :] = tx
        ty_ext = torch.zeros(len_test_indices, ty.size(1))
        ty_ext[sorted_test_index - test_index.min(), :] = ty

        tx, ty = tx_ext, ty_ext

    x = torch.cat([allx, tx], dim=0)
    y = torch.cat([ally, ty], dim=0).max(dim=1)[1]

    # test.index permutes the test nodes; restore their original positions.
    x[test_index] = x[sorted_test_index]
    y[test_index] = y[sorted_test_index]

    train_mask = sample_mask(train_index, num_nodes=y.size(0))
    val_mask = sample_mask(val_index, num_nodes=y.size(0))
    test_mask = sample_mask(test_index, num_nodes=y.size(0))

    edge_index = edge_index_from_dict(graph, num_nodes=y.size(0))

    data = Data(x=x, edge_index=edge_index, y=y)
    data.train_mask = train_mask
    data.val_mask = val_mask
    data.test_mask = test_mask

    return data
def read_triplet_data(folder):
    """Read OpenKE-style triplet splits and collect entity/relation vocab.

    Each file line is ``head tail relation``; the first line is a count
    and is skipped. Returns (data, triples, train_start_idx,
    valid_start_idx, test_start_idx, num_entities, num_relations), where
    *triples* are (head, relation, tail) tuples over all three splits and
    the start indices mark where each split begins in *triples*.
    """
    filenames = ["train2id.txt", "valid2id.txt", "test2id.txt"]
    count = 0
    edge_index = []
    edge_attr = []
    count_list = []
    triples = []
    num_entities = 0
    num_relations = 0
    entity_dic = {}
    relation_dic = {}
    for filename in filenames:
        with open(osp.join(folder, filename), "r") as f:
            _ = int(f.readline().strip())  # declared count; we count ourselves
            # Record where this split starts in the concatenated triple list.
            if "train" in filename:
                train_start_idx = len(triples)
            elif "valid" in filename:
                valid_start_idx = len(triples)
            elif "test" in filename:
                test_start_idx = len(triples)
            for line in f:
                items = line.strip().split()
                edge_index.append([int(items[0]), int(items[1])])
                edge_attr.append(int(items[2]))
                triples.append((int(items[0]), int(items[2]), int(items[1])))
                # Assign dense ids to entities/relations on first sight.
                if items[0] not in entity_dic:
                    entity_dic[items[0]] = num_entities
                    num_entities += 1
                if items[1] not in entity_dic:
                    entity_dic[items[1]] = num_entities
                    num_entities += 1
                if items[2] not in relation_dic:
                    relation_dic[items[2]] = num_relations
                    num_relations += 1
                count += 1
        count_list.append(count)

    edge_index = torch.LongTensor(edge_index).t()
    edge_attr = torch.LongTensor(edge_attr)
    data = Data()
    data.edge_index = edge_index
    data.edge_attr = edge_attr

    def generate_mask(start, end):
        # Boolean mask over all edges with [start, end) set to True.
        mask = torch.BoolTensor(count)
        mask[:] = False
        mask[start:end] = True
        return mask

    data.train_mask = generate_mask(0, count_list[0])
    data.val_mask = generate_mask(count_list[0], count_list[1])
    data.test_mask = generate_mask(count_list[1], count_list[2])
    return data, triples, train_start_idx, valid_start_idx, test_start_idx, num_entities, num_relations
def process(self):
    """Convert a .mat file (``network`` adjacency + ``group`` labels) into a
    Data object and save it to the processed path."""
    path = osp.join(self.raw_dir, "{}.mat".format(self.name))
    smat = scipy.io.loadmat(path)
    adj_matrix, group = smat["network"], smat["group"]

    y = torch.from_numpy(group.todense()).to(torch.float)
    row_ind, col_ind = adj_matrix.nonzero()
    edge_index = torch.stack([torch.tensor(row_ind), torch.tensor(col_ind)], dim=0)
    # NOTE(review): fancy-indexing a scipy sparse matrix returns a 1xN
    # np.matrix, so edge_attr likely has shape (1, nnz) rather than (nnz,)
    # — confirm downstream consumers expect that.
    edge_attr = torch.tensor(adj_matrix[row_ind, col_ind])
    data = Data(edge_index=edge_index, edge_attr=edge_attr, x=None, y=y)
    torch.save(data, self.processed_paths[0])