def gat_layer(self, input, adj, genPath=False, eluF=True):
    N = input.size()[0]
    edge = adj._indices()

    h = torch.mm(input, self.W)
    h = h + self.bias  # h: N x out

    # Self-attention on the nodes: a shared attention mechanism over edges.
    edge_h = torch.cat((h[edge[0, :], :], h[edge[1, :], :]), dim=1).t()  # edge_h: 2*D x E
    edge_att = self.a.mm(edge_h).squeeze()
    edge_e_a = self.leakyrelu(edge_att)  # edge_e_a: E, attention score for each edge

    if genPath:
        # When genPath is a directory path, save this layer's normalized
        # attention matrix to disk as a scipy sparse matrix.
        with torch.no_grad():
            edge_weight = edge_e_a
            p_a_e = edge_weight - scatter_max(edge_weight, edge[0, :], dim=0, dim_size=N)[0][edge[0, :]]
            p_a_e = p_a_e.exp()
            p_a_e = p_a_e / (scatter_add(p_a_e, edge[0, :], dim=0, dim_size=N)[edge[0, :]]
                             + torch.Tensor([9e-15]).cuda())
            scisp = convert.to_scipy_sparse_matrix(edge, p_a_e, N)
            scipy.sparse.save_npz(os.path.join(genPath, 'attmat_{:s}.npz'.format(self.layerN)), scisp)

    edge_e = torch.exp(edge_e_a - torch.max(edge_e_a))  # edge_e: E
    e_rowsum = spmm(edge, edge_e, N, torch.ones(size=(N, 1)).cuda())  # e_rowsum: N x 1
    edge_e = self.dropout(edge_e)  # dropout on attention weights (improved accuracy from 82.4 to 83.8)

    h_prime = spmm(edge, edge_e, N, h)
    h_prime = h_prime.div(e_rowsum + torch.Tensor([9e-15]).cuda())  # h_prime: N x out

    if self.concat and eluF:
        return F.elu(h_prime)
    else:
        return h_prime
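# Hedged sketch (not part of the original class): the numerically stable
# per-source-node softmax used in the genPath branch above, isolated as a
# standalone function. The names `edge_softmax`, `edge_scores`, and `src`
# are illustrative.
import torch
from torch_scatter import scatter_add, scatter_max

def edge_softmax(edge_scores, src, num_nodes, eps=9e-15):
    # Subtract each source node's maximum score before exponentiating,
    # so the exponentials cannot overflow.
    shifted = edge_scores - scatter_max(edge_scores, src, dim=0, dim_size=num_nodes)[0][src]
    exp = shifted.exp()
    # Normalize by the per-source-node sum; eps guards empty rows.
    denom = scatter_add(exp, src, dim=0, dim_size=num_nodes)[src] + eps
    return exp / denom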
def forward(self, nodes, adjs):
    # Two GraphSAGE branches: `x` produces node embeddings, `s` produces
    # the soft cluster assignments for DiffPool.
    edge, _ = dense_to_sparse(adjs)
    x = self.sage1(nodes, edge)
    s = self.sage2(nodes, edge)
    s = torch.reshape(s, (1, nodes.size(0), 128))
    x = torch.reshape(x, (1, nodes.size(0), 128))
    adjs = torch.reshape(adjs, (1, nodes.size(0), nodes.size(0)))

    # First pooling: N nodes -> 128 clusters.
    x, edge, link_loss1, ent_loss1 = dense_diff_pool(x, adjs, s)
    x = torch.reshape(x, (128, 128))
    edge = torch.reshape(edge, (128, 128))
    edge_out = edge
    edge, _ = dense_to_sparse(edge)

    x = self.sage3(x, edge)
    nodes_out = torch.tanh(x)

    # Second pooling: collapse the 128 clusters into a single node. Note the
    # un-activated sage3 output `x` (not nodes_out) feeds this stage.
    edge = torch.Tensor(convert.to_scipy_sparse_matrix(edge).todense()).cuda()
    edge = torch.reshape(edge, (1, 128, 128))
    x = torch.reshape(x, (1, 128, 2))
    s = torch.ones(1, 128, 1).cuda()
    x, edge, link_loss2, ent_loss2 = dense_diff_pool(x, edge, s)
    x = x.reshape(-1)

    link_loss = link_loss1 + link_loss2
    ent_loss = ent_loss1 + ent_loss2
    return x, link_loss, ent_loss, nodes_out, edge_out
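# Hedged shape check for dense_diff_pool as called in forward above
# (random tensors; N=300 nodes is illustrative):
import torch
from torch_geometric.nn import dense_diff_pool

x = torch.randn(1, 300, 128)    # (batch, N, feature_dim)
adj = torch.rand(1, 300, 300)   # (batch, N, N)
s = torch.randn(1, 300, 128)    # (batch, N, num_clusters) soft assignments
out, out_adj, link_loss, ent_loss = dense_diff_pool(x, adj, s)
# out: (1, 128, 128), out_adj: (1, 128, 128); both losses are scalars.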
def load_data(args, datapath):
    if args.dataset in ['arxiv'] and args.task == 'lp':
        data = {}
        dataset = PygNodePropPredDataset(name='ogbn-{}'.format(args.dataset),
                                         root='/pasteur/u/jeffgu/hgcn/data')
        split_idx = dataset.get_idx_split()
        train_idx, valid_idx, test_idx = (split_idx["train"], split_idx["valid"],
                                          split_idx["test"])
        induced_edges_train, _ = subgraph(train_idx, dataset[0].edge_index)
        induced_edges_valid, _ = subgraph(valid_idx, dataset[0].edge_index)
        induced_edges_test, _ = subgraph(test_idx, dataset[0].edge_index)
        neg_edges_train = negative_sampling(induced_edges_train)
        neg_edges_valid = negative_sampling(induced_edges_valid)
        neg_edges_test = negative_sampling(induced_edges_test)
        data['adj_train'] = to_scipy_sparse_matrix(dataset[0].edge_index).tocsr()
        data['features'] = dataset[0].x
        data['train_edges'], data['train_edges_false'] = induced_edges_train, neg_edges_train
        data['val_edges'], data['val_edges_false'] = induced_edges_valid, neg_edges_valid
        data['test_edges'], data['test_edges_false'] = induced_edges_test, neg_edges_test
    elif args.task == 'nc':
        data = load_data_nc(args.dataset, args.use_feats, datapath, args.split_seed)
    else:
        data = load_data_lp(args.dataset, args.use_feats, datapath)
        adj = data['adj_train']
        if args.task == 'lp':
            (adj_train, train_edges, train_edges_false, val_edges, val_edges_false,
             test_edges, test_edges_false) = mask_edges(adj, args.val_prop,
                                                        args.test_prop, args.split_seed)
            data['adj_train'] = adj_train
            data['train_edges'], data['train_edges_false'] = train_edges, train_edges_false
            data['val_edges'], data['val_edges_false'] = val_edges, val_edges_false
            data['test_edges'], data['test_edges_false'] = test_edges, test_edges_false
    data['adj_train_norm'], data['features'] = process(data['adj_train'], data['features'],
                                                       args.normalize_adj, args.normalize_feats)
    if args.dataset == 'airport':
        data['features'] = augment(data['adj_train'], data['features'])
    return data
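# Hedged usage sketch for load_data; the Namespace fields are inferred from
# the attributes read above, and the values shown are illustrative.
from argparse import Namespace

args = Namespace(dataset='cora', task='lp', use_feats=1,
                 val_prop=0.05, test_prop=0.10, split_seed=1234,
                 normalize_adj=1, normalize_feats=1)
data = load_data(args, datapath='./data/cora')
# data now holds 'adj_train_norm', 'features', and the positive/negative
# train/val/test edge sets produced by mask_edges.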
def load_data_nc(dataset, use_feats, data_path, split_seed):
    if dataset in ['cora', 'pubmed']:
        adj, features, labels, idx_train, idx_val, idx_test = load_citation_data(
            dataset, use_feats, data_path, split_seed)
    elif dataset == 'arxiv':
        dataset = PygNodePropPredDataset(name='ogbn-arxiv',
                                         root='/pasteur/u/jeffgu/hgcn/data')
        split_idx = dataset.get_idx_split()
        idx_train, idx_val, idx_test = (split_idx["train"], split_idx["valid"],
                                        split_idx["test"])
        adj = to_scipy_sparse_matrix(dataset[0].edge_index).tocsr()
        features = dataset[0].x
        labels = dataset[0].y
    else:
        if dataset == 'disease_nc':
            adj, features, labels = load_synthetic_data(dataset, use_feats, data_path)
            val_prop, test_prop = 0.10, 0.60
        elif dataset == 'airport':
            adj, features, labels = load_data_airport(dataset, data_path, return_label=True)
            val_prop, test_prop = 0.15, 0.15
        else:
            raise FileNotFoundError('Dataset {} is not supported.'.format(dataset))
        idx_val, idx_test, idx_train = split_data(labels, val_prop, test_prop, seed=split_seed)

    labels = torch.LongTensor(labels)
    data = {
        'adj_train': adj,
        'features': features,
        'labels': labels,
        'idx_train': idx_train,
        'idx_val': idx_val,
        'idx_test': idx_test,
    }
    return data
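# Hedged usage sketch for load_data_nc (the data_path is illustrative):
data = load_data_nc('airport', use_feats=1, data_path='./data/airport',
                    split_seed=1234)
# 'airport' is split with val_prop/test_prop = 0.15/0.15 via split_data;
# data['labels'] is a LongTensor and the idx_* entries are node indices.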
def main():
    parser = ArgumentParser(description="GraphZoom")
    parser.add_argument("-d", "--dataset", type=str, default="arxiv",
                        help="input dataset")
    parser.add_argument("-o", "--coarse", type=str, default="lamg",
                        help="choose a coarsening method: [simple, lamg]")
    parser.add_argument("-c", "--mcr_dir", type=str, default="/opt/matlab/R2018A/",
                        help="directory of the MATLAB compiler runtime (only required by lamg coarsening)")
    parser.add_argument("-s", "--search_ratio", type=int, default=12,
                        help="control the search space of the graph fusion process (only required by lamg coarsening)")
    parser.add_argument("-r", "--reduce_ratio", type=int, default=2,
                        help="control the graph coarsening level (only required by lamg coarsening)")
    parser.add_argument("-v", "--level", type=int, default=1,
                        help="number of coarsening levels (only required by simple coarsening)")
    parser.add_argument("-n", "--num_neighs", type=int, default=2,
                        help="number of k-nearest neighbors in the graph fusion process")
    parser.add_argument("-l", "--lda", type=float, default=0.1,
                        help="control the self-loop weight in the adjacency matrix")
    parser.add_argument("-e", "--embed_path", type=str, default="./embed_results/",
                        help="path for the embedding results")
    parser.add_argument("-m", "--embed_method", type=str, default="node2vec",
                        help="graph embedding method")
    parser.add_argument("-f", "--fusion", default=True, action="store_false",
                        help="whether to use graph fusion")
    parser.add_argument("-p", "--power", default=False, action="store_true",
                        help="set True to enhance the power of the graph filter")
    parser.add_argument("-g", "--sage_model", type=str, default="mean",
                        help="aggregation function in GraphSAGE")
    parser.add_argument("-w", "--sage_weighted", default=True, action="store_false",
                        help="whether to use the edge weights of the reduced graph")
    args = parser.parse_args()

    dataset = args.dataset
    feature_path = "dataset/{}/{}-feats.npy".format(dataset, dataset)
    fusion_input_path = "dataset/{}/{}.mtx".format(dataset, dataset)
    reduce_results = "./reduction_results/"
    mapping_path = "{}Mapping.mtx".format(reduce_results)

    d = PygNodePropPredDataset(name=f"ogbn-{dataset}")
    os.makedirs(reduce_results, exist_ok=True)
    os.makedirs(f"dataset/{dataset}", exist_ok=True)

    if args.fusion:
        coarsen_input_path = "dataset/{}/fused_{}.mtx".format(dataset, dataset)
    else:
        coarsen_input_path = "dataset/{}/{}.mtx".format(dataset, dataset)

    ###### Load Data ######
    print("%%%%%% Loading Graph Data %%%%%%")
    lp_index, lp_weight = get_laplacian(to_undirected(d[0].edge_index, d[0].num_nodes))
    laplacian = to_scipy_sparse_matrix(lp_index, lp_weight)
    if args.coarse == "lamg":
        if os.path.exists(fusion_input_path):
            print("Laplacian matrix in mtx already exists.")
        else:
            print("Saving Laplacian matrix in mtx...")
            mmwrite(fusion_input_path, laplacian)

    # Node features are only required for graph fusion.
    if args.fusion:
        feature = d[0].x.numpy()

    ###### Graph Fusion ######
    if args.fusion:
        print("%%%%%% Starting Graph Fusion %%%%%%")
        fusion_start = time.process_time()
        laplacian = graph_fusion(laplacian, feature, args.num_neighs, args.mcr_dir,
                                 args.coarse, fusion_input_path, args.search_ratio,
                                 reduce_results, mapping_path, dataset)
        fusion_time = time.process_time() - fusion_start

    ###### Graph Reduction ######
    print("%%%%%% Starting Graph Reduction %%%%%%")
    reduce_start = time.process_time()
    if args.coarse == "simple":
        G, projections, laplacians, level = sim_coarse(laplacian, args.level)
        reduce_time = time.process_time() - reduce_start
    elif args.coarse == "lamg":
        os.system('./run_coarsening.sh {} {} {} n {}'.format(
            args.mcr_dir, coarsen_input_path, args.reduce_ratio, reduce_results))
        reduce_time = read_time("{}CPUtime.txt".format(reduce_results))
        G = mtx2graph("{}Gs.mtx".format(reduce_results))
        level = read_levels("{}NumLevels.txt".format(reduce_results))
        projections, laplacians = construct_proj_laplacian(laplacian, level, reduce_results)
    else:
        raise NotImplementedError

    edge_index = torch.tensor(list(G.edges)).t().contiguous().view(2, -1)
    edge_index = to_undirected(edge_index, len(G.nodes()))

    ###### Embed Reduced Graph ######
    print("%%%%%% Starting Graph Embedding %%%%%%")
    if args.embed_method == "node2vec":
        embed_start = time.process_time()
        embeddings = node2vec(edge_index)
    else:
        raise NotImplementedError
    embed_time = time.process_time() - embed_start

    ###### Refinement ######
    print("%%%%%% Starting Graph Refinement %%%%%%")
    refine_start = time.process_time()
    embeddings = refinement(level, projections, laplacians, embeddings, args.lda, args.power)
    refine_time = time.process_time() - refine_start

    ###### Save Embeddings ######
    os.makedirs(args.embed_path, exist_ok=True)
    np.save(args.embed_path + "embeddings.npy", embeddings)

    ###### Report Timing Information ######
    print("%%%%%% CPU time %%%%%%")
    if args.fusion:
        total_time = fusion_time + reduce_time + embed_time + refine_time
        print(f"Graph Fusion Time: {fusion_time:.3f}")
    else:
        total_time = reduce_time + embed_time + refine_time
        print("Graph Fusion Time: 0")
    print(f"Graph Reduction Time: {reduce_time:.3f}")
    print(f"Graph Embedding Time: {embed_time:.3f}")
    print(f"Graph Refinement Time: {refine_time:.3f}")
    print(f"Total Time = Fusion_time + Reduction_time + Embedding_time + Refinement_time = {total_time:.3f}")
def forward(self, inputs):
    # CNN feature extractor: five conv + max-pool stages.
    x = F.relu(self.conv1(inputs))
    x = F.max_pool2d(x, 2, 2)
    x = F.relu(self.conv2(x))
    x = F.max_pool2d(x, 2, 2)
    x = F.relu(self.conv3(x))
    x = F.max_pool2d(x, 2, 2)
    x = F.relu(self.conv4(x))
    x = F.max_pool2d(x, 2, 2)
    x = F.relu(self.conv5(x))
    x = F.max_pool2d(x, 2, 2)

    # Treat the flattened feature map as 256 graph nodes with 256 features.
    org = torch.reshape(x, (256, 256))
    edge = torch.Tensor(ori_adjacen).long().t().contiguous().cuda()
    x = self.sage1(org, edge)
    s = self.sage2(org, edge)
    s = torch.reshape(s, (1, 256, 128))
    x = torch.reshape(x, (1, 256, 128))
    edge = torch.Tensor(convert.to_scipy_sparse_matrix(edge).todense()).cuda()
    edge = torch.reshape(edge, (1, 256, 256))

    # First pooling: 256 nodes -> 128 clusters.
    x, edge, link_loss1, ent_loss1 = dense_diff_pool(x, edge, s)
    x = torch.reshape(x, (128, 128))
    edge = torch.reshape(edge, (128, 128))

    # Binarize the pooled adjacency row-wise, keeping only each row's maxima.
    # edge_out aliases edge, so the binarized matrix also drives
    # dense_to_sparse below.
    edge_out = edge
    for i in range(edge_out.size(0)):
        edge_out[i, :] = torch.where(
            edge_out[i, :] == torch.max(edge_out[i, :]),
            torch.ones(1, 128).cuda(), torch.zeros(1, 128).cuda())
    edge, _ = dense_to_sparse(edge)

    x = self.sage3(x, edge)
    nodes_out = torch.tanh(x)
    x = nodes_out
    edge_dense = edge

    # Second pooling: collapse the 128 clusters into a single node.
    edge = torch.Tensor(convert.to_scipy_sparse_matrix(edge).todense()).cuda()
    edge = torch.reshape(edge, (1, 128, 128))
    x = torch.reshape(x, (1, 128, 2))
    s = torch.ones(1, 128, 1).cuda()
    x, edge, link_loss2, ent_loss2 = dense_diff_pool(x, edge, s)
    x = x.reshape(-1)

    link_loss = link_loss1 + link_loss2
    ent_loss = ent_loss1 + ent_loss2
    return x, link_loss, ent_loss, nodes_out, edge_out, edge_dense
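# Hedged vectorized equivalent of the row-wise argmax binarization loop in
# forward above (like the loop, ties keep every maximal entry):
row_max = edge_out.max(dim=1, keepdim=True).values
edge_out = (edge_out == row_max).float()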
def main():
    parser = argparse.ArgumentParser(description='Prepare data for Giant-XRT')
    parser.add_argument('--raw-text-path', type=str, required=True,
                        help="Path of the raw text (.txt file, each row corresponds to a node)")
    parser.add_argument('--vectorizer-config-path', type=str, required=True,
                        help="Path to a json file that specifies the tfidf hyper-parameters")
    parser.add_argument('--data-root-dir', type=str, default="./dataset")
    parser.add_argument('--xrt-data-dir', type=str, default="./proc_data_xrt")
    parser.add_argument('--dataset', type=str, default="ogbn-arxiv")
    parser.add_argument('--max-deg', type=int, default=1000)
    args = parser.parse_args()
    print(args)

    # Save under args.xrt_data_dir/args.dataset.
    save_data_dir = os.path.join(args.xrt_data_dir, args.dataset)
    dataset = PygNodePropPredDataset(name=args.dataset, root=args.data_root_dir)
    data = dataset[0]
    edge_index = data.edge_index

    # Make sure edge_index is undirected.
    if not is_undirected(edge_index):
        edge_index = to_undirected(edge_index)

    # Keep only nodes whose degree is below max_deg.
    Degree = degree(edge_index[0])
    Filtered_idx = torch.where(Degree < args.max_deg)[0]
    print('Number of original nodes: {}'.format(data.x.shape[0]))
    print('Number of filtered nodes: {}'.format(len(Filtered_idx)))

    # Construct and save the label matrix (adjacency matrix) Y.
    Y_csr_all = smat.csr_matrix(to_scipy_sparse_matrix(edge_index))
    Y_csr_trn = Y_csr_all[Filtered_idx]
    smat_util.save_matrix(f"{save_data_dir}/Y.trn.npz", Y_csr_trn)
    smat_util.save_matrix(f"{save_data_dir}/Y.all.npz", Y_csr_all)
    print("Saved Y.trn.npz and Y.all.npz")

    # Apply the same filtering to the raw text.
    with open(args.raw_text_path, "r") as fin:
        node_text_list = fin.readlines()
    print("|node_text_list|={}".format(len(node_text_list)))
    count = 0
    with open(f"{save_data_dir}/X.trn.txt", "w") as fout:
        for cur_idx, line in enumerate(node_text_list):
            # Guard against indexing past the end once all kept nodes are written.
            if count < len(Filtered_idx) and Filtered_idx[count].item() == cur_idx:
                fout.writelines(line)
                count += 1
    assert count == len(Filtered_idx), "count={}, len(Filtered_idx)={}".format(
        count, len(Filtered_idx))
    print("Saved X.trn.txt")

    # Apply the same filtering to the tfidf features.
    vectorizer_config = Vectorizer.load_config_from_args(args)  # uses args.vectorizer_config_path
    preprocessor = Preprocessor.train(node_text_list, vectorizer_config, dtype=np.float32)
    preprocessor.save(f"{save_data_dir}/tfidf-model")
    X_tfidf_all = preprocessor.predict(node_text_list)
    X_tfidf_trn = X_tfidf_all[Filtered_idx]
    smat_util.save_matrix(f"{save_data_dir}/X.all.tfidf.npz", X_tfidf_all)
    smat_util.save_matrix(f"{save_data_dir}/X.trn.tfidf.npz", X_tfidf_trn)
    print("Saved X.trn.tfidf.npz and X.all.tfidf.npz")
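# Hedged mini-example of the degree filter used above (synthetic graph):
import torch
from torch_geometric.utils import degree

edge_index = torch.tensor([[0, 0, 1, 2],
                           [1, 2, 0, 0]])
deg = degree(edge_index[0], num_nodes=3)  # tensor([2., 1., 1.])
keep = torch.where(deg < 2)[0]            # node 0 has degree 2 and is dropped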