def _build_graph(self, in_data, partition):
    st_anndata, st_coords = in_data
    genes = np.array(st_anndata.var_names)
    gene_to_node = {val: ind for ind, val in enumerate(genes)}
    num_genes = len(genes)
    part_mask = np.array([i in partition for i in st_anndata.obs_names])
    cells = np.array(st_anndata.obs_names[part_mask])
    cell_to_node = {val: ind + num_genes for ind, val in enumerate(cells)}
    num_cells = len(cells)
    coords = torch.tensor([st_coords[i] for i in cells])
    coords_dims = coords.shape[1]
    coords_pad = torch.cat((torch.zeros(num_genes, coords_dims), coords), 0).float()
    cell_mask = torch.cat((torch.full((num_genes,), False), torch.full((num_cells,), True)), 0)
    expr_orig = np.log(st_anndata.X[part_mask, :] + 1)
    expr = np.vstack((np.zeros((num_genes, num_genes)), expr_orig))
    # Cell -> gene edges: columns normalized so each gene's incoming weights sum to 1.
    expr_sparse_cg = sparse.coo_matrix(np.nan_to_num(expr / expr.sum(axis=0, keepdims=True)))
    edges_cg, edge_features_cg = from_scipy_sparse_matrix(expr_sparse_cg)
    # Gene -> cell edges: rows normalized, then transposed.
    expr_sparse_gc = sparse.coo_matrix(np.nan_to_num((expr / expr.sum(axis=1, keepdims=True)).T))
    edges_gc, edge_features_gc = from_scipy_sparse_matrix(expr_sparse_gc)
    node_in_channels = 2 * num_genes
    # Node features: one-hot gene identity for gene nodes, expression for cell nodes.
    x = torch.zeros(num_genes + num_cells, num_genes + num_genes)
    x[:num_genes, :num_genes].fill_diagonal_(1.)
    x[num_genes:, num_genes:] = torch.tensor(expr_orig).float()
    node_to_id = np.concatenate((genes, cells))
    edges = torch.cat((edges_cg, edges_gc), 1)
    edge_attr = torch.cat((edge_features_cg, edge_features_gc), 0).float()
    edge_type = torch.cat(
        (torch.zeros_like(edge_features_cg, dtype=torch.long),
         torch.ones_like(edge_features_gc, dtype=torch.long)), 0
    )
    node_index_orig = torch.arange(x.shape[0])
    data = Data(x=x, edge_index=edges, edge_attr=edge_attr, edge_type=edge_type,
                pos=coords_pad, cell_mask=cell_mask, node_index_orig=node_index_orig)
    maps = {
        "gene_to_node": gene_to_node,
        "cell_to_node": cell_to_node,
        "node_to_id": node_to_id,
    }
    return data, maps, node_in_channels
def get_jaccard(adjacency_matrix: torch.Tensor, features: torch.Tensor, threshold: float = 0.01):
    """Jaccard similarity edge filtering as proposed in Huijun Wu, Chen Wang,
    Yuriy Tyshetskiy, Andrew Docherty, Kai Lu, and Liming Zhu. Adversarial
    examples for graph data: Deep insights into attack and defense.

    Parameters
    ----------
    adjacency_matrix : torch.Tensor
        Sparse [n,n] adjacency matrix.
    features : torch.Tensor
        Dense [n,d] feature matrix.
    threshold : float, optional
        Similarity threshold for filtering, by default 0.01.

    Returns
    -------
    torch.Tensor
        Preprocessed adjacency matrix.
    """
    row, col = adjacency_matrix._indices().cpu()
    values = adjacency_matrix._values().cpu()
    N = adjacency_matrix.shape[0]

    if features.is_sparse:
        features = features.to_dense()

    modified_adj = sp.coo_matrix((values.numpy(), (row.numpy(), col.numpy())), (N, N))
    modified_adj = drop_dissimilar_edges(features.cpu().numpy(), modified_adj, threshold=threshold)
    modified_adj = torch.sparse.FloatTensor(*from_scipy_sparse_matrix(modified_adj)).to(adjacency_matrix.device)
    return modified_adj
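# A minimal usage sketch of `get_jaccard` (illustrative, not from the source:
# the toy adjacency and features below are made up, and `drop_dissimilar_edges`
# must be importable for the call to run).
import torch

indices = torch.tensor([[0, 1, 1, 2], [1, 0, 2, 1]])  # symmetric 3-node path
adj = torch.sparse_coo_tensor(indices, torch.ones(4), (3, 3)).coalesce()
features = torch.tensor([[1., 1., 0.],
                         [1., 0., 0.],
                         [0., 0., 1.]])  # node 2 shares no features with 0/1

filtered = get_jaccard(adj, features, threshold=0.01)
print(filtered)  # edges between nodes with Jaccard similarity below threshold are dropped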
def test_from_scipy_sparse_matrix():
    edge_index = torch.tensor([[0, 1, 0], [1, 0, 0]])
    adj = to_scipy_sparse_matrix(edge_index)
    out = from_scipy_sparse_matrix(adj)
    assert out[0].tolist() == edge_index.tolist()
    assert out[1].tolist() == [1, 1, 1]
def load_ppi():
    mat = loadmat(PPI)
    ei = from_scipy_sparse_matrix(mat['network'])[0]
    y = torch.tensor(mat['group'].todense(), dtype=torch.long)
    X = torch.eye(y.size()[0])
    return Data(x=X, y=y, edge_index=ei)
def predict(adj, features):
    edge_index, edge_weight = from_scipy_sparse_matrix(adj)
    edge_index, edge_weight = edge_index.cuda(), edge_weight.float().cuda()
    features = torch.from_numpy(features).float().cuda()
    n, d = features.shape
    print('Data loaded')

    model = APPNPModel(n_features=d, n_classes=19, hidden_dimensions=[128],
                       alpha=0.01, do_use_dropout_for_propagation=True).cuda()
    model.load_state_dict(torch.load(MODEL_CHECKPOINT))
    model.eval()
    print('Model loaded')

    logits = model(features, edge_index, edge_weight)
    print('Prediction finished')
    test_out = (logits.argmax(-1) + 1).cpu().numpy()
    return test_out
def load_pyg_data(fdict):
    """Return a PyG Data object."""
    adj, attr, labels = load_pkl(fdict, with_test=True)

    # Preprocess features.
    empty_features_inx = np.where(np.all(attr == 0, axis=1))
    attr, remove_list = preprocess_feat(attr)
    # Preprocess the adjacency matrix.
    adj = preprocess_adj(adj, remove_list)
    # Preprocess labels.
    # labels = preprocess_labels(labels, reverse=False)

    # Convert to PyTorch tensors.
    edge_index, edge_attr = from_scipy_sparse_matrix(adj)
    x = torch.tensor(attr, dtype=torch.float)
    # y = torch.tensor(labels, dtype=torch.long)
    data = Data(
        x=x,
        # y=y,
        edge_index=edge_index,
        edge_weight=edge_attr,
        num_class=18)

    # Record the indices of all-zero feature rows.
    data.empty_features_inx = empty_features_inx
    print("Detected all-zero features!", data.empty_features_inx)
    return data
def transform(A, X, labels):
    # Convert to PyTorch tensors.
    edge_index, edge_attr = from_scipy_sparse_matrix(A)
    print(type(X), type(labels))
    x = torch.tensor(X, dtype=torch.float)
    y = torch.tensor(labels, dtype=torch.long)
    # Build the PyG Data object.
    data = Data(x=x, y=y, edge_index=edge_index, edge_weight=edge_attr)
    data.num_class = len(np.unique(y))
    return data
def prepare_dataset(X_data, y_data, go_vocab, aa_vocab):
    dataset = []
    for data, raw_labels in zip(X_data, y_data):
        x = prepare_sequence(data[0], aa_vocab, 3000).float()
        edge_index = from_scipy_sparse_matrix(data[1])[0]
        labels = torch.tensor([go_vocab[y] for y in raw_labels])
        # Multi-hot target vector over the GO vocabulary.
        targets = torch.zeros(len(go_vocab))
        targets[labels] = 1
        dataset.append(Data(x=x, edge_index=edge_index, y=targets))
    return dataset
def createGraphDataset(X, y):
    """Create the graph dataset for the multi-channel signals."""
    Xg = []
    for i in range(X.shape[0]):
        W = X[i]
        A = coo_matrix(W)
        edge_index, edge_attr = from_scipy_sparse_matrix(A)
        edge_index, edge_attr = remove_self_loops(edge_index, edge_attr)
        x = torch.tensor(np.identity(8), dtype=torch.float)
        yg = torch.tensor([y[i]], dtype=torch.long)
        data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=yg)
        Xg.append(data)
    return Xg
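# Hypothetical usage of `createGraphDataset`: X holds one 8x8 connectivity
# matrix per trial and y one label per trial (the random inputs are stand-ins,
# not from the source).
import numpy as np

rng = np.random.default_rng(0)
X_demo = rng.random((4, 8, 8))       # 4 trials, 8 channels each
y_demo = rng.integers(0, 2, size=4)  # binary trial labels

graphs = createGraphDataset(X_demo, y_demo)
print(graphs[0])  # e.g. Data(x=[8, 8], edge_index=[2, E], edge_attr=[E], y=[1])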
def create_graph(node_features, labels, edges, edge_attr=None) -> Graph:
    assert len(node_features) == len(labels)
    node_features = torch.tensor(node_features, dtype=torch.float)
    labels = torch.tensor(labels, dtype=torch.long)
    edge_weight = None  # only set for the scipy.sparse path below
    if isinstance(edges, np.ndarray):
        edge_index = torch.tensor(edges, dtype=torch.long).transpose(0, 1)
    elif isinstance(edges, sparse.spmatrix):
        edge_index, edge_weight = from_scipy_sparse_matrix(edges)
        edge_weight = edge_weight.float()
    else:
        raise ValueError('numpy(n, 2) or scipy.sparse adj')
    data = Data(node_features, edge_index=edge_index, edge_weight=edge_weight, y=labels)
    dgl_graph = DGLGraph((data.edge_index[0], data.edge_index[1]))  # TODO make DGLGraph lazy init
    return Graph(data, dgl_graph)
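# Sketch of both accepted edge formats for `create_graph` (toy inputs, assuming
# the Graph/DGLGraph wrappers used above are importable).
import numpy as np
from scipy import sparse

feats = np.eye(4, dtype=np.float32)
labels = [0, 1, 0, 1]

# numpy edge list of shape (n_edges, 2); transposed internally to (2, n_edges)
g1 = create_graph(feats, labels, np.array([[0, 1], [1, 2], [2, 3]]))

# scipy sparse adjacency; converted via from_scipy_sparse_matrix
adj = sparse.coo_matrix(([1, 1, 1], ([0, 1, 2], [1, 2, 3])), shape=(4, 4))
g2 = create_graph(feats, labels, adj)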
def build_graph(self, features_list, tdms):
    '''Get edge_index for GATLayer.'''
    edge_index_list, edge_attr_list = [], []
    for features, tdm in zip(features_list, tdms):
        features = features.cpu()
        cosine_matrix = 1 - pairwise_distances(tdm, metric="cosine")
        G = nx.from_numpy_matrix(cosine_matrix)
        # Graph to sparse matrix.
        G = nx.to_scipy_sparse_matrix(G)
        # Sparse matrix to edge index and edge attributes.
        edge_index, edge_attr = from_scipy_sparse_matrix(G)
        edge_index_list.append(edge_index)
        edge_attr_list.append(edge_attr)
    return edge_index_list, edge_attr_list
def deal_with_mat(self):
    """
    Convert a .mat file into [Data].
    :return: DataList: [Data]
    """
    print("dealing with mat...")
    m = loadmat(self.raw_paths[0])
    A = utils.from_scipy_sparse_matrix(m['network'])
    att = torch.from_numpy(m['attributes'].todense().astype(np.float32))
    y = torch.from_numpy(m['labels'].reshape(-1)).to(torch.long)
    # If the minimum label is not 0, assume indices start at 1.
    if int(torch.min(y)) != 0:
        y -= 1
    dt = tgd.Data(x=att, edge_index=A[0], edge_weight=A[1].to(torch.float32), y=y)
    return [dt]
def graph_clustering(A_matrix, method, n_clusters, ratio=None, graph_num=None, plotting=True, Mean=False):
    if graph_num is None:
        graph_num = random.randint(1, len(A_matrix)) - 1
    if Mean:
        graph_num = 0
        A_matrix = np.mean(A_matrix, axis=0, keepdims=True)
    n = A_matrix.shape[1]
    if method == 'kmeans':
        # k-means on the first n_clusters - 1 eigenvectors with nonzero eigenvalues.
        _, vecs = graph_representation(train_A=A_matrix, graph_num=graph_num, Prop='Spectral', plotting=False)
        kmeans = KMeans(n_clusters=n_clusters)
        kmeans.fit(vecs[:, 1:n_clusters].reshape(-1, n_clusters - 1))
        if ratio is None:
            return kmeans.labels_
        num = np.sum(kmeans.labels_)
        ind = 0 if num > (n // 2) else 1
        prob = kmeans.fit_transform(vecs[:, 1:n_clusters].reshape(-1, n_clusters - 1))
        thresh = np.quantile(prob[:, ind], ratio)
        return prob[:, ind] >= thresh
    elif method == 'Spectral_clustering':
        adjacency_matrix = A_matrix[graph_num].reshape(n, n)
        sc = SpectralClustering(n_clusters, affinity='precomputed', n_init=100, assign_labels='discretize')
        Class = sc.fit_predict(adjacency_matrix)
        if plotting:
            Ab_matrix = A_binarize(A_matrix)
            G = nx.Graph(Ab_matrix[graph_num])
            plt.figure()
            nx.draw(G, node_size=200, pos=nx.spring_layout(G))
            plt.show()
            plt.figure()
            nx.draw(G, node_color=Class, node_size=200, pos=nx.spring_layout(G))
            plt.show()
        return Class
    elif method == 'Affinity_propagation':
        _, vecs = graph_representation(train_A=A_matrix, graph_num=graph_num, Prop='Spectral', plotting=False)
        clustering = AffinityPropagation().fit(vecs[:, 1:n_clusters])
    elif method == 'Agglomerative_clustering':
        _, vecs = graph_representation(train_A=A_matrix, graph_num=graph_num, Prop='Spectral', plotting=False)
        clustering = AgglomerativeClustering(n_clusters=n_clusters).fit(vecs[:, 1:n_clusters].reshape(-1, n_clusters - 1))
    elif method == 'Graclus':
        sA = sparse.csr_matrix(A_matrix[graph_num])
        edge_index, edge_weight = g_utils.from_scipy_sparse_matrix(sA)
        cluster = graclus_cluster(edge_index[0], edge_index[1], edge_weight)
        return cluster.numpy()
    else:
        raise Exception("non-existing clustering method")
    return clustering.labels_
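# Illustrative call to `graph_clustering` on a random symmetric batch
# (stand-in data, not from the source; the 'Spectral_clustering' branch only
# needs sklearn's SpectralClustering to be imported in this module).
import numpy as np

rng = np.random.default_rng(0)
A_demo = (rng.random((5, 10, 10)) > 0.7).astype(float)
A_demo = np.maximum(A_demo, A_demo.transpose(0, 2, 1))  # symmetrize

labels = graph_clustering(A_demo, method='Spectral_clustering',
                          n_clusters=2, graph_num=0, plotting=False)
print(labels)  # one cluster id per node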
def train_model(log=False):
    extract = utils.from_scipy_sparse_matrix(mat)
    G = data.Data(edge_index=extract[0], edge_attr=extract[1], x=x, y=y)
    edge_index = G.edge_index

    num_feat = 5
    num_graph_conv_layers = 2
    graph_conv_embed_sizes = 256
    num_lin_layers = 3
    lin_hidden_sizes = 256
    num_classes = 2

    model = GCN(num_feat, num_graph_conv_layers, graph_conv_embed_sizes,
                num_lin_layers, lin_hidden_sizes, num_classes)
    model.load_state_dict(
        torch.load(load_model_file, map_location=torch.device('cpu')))
    model.eval()
    return model, x, y, edge_index
def process(self):
    mat = loadmat(os.path.join(self.raw_dir, self.raw_file_names))
    features = pd.DataFrame(mat['local_info'][:, :-1], columns=self.targets)
    if self.target == 'year':
        features.loc[(features['year'] < 2004) |
                     (features['year'] > 2009), 'year'] = 0
    y = torch.from_numpy(LabelEncoder().fit_transform(features[self.target]))
    if 0 in features[self.target].values:
        y = y - 1
    x = features.drop(columns=self.target).replace({0: pd.NA})
    x = torch.tensor(pd.get_dummies(x).values, dtype=torch.float)
    edge_index = from_scipy_sparse_matrix(mat['A'])[0]
    data = Data(x=x, edge_index=edge_index, y=y, num_nodes=len(y))
    if self.pre_transform is not None:
        data = self.pre_transform(data)
    torch.save(self.collate([data]), self.processed_paths[0])
def sbm_2_geometric(self, data):
    '''Transform SBM data to PyTorch Geometric data.'''
    dataset = []
    # Traverse graphs.
    for graph in tqdm(data):
        # Edge matrix.
        W = scipy.sparse.csr_matrix(graph['W'])
        edge_index, edge_weight = utils.from_scipy_sparse_matrix(W)

        # Node feature matrix in PyTorch Geometric format;
        # x must be a float tensor (conv layers error out otherwise).
        x = torch.tensor([[feature] for feature in graph['node_feat']]).float()

        # Node labels; must be long because the loss functions require it.
        y = graph['node_label'].long()

        # Create the PyTorch Geometric graph.
        graph_data = Data(x=x, edge_index=edge_index, y=y)
        dataset.append(graph_data)
    return dataset
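# Illustrative input for `sbm_2_geometric`: each graph is a dict with a dense
# adjacency 'W', per-node features 'node_feat', and labels 'node_label'
# (key names taken from the method body; `converter` is a hypothetical
# instance of the enclosing class).
import numpy as np
import torch

toy = [{
    'W': np.array([[0, 1], [1, 0]]),
    'node_feat': torch.tensor([0.5, -0.5]),
    'node_label': torch.tensor([0, 1]),
}]
dataset = converter.sbm_2_geometric(toy)
print(dataset[0])  # Data(x=[2, 1], edge_index=[2, 2], y=[2])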
def process(self):
    x = np.load(os.path.join(self.raw_dir, 'airport_features.pkl'), allow_pickle=True)
    y = np.load(os.path.join(self.raw_dir, 'airport_labels.pkl'), allow_pickle=True)
    adj = np.load(os.path.join(self.raw_dir, 'airport_adj.pkl'), allow_pickle=True)
    edge_index, _ = from_scipy_sparse_matrix(adj)
    train, val, test = np.load(os.path.join(self.raw_dir, 'airport_tvt_nids.pkl'),
                               allow_pickle=True)
    train_mask = torch.zeros_like(y, dtype=torch.bool)
    val_mask = torch.zeros_like(y, dtype=torch.bool)
    test_mask = torch.zeros_like(y, dtype=torch.bool)
    train_mask[train] = True
    val_mask[val] = True
    test_mask[test] = True
    data = Data(
        x=x,
        edge_index=edge_index,
        y=y,
        num_nodes=len(y),
        train_mask=train_mask,
        val_mask=val_mask,
        test_mask=test_mask
    )
    if self.pre_transform is not None:
        data = self.pre_transform(data)
    torch.save(self.collate([data]), self.processed_paths[0])
def metapath2vec(fp, PARAMS):
    """Generate the metapath2vec embedding.

    Args:
        fp (string): the file path of the root of the data
        PARAMS (dict): the parameters of the metapath2vec model, KEYS: {
            GRAPH_NAME: the name of the graph file,
            EMBEDDING_DIM: dimension of embedding,
            WALK_LENGTH: random walk length,
            CONTEXT_SIZE: context size,
            WALKS_PER_NODE: number of walks per node,
            NUM_NEG_SAMPLES: number of negative samples,
            LEARNING_RATE: learning rate,
            BATCH_SIZE: batch size of each batch,
            NUM_EPOCH: number of epochs to train,
            CUDA: use GPU
        }

    Returns:
        np.array: numpy array of the metapath2vec embedding
    """
    g = io.loadmat(osp.join(fp, 'interim', 'graph', PARAMS['GRAPH_NAME']))
    user_user = from_scipy_sparse_matrix(g['U'])
    author_post = from_scipy_sparse_matrix(g['A'])
    post_user = from_scipy_sparse_matrix(g['P'])
    data = Data(edge_index_dict={
        ('user', 'replied by', 'user'): user_user[0],
        ('user', 'wrote', 'post'): author_post[0],
        ('post', 'commented by', 'user'): post_user[0],
    },
                num_nodes_dict={
                    'post': g['post_indx'].shape[1],
                    'user': g['user_indx'].shape[1]
                })
    if PARAMS['CUDA']:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    else:
        device = 'cpu'
    model = MetaPath2Vec(data.edge_index_dict,
                         embedding_dim=PARAMS['EMBEDDING_DIM'],
                         metapath=metapath,
                         walk_length=PARAMS['WALK_LENGTH'],
                         context_size=PARAMS['CONTEXT_SIZE'],
                         walks_per_node=PARAMS['WALKS_PER_NODE'],
                         num_negative_samples=PARAMS['NUM_NEG_SAMPLES'],
                         sparse=True).to(device)

    losses = []
    if not PARAMS["TEST"]:
        loader = model.loader(batch_size=PARAMS['BATCH_SIZE'], shuffle=True, num_workers=8)
        optimizer = torch.optim.SparseAdam(model.parameters(), lr=PARAMS['LEARNING_RATE'])

        def train(epoch, log_steps=100):
            model.train()
            total_loss = 0
            store = []
            i = 1
            loading = iter(loader)
            while loading is not None:
                try:
                    pos_rw, neg_rw = next(loading)
                except IndexError:
                    continue
                except StopIteration:
                    break  # loader exhausted; do not reuse the last batch
                optimizer.zero_grad()
                loss = model.loss(pos_rw.to(device), neg_rw.to(device))
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
                if (i + 1) % log_steps == 0:
                    print(f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, '
                          f'Loss: {total_loss / log_steps:.4f}')
                    store.append(total_loss / log_steps)
                    total_loss = 0
                i += 1
            return store

        for epoch in range(1, PARAMS['NUM_EPOCH'] + 1):
            losses.append(train(epoch))

    model.eval()
    with torch.no_grad():
        z = model('post').detach().cpu().numpy()
    if not os.path.exists(os.path.join(fp, 'processed', 'metapath2vec')):
        os.makedirs(os.path.join(fp, 'processed', 'metapath2vec'), exist_ok=True)
    with open(osp.join(fp, 'processed', 'metapath2vec',
                       PARAMS['EMBEDDING_NAME'] + 'log.json'), 'w') as f:
        json.dump({'loss': losses}, f)
    np.save(osp.join(fp, 'processed', 'metapath2vec', PARAMS['EMBEDDING_NAME']), z)
    print('successfully saved embedding')
    return z
def topology_graph():
    adj = np.zeros((2400, 2400))
    # Intra-community edges for three blocks of 800 nodes, p = 0.03
    # (randint(0, 100) draws 0..99, so z > 96 hits {97, 98, 99}).
    for start in (0, 800, 1600):
        for i in range(start, start + 800):
            for j in range(i + 1, start + 800):
                if np.random.randint(0, 100) > 96:
                    adj[i, j] = adj[j, i] = 1
    # Inter-community edges, p = 0.0001
    # (randint(0, 10000) draws 0..9999, so z >= 9999 hits exactly one value).
    for (a0, a1), (b0, b1) in (((0, 800), (800, 1600)),
                               ((0, 800), (1600, 2400)),
                               ((800, 1600), (1600, 2400))):
        for i in range(a0, a1):
            for j in range(b0, b1):
                if np.random.randint(0, 10000) >= 9999:
                    adj[i, j] = adj[j, i] = 1
    arr_sparse = sparse.coo_matrix(adj)
    edge_index, _ = from_scipy_sparse_matrix(arr_sparse)
    edge_index = edge_index.long()
    torch.save(edge_index, './synthdata/topology/edge_index.pt')

    # Node features: three Gaussian clusters with identity covariance.
    dim = 20
    covariance_matrix = np.diag([1 for _ in range(dim)])
    center1 = 2.5 * np.random.random(size=dim) - 1
    center2 = 2.5 * np.random.random(size=dim) - 1
    center3 = 2.5 * np.random.random(size=dim) - 1
    data1 = multivariate_normal.rvs(mean=center1, cov=covariance_matrix, size=800)
    data2 = multivariate_normal.rvs(mean=center2, cov=covariance_matrix, size=800)
    data3 = multivariate_normal.rvs(mean=center3, cov=covariance_matrix, size=800)
    data = np.vstack((data1, data2, data3))
    label = np.array([0] * 800 + [1] * 800 + [2] * 800)
    x, y = torch.from_numpy(data).float(), torch.from_numpy(label).long()
    torch.save(x, './synthdata/topology/x.pt')
    torch.save(y, './synthdata/topology/y.pt')
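# Quick sanity check of the edge probabilities used above. Assumption: numpy's
# randint upper bound is exclusive, so draws run 0..high-1.
import numpy as np

z = np.random.randint(0, 100, size=1_000_000)
print((z > 96).mean())     # ~0.03: three winning values out of 100
z = np.random.randint(0, 10000, size=1_000_000)
print((z >= 9999).mean())  # ~0.0001: one winning value out of 10000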
def infomax(fp, PARAMS, feature):
    """Generate the Deep Graph Infomax (DGI) embedding.

    Args:
        fp (string): file path of the root of the data
        PARAMS (dict): the parameters of the model, KEYS: {
            GRAPH_NAME: the name of the graph file,
            SUMMARY: dimension of embedding,
            HIDDEN_CHANNELS: the hidden channels of the encoder,
            LEARNING_RATE: learning rate,
            BATCH_SIZE: batch size of each batch,
            NUM_EPOCH: number of epochs to train,
            CUDA: use GPU
        }
        feature (np.array): the node features

    Returns:
        np.array: numpy array of the DGI embedding
    """
    g = io.loadmat(osp.join(fp, 'interim', 'graph', PARAMS['GRAPH_NAME']))
    N = g['N']
    p_cate = feature
    post_indx = g['post_indx']
    edge_idx, x = from_scipy_sparse_matrix(N)
    x = x.view(-1, 1).float()
    # Scatter the post features into the full node-feature matrix.
    feature = np.zeros((x.shape[0], p_cate.shape[1]))
    feature[post_indx, :] = p_cate
    x = torch.cat([x, torch.FloatTensor(feature)], 1)
    data = Data(x=x, edge_index=edge_idx)
    if PARAMS['CUDA']:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    else:
        device = 'cpu'
    data = data.to(device)
    model = DeepGraphInfomax(
        hidden_channels=PARAMS['HIDDEN_CHANNELS'],
        encoder=Encoder(data.x.shape[1], PARAMS['SUMMARY']),
        summary=lambda z, *args, **kwargs: torch.sigmoid(z.mean(dim=0)),
        corruption=corruption).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=PARAMS['LEARNING_RATE'])

    def train():
        model.train()
        optimizer.zero_grad()
        pos_z, neg_z, summary = model(data.x, data.edge_index)
        loss = model.loss(pos_z, neg_z, summary)
        loss.backward()
        optimizer.step()
        return loss.item()

    losses = []
    for epoch in range(1, PARAMS['NUM_EPOCH'] + 1):
        loss = train()
        losses.append(loss)
        print('Epoch: {:03d}, Loss: {:.4f}'.format(epoch, loss))

    model.eval()
    with torch.no_grad():
        z, _, _ = model(data.x, data.edge_index)
    if not os.path.exists(os.path.join(fp, 'processed', 'infomax')):
        os.mkdir(os.path.join(fp, 'processed', 'infomax'))
    with open(osp.join(fp, 'processed', 'infomax',
                       PARAMS['EMBEDDING_NAME'] + 'log.json'), 'w') as f:
        json.dump({'loss': losses}, f)
    z = z.detach().cpu().numpy()[post_indx.reshape(-1), :]
    np.save(osp.join(fp, 'processed', 'infomax', PARAMS['EMBEDDING_NAME']), z)
    print('embedding infomax created')
    return z
dic2 = sio.loadmat(
    'graph_data/subject_1001_Ashwin/smoothSignal_test_graph_topologies.mat')
X2 = dic2['W']
y2 = dic2['y'].squeeze()

# Create the graph datasets using the TMA first-order terms of each channel.
graphTrainDataset = []
for i in range(X_train.shape[0]):
    tma = X_train[i, ...].squeeze()
    x = torch.tensor(tma, dtype=torch.float)
    y = torch.tensor([y_train[i]], dtype=torch.long)
    W = X1[i]
    A = coo_matrix(W)
    edge_index, edge_attr = from_scipy_sparse_matrix(A)
    edge_index, edge_attr = remove_self_loops(edge_index, edge_attr)
    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
    graphTrainDataset.append(data)

graphTestDataset = []
for i in range(X_test.shape[0]):
    tma = X_test[i, ...].squeeze()
    x = torch.tensor(tma, dtype=torch.float)
    y = torch.tensor([y_test[i]], dtype=torch.long)
    W = X2[i]
    A = coo_matrix(W)
    edge_index, edge_attr = from_scipy_sparse_matrix(A)
    edge_index, edge_attr = remove_self_loops(edge_index, edge_attr)
    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
    graphTestDataset.append(data)
def node2vec(fp, PARAMS):
    """Generate the node2vec embedding.

    Args:
        fp (string): the file path of the root of the data
        PARAMS (dict): the parameters of the node2vec model, KEYS: {
            GRAPH_NAME: the name of the graph file,
            EMBEDDING_DIM: dimension of embedding,
            WALK_LENGTH: random walk length,
            CONTEXT_SIZE: context size,
            WALKS_PER_NODE: number of walks per node,
            P: P parameter of node2vec,
            Q: Q parameter of node2vec,
            LEARNING_RATE: learning rate,
            BATCH_SIZE: batch size of each batch,
            NUM_EPOCH: number of epochs to train,
            CUDA: use GPU
        }

    Returns:
        np.array: numpy array of the embedding
    """
    g = io.loadmat(osp.join(fp, 'interim', 'graph', PARAMS['GRAPH_NAME']))
    N = g['N']
    edge_idx, x = from_scipy_sparse_matrix(N)
    post_indx = g['post_indx'].reshape(-1)
    data = Data(x=x, edge_index=edge_idx)
    if PARAMS['CUDA']:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    else:
        device = 'cpu'
    model = Node2Vec(data.edge_index,
                     embedding_dim=PARAMS['EMBEDDING_DIM'],
                     walk_length=PARAMS['WALK_LENGTH'],
                     context_size=PARAMS['CONTEXT_SIZE'],
                     walks_per_node=PARAMS['WALKS_PER_NODE'],
                     p=PARAMS['P'],
                     q=PARAMS['Q'],
                     sparse=True).to(device)
    loader = model.loader(batch_size=PARAMS['BATCH_SIZE'], shuffle=True, num_workers=8)
    optimizer = torch.optim.SparseAdam(model.parameters(), lr=PARAMS['LEARNING_RATE'])

    def train():
        model.train()
        total_loss = 0
        for pos_rw, neg_rw in tqdm(loader):
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        return total_loss / len(loader)

    print('number of nodes to be embedded {}'.format(len(post_indx)))
    print('Start Node2vec Embedding Process with Following Parameters:')
    print(PARAMS)
    losses = []
    for epoch in range(1, PARAMS['NUM_EPOCH'] + 1):
        loss = train()
        losses.append(loss)
        print('Epoch: {:02d}, Node2vec Loss: {:.4f}'.format(epoch, loss))

    model.eval()
    with torch.no_grad():
        z = model()
    if not os.path.exists(os.path.join(fp, 'processed', 'node2vec')):
        os.makedirs(os.path.join(fp, 'processed', 'node2vec'), exist_ok=True)
    with open(osp.join(fp, 'processed', 'node2vec',
                       PARAMS['EMBEDDING_NAME'] + 'log.json'), 'w') as f:
        json.dump({'loss': losses}, f)
    z = z.detach().cpu().numpy()[post_indx, :]
    np.save(osp.join(fp, 'processed', 'node2vec', PARAMS['EMBEDDING_NAME']), z)
    print('successfully saved embedding')
    return z
for actor in ast.literal_eval(actors[0]):
    actor_extension_matrix[row['item'] + 1, actor] = 1
for i in range(1, 4):
    flag = extended_dataset.loc[
        extended_dataset['id'] == row['original item id'] - 1,
        'flag' + str(i)].reset_index(drop=True)
    if not flag.empty:
        actor_extension_matrix[row['item'], i * -1] = flag[0]

X = hstack([X, actor_extension_matrix])
X = X.transpose()
# Retrieve the graph's edges and move both them and the features to the device.
X = sparse_mx_to_torch_sparse_tensor(X).to(device)
edge_idx, edge_attr = from_scipy_sparse_matrix(adj_mx)
edge_idx = edge_idx.to(device)

# These columns are not needed anymore (drop returns a copy, so reassign).
train_set = train_set.drop(['original item id'], axis=1)
test_set = test_set.drop(['original item id'], axis=1)
val_set = val_set.drop(['original item id'], axis=1)

# Create the training set.
train_dataset = PairData(train_set, sampler=sampler, adj_mx=adj_mx,
                         is_training=True, context=args.context)
print('=' * 50, '\n')
def train(self):
    global df
    if self.gpu:
        device_name = '/gpu:0'
    else:
        device_name = '/cpu:0'
    print('device name:', device_name)

    model = HGAT(self.batch_size).to(device)
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
        else:
            nn.init.uniform_(p)
    optimizer_hgat = optim.Adam(model.parameters(),
                                lr=self.parameters['lr'],
                                weight_decay=5e-4)

    # Build the hypergraph incidence edge index once.
    inci_mat = np.load('nasdaq.npy')
    inci_sparse = sparse.coo_matrix(inci_mat)
    incidence_edge = utils.from_scipy_sparse_matrix(inci_sparse)
    hyp_input = incidence_edge[0].to(device)

    batch_offsets = np.arange(start=0, stop=self.valid_index, dtype=int)
    for i in range(self.epochs):
        t1 = time()
        np.random.shuffle(batch_offsets)
        tra_loss = 0.0
        tra_reg_loss = 0.0
        tra_rank_loss = 0.0
        model.train()
        for j in tqdm(range(self.valid_index - self.parameters['seq'] - self.steps + 1)):
            emb_batch, mask_batch, price_batch, gt_batch = self.get_batch(batch_offsets[j])
            optimizer_hgat.zero_grad()
            output = model(torch.FloatTensor(emb_batch).to(device), hyp_input)
            cur_loss, cur_reg_loss, cur_rank_loss, curr_rr_train = trr_loss_mse_rank(
                output.reshape((1026, 1)),
                torch.FloatTensor(price_batch).to(device),
                torch.FloatTensor(gt_batch).to(device),
                torch.FloatTensor(mask_batch).to(device),
                self.parameters['alpha'], self.batch_size)
            tra_loss += cur_loss.item()
            tra_reg_loss += cur_reg_loss.item()
            tra_rank_loss += cur_rank_loss.item()
            cur_loss.backward()
            optimizer_hgat.step()
        print('Train Loss:',
              tra_loss / (self.valid_index - self.parameters['seq'] - self.steps + 1),
              tra_reg_loss / (self.valid_index - self.parameters['seq'] - self.steps + 1),
              tra_rank_loss / (self.valid_index - self.parameters['seq'] - self.steps + 1))

        with torch.no_grad():
            # Evaluate on the validation set.
            cur_valid_pred = np.zeros([len(self.tickers), self.test_index - self.valid_index], dtype=float)
            cur_valid_gt = np.zeros([len(self.tickers), self.test_index - self.valid_index], dtype=float)
            cur_valid_mask = np.zeros([len(self.tickers), self.test_index - self.valid_index], dtype=float)
            val_loss = 0.0
            val_reg_loss = 0.0
            val_rank_loss = 0.0
            model.eval()
            for cur_offset in range(
                    self.valid_index - self.parameters['seq'] - self.steps + 1,
                    self.test_index - self.parameters['seq'] - self.steps + 1):
                emb_batch, mask_batch, price_batch, gt_batch = self.get_batch(cur_offset)
                output_val = model(torch.FloatTensor(emb_batch).to(device), hyp_input)
                cur_loss, cur_reg_loss, cur_rank_loss, cur_rr = trr_loss_mse_rank(
                    output_val,
                    torch.FloatTensor(price_batch).to(device),
                    torch.FloatTensor(gt_batch).to(device),
                    torch.FloatTensor(mask_batch).to(device),
                    self.parameters['alpha'], self.batch_size)
                cur_rr = cur_rr.detach().cpu().numpy().reshape((1026, 1))
                val_loss += cur_loss.detach().cpu().item()
                val_reg_loss += cur_reg_loss.detach().cpu().item()
                val_rank_loss += cur_rank_loss.detach().cpu().item()
                col = cur_offset - (self.valid_index - self.parameters['seq'] - self.steps + 1)
                cur_valid_pred[:, col] = copy.copy(cur_rr[:, 0])
                cur_valid_gt[:, col] = copy.copy(gt_batch[:, 0])
                cur_valid_mask[:, col] = copy.copy(mask_batch[:, 0])
            print('Valid MSE:',
                  val_loss / (self.test_index - self.valid_index),
                  val_reg_loss / (self.test_index - self.valid_index),
                  val_rank_loss / (self.test_index - self.valid_index))
            cur_valid_perf = evaluate(cur_valid_pred, cur_valid_gt, cur_valid_mask)
            print('\t Valid performance:', cur_valid_perf)

            # Evaluate on the test set.
            cur_test_pred = np.zeros([len(self.tickers), self.trade_dates - self.test_index], dtype=float)
            cur_test_gt = np.zeros([len(self.tickers), self.trade_dates - self.test_index], dtype=float)
            cur_test_mask = np.zeros([len(self.tickers), self.trade_dates - self.test_index], dtype=float)
            test_loss = 0.0
            test_reg_loss = 0.0
            test_rank_loss = 0.0
            model.eval()
            for cur_offset in range(
                    self.test_index - self.parameters['seq'] - self.steps + 1,
                    self.trade_dates - self.parameters['seq'] - self.steps + 1):
                emb_batch, mask_batch, price_batch, gt_batch = self.get_batch(cur_offset)
                output_test = model(torch.FloatTensor(emb_batch).to(device), hyp_input)
                cur_loss, cur_reg_loss, cur_rank_loss, cur_rr = trr_loss_mse_rank(
                    output_test,
                    torch.FloatTensor(price_batch).to(device),
                    torch.FloatTensor(gt_batch).to(device),
                    torch.FloatTensor(mask_batch).to(device),
                    self.parameters['alpha'], self.batch_size)
                cur_rr = cur_rr.detach().cpu().numpy().reshape((1026, 1))
                test_loss += cur_loss.detach().cpu().item()
                test_reg_loss += cur_reg_loss.detach().cpu().item()
                test_rank_loss += cur_rank_loss.detach().cpu().item()
                col = cur_offset - (self.test_index - self.parameters['seq'] - self.steps + 1)
                cur_test_pred[:, col] = copy.copy(cur_rr[:, 0])
                cur_test_gt[:, col] = copy.copy(gt_batch[:, 0])
                cur_test_mask[:, col] = copy.copy(mask_batch[:, 0])
            print('Test MSE:',
                  test_loss / (self.trade_dates - self.test_index),
                  test_reg_loss / (self.trade_dates - self.test_index),
                  test_rank_loss / (self.trade_dates - self.test_index))
            cur_test_perf = evaluate(cur_test_pred, cur_test_gt, cur_test_mask)
            print('\t Test performance:', cur_test_perf)
def perturb_edges(data, name, remove_pct, add_pct, hidden_channels=16, epochs=400):
    if remove_pct == 0 and add_pct == 0:
        return
    try:
        cached = pickle.load(open(f'{ROOT}/cache/edge/{name}_{remove_pct}_{add_pct}.pt', 'rb'))
        print(f'Use cached edge augmentation for dataset {name}')
        if data.setting == 'inductive':
            data.train_edge_index = cached
        else:
            data.edge_index = cached
        return
    except FileNotFoundError:
        try:
            A_pred, adj_orig = pickle.load(open(f'{ROOT}/cache/edge/{name}.pt', 'rb'))
            A = sample_graph_det(adj_orig, A_pred, remove_pct, add_pct)
            data.edge_index, _ = from_scipy_sparse_matrix(A)
            pickle.dump(data.edge_index,
                        open(f'{ROOT}/cache/edge/{name}_{remove_pct}_{add_pct}.pt', 'wb'))
            return
        except FileNotFoundError:
            print(f'cache/edge/{name}_{remove_pct}_{add_pct}.pt not found! Regenerating it now')

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if data.setting == 'inductive':
        train_data = Data(x=data.train_x, ori_x=data.ori_x,
                          edge_index=data.train_edge_index, y=data.train_y)
    else:
        train_data = deepcopy(data)

    edge_index = deepcopy(train_data.edge_index)
    train_data = train_test_split_edges(train_data, val_ratio=0.1, test_ratio=0)
    num_features = train_data.ori_x.shape[1]
    model = GAE(GCNEncoder(num_features, hidden_channels))
    model = model.to(device)
    x = train_data.ori_x.to(device)
    train_pos_edge_index = train_data.train_pos_edge_index.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    best_val_auc = 0
    best_z = None
    for epoch in range(1, epochs + 1):
        model.train()
        optimizer.zero_grad()
        z = model.encode(x, train_pos_edge_index)
        loss = model.recon_loss(z, train_pos_edge_index)
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            z = model.encode(x, train_pos_edge_index)
        auc, ap = model.test(z, train_data.val_pos_edge_index, train_data.val_neg_edge_index)
        print('Val | Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, auc, ap))
        if auc > best_val_auc:
            best_val_auc = auc
            best_z = deepcopy(z)

    # Score candidate edges with the embedding from the best validation epoch.
    A_pred = torch.sigmoid(torch.mm(best_z, best_z.T)).cpu().numpy()
    adj_orig = to_scipy_sparse_matrix(edge_index).asformat('csr')
    adj_pred = sample_graph_det(adj_orig, A_pred, remove_pct, add_pct)

    if data.setting == 'inductive':
        data.train_edge_index, _ = from_scipy_sparse_matrix(adj_pred)
    else:
        data.edge_index, _ = from_scipy_sparse_matrix(adj_pred)

    pickle.dump((A_pred, adj_orig), open(f'{ROOT}/cache/edge/{name}.pt', 'wb'))
    if data.setting == 'inductive':
        pickle.dump(data.train_edge_index,
                    open(f'{ROOT}/cache/edge/{name}_{remove_pct}_{add_pct}.pt', 'wb'))
    else:
        pickle.dump(data.edge_index,
                    open(f'{ROOT}/cache/edge/{name}_{remove_pct}_{add_pct}.pt', 'wb'))
def process(self):
    raw_dir = '/'.join([self.root, 'raw'])
    fs_subject_dir = osp.join(raw_dir, self.raw_file_names[0], 'freesurfer/5.1')
    # Copy .annot files to the FreeSurfer subject dir.
    os.system('cp {} {}'.format(osp.join(raw_dir, self.raw_file_names[2]), fs_subject_dir))
    os.system('cp {} {}'.format(osp.join(raw_dir, self.raw_file_names[3]), fs_subject_dir))
    # Process FreeSurfer outputs.
    shell_script_path = osp.join(os.getcwd(), 'fs_preproc.sh')
    process_fs_output(fs_subject_dir, shell_script_path)
    if self.atlas == 'HCPMMP1':
        fs_subject_dir = osp.join(fs_subject_dir, 'all_output')  # `all_output` hardcoded

    # Phenotypic data for labels.
    s3_pheno_path = '/'.join([self.root, 'raw', self.raw_file_names[1]])
    pheno_df = pd.read_csv(s3_pheno_path)

    # Load the atlas for fMRI.
    if self.atlas == 'HCPMMP1':
        atlas_nii_file = '/'.join([self.root, 'raw', self.raw_file_names[4]])
        atlas_img = image.load_img(atlas_nii_file)
        # Split left and right hemispheres.
        atlas_img.get_data()[atlas_img.get_data()[:int(atlas_img.shape[0] / 2 + 1), :, :].nonzero()] += \
            atlas_img.get_data().max()
        num_nodes = 360
    elif self.atlas == 'destrieux':
        from nilearn.datasets import fetch_atlas_destrieux_2009
        atlas_nii_file = fetch_atlas_destrieux_2009().maps
        atlas_img = image.load_img(atlas_nii_file)
        num_nodes = 148

    # Read FreeSurfer output and build the subject list.
    anatomical_features_dict = read_fs_stats(fs_subject_dir, self.atlas)
    subject_ids = list(anatomical_features_dict.keys())
    if self.site == 'ALL':
        import urllib
        all_subject_ids_path = 'https://raw.githubusercontent.com/parisots/population-gcn/master/subject_IDs.txt'
        response = urllib.request.urlopen(all_subject_ids_path)
        all_subject_ids = [s.decode() for s in response.read().splitlines()]
        subject_ids = [s for s in subject_ids if s[-5:] in all_subject_ids]
        assert len(subject_ids) == 871

    # Process the data.
    data_list = []
    failed_subject_list = []
    for subject in tqdm(subject_ids, desc='subject_list'):
        try:
            y, sex, iq, site_id, subject_id = label_from_pheno(pheno_df, subject)
            # Read anatomical features from the dict.
            lh_df, rh_df = anatomical_features_dict[subject]
            node_features = torch.from_numpy(
                np.concatenate([
                    lh_df[self.anatomical_feature_names].values,
                    rh_df[self.anatomical_feature_names].values
                ])).float()
            if node_features.shape[0] != num_nodes:
                # Skip subjects with missing nodes (relevant for 'destrieux').
                continue
            # Path to preprocessed functional MRI.
            fmri_nii_file = '/'.join([
                self.root, 'raw', 'Outputs', self.pipeline, self.strategy,
                self.derivative, "{}_func_preproc.nii.gz".format(subject)
            ])
            # nilearn masker and correlation.
            masker = NiftiLabelsMasker(labels_img=atlas_img, standardize=True,
                                       memory='nilearn_cache', verbose=5)
            correlation_measure = ConnectivityMeasure(kind='correlation')
            time_series = masker.fit_transform(fmri_nii_file)
            # Handle broken files in ABIDE preprocessed filt_noglobal.
            if subject == 'UM_1_0050302':
                time_series = nilearn.signal.clean(time_series.transpose(),
                                                   low_pass=0.1, high_pass=0.01,
                                                   t_r=2).transpose()
            if subject == 'Leuven_2_0050730':
                time_series = nilearn.signal.clean(time_series.transpose(),
                                                   low_pass=0.1, high_pass=0.01,
                                                   t_r=1.6667).transpose()
            # Optional data augmentation.
            if self.resample_ts:
                time_series_list = resample_temporal(time_series)
            elif self.dfc_resample:
                time_series_list = window_slpit_ts(time_series, window=30, step=5)
            else:
                time_series_list = [time_series]
            # Correlation matrices from the time series.
            connectivity_matrix_list = correlation_measure.fit_transform(time_series_list)
            for adj, time_series in zip(connectivity_matrix_list, time_series_list):
                time_series, raw_adj = torch.tensor(time_series), torch.tensor(adj)
                adj_statistics = get_adj_statistics(adj)
                padded_time_series = torch.zeros(30, 360)
                padded_time_series[:time_series.shape[0]] = time_series
                padded_time_series = padded_time_series.t()
                # Transform adj.
                # np.fill_diagonal(adj, 0)  # remove self-loops before transform
                adj = self.transform_edge(adj) if self.transform_edge is not None else adj
                # Threshold adj.
                if self.threshold is not None:
                    adj = top_k_percent_adj(adj, self.threshold)
                    assert check_strongly_connected(adj)
                # Create the torch_geometric Data object.
                edge_index, edge_weight = from_scipy_sparse_matrix(coo_matrix(adj))
                data = Data(x=node_features, edge_index=edge_index,
                            edge_attr=edge_weight, y=y)
                # Additional node features.
                data.adj_statistics = adj_statistics
                data.time_series, data.raw_adj = padded_time_series, raw_adj
                # Phenotypic data.
                data.sex, data.iq, data.site_id, data.subject_id = sex, iq, site_id, subject_id
                data.num_nodes = data.x.shape[0]
                data_list.append(data)
        except Exception as e:
            print(e)
            logging.warning("failed at subject {}".format(subject))
            failed_subject_list.append(subject)
    print("failed_subject_list", failed_subject_list)

    if self.site == 'ALL':
        data_list = repermute(data_list, all_subject_ids)
    self.data, self.slices = self.collate(data_list)
    torch.save((self.data, self.slices), self.processed_paths[0])