def one_hot_process(self):
    input_path = self.raw_paths[0]
    input_df = pd.read_csv(input_path, sep=',', dtype='str')
    smile_list = list(input_df[self.smile_col])
    if self.available_prop:
        prop_list = list(input_df[self.prop_name])
    self.all_smiles = smile_list
    data_list = []
    for i in range(len(smile_list)):
        smile = smile_list[i]
        mol = Chem.MolFromSmiles(smile)
        if mol is None:  # MolFromSmiles returns None on unparsable SMILES
            continue
        Chem.Kekulize(mol)
        num_atom = mol.GetNumAtoms()
        if num_atom > self.num_max_node:
            continue

        # Atoms: one-hot over self.atom_list, one column per node slot.
        atom_array = np.zeros((len(self.atom_list), self.num_max_node), dtype=np.int32)
        if self.one_shot:
            virtual_node = np.ones((1, self.num_max_node), dtype=np.int32)
        for atom_idx, atom in enumerate(mol.GetAtoms()):
            atom_feature = atom.GetAtomicNum()
            atom_array[self.atom_list.index(atom_feature), atom_idx] = 1
            if self.one_shot:
                virtual_node[0, atom_idx] = 0
        if self.one_shot:
            x = torch.tensor(np.concatenate((atom_array, virtual_node), axis=0))
        else:
            x = torch.tensor(atom_array)

        # Bonds: one channel per bond type; the last channel marks "no bond".
        adj_array = np.zeros([4, self.num_max_node, self.num_max_node], dtype=np.float32)
        for bond in mol.GetBonds():
            ch = bond_type_to_int[bond.GetBondType()]
            # u/v rather than i/j: the outer loop's i indexes prop_list
            # below and must not be clobbered here.
            u = bond.GetBeginAtomIdx()
            v = bond.GetEndAtomIdx()
            adj_array[ch, u, v] = 1.0
            adj_array[ch, v, u] = 1.0
        adj_array[-1, :, :] = 1 - np.sum(adj_array, axis=0)

        data = Data(x=x)
        data.adj = torch.tensor(adj_array)
        data.num_atom = num_atom
        if self.available_prop:
            data.y = torch.tensor([float(prop_list[i])])
        data_list.append(data)
    data, slices = self.collate(data_list)
    return data, slices
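# --- Hedged sketch: the same one-hot encoding, standalone, on a single
# molecule. atom_list, num_max_node, and bond_type_to_int below are
# assumptions standing in for the class attributes / module-level mapping
# that one_hot_process relies on; the real values may differ.
import numpy as np
from rdkit import Chem

atom_list = [6, 7, 8, 9]     # hypothetical: C, N, O, F atomic numbers
num_max_node = 9             # hypothetical cap on atoms per molecule
bond_type_to_int = {Chem.BondType.SINGLE: 0,
                    Chem.BondType.DOUBLE: 1,
                    Chem.BondType.TRIPLE: 2}

mol = Chem.MolFromSmiles('CC(=O)N')  # acetamide: 4 heavy atoms, 3 bonds
Chem.Kekulize(mol)

# One-hot atom matrix: rows = atom types, columns = node slots.
atom_array = np.zeros((len(atom_list), num_max_node), dtype=np.int32)
for idx, atom in enumerate(mol.GetAtoms()):
    atom_array[atom_list.index(atom.GetAtomicNum()), idx] = 1

# Bond tensor: one channel per bond type plus a trailing "no bond" channel.
adj_array = np.zeros((4, num_max_node, num_max_node), dtype=np.float32)
for bond in mol.GetBonds():
    ch = bond_type_to_int[bond.GetBondType()]
    u, v = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
    adj_array[ch, u, v] = adj_array[ch, v, u] = 1.0
adj_array[-1] = 1 - adj_array[:3].sum(axis=0)

print(atom_array.shape, adj_array.shape)  # (4, 9) (4, 9, 9)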
def get(self, idx):
    if idx in self.cache:
        return self.cache[idx]
    d = self.input_ds[idx]
    adj = d['adj']
    vect_feat = d['vect_feat']
    input_mask = d['input_mask']
    pred_mask = d['pred_mask']
    vals = d['vals']
    coords = d['coords']

    # Collapse the bond-type channels: an edge exists wherever any channel is set.
    have_edge, _ = adj.max(dim=0)
    edges = np.argwhere(have_edge.numpy())

    # Per-edge feature vector (the channel slice) and integer bond type.
    edge_attr = [adj[:, i, j] for i, j in edges]
    edge_type = torch.Tensor(
        [np.argwhere(adj[:, i, j])[0] for i, j in edges]).long()
    edge_attr = torch.stack(edge_attr)
    edge_index_tensor = torch.Tensor(edges.T).long()

    y = torch.Tensor(vals)

    # FIXME maybe we need to select non-padded nodes here?
    data = GeomData(x=torch.Tensor(vect_feat),
                    edge_index=edge_index_tensor,
                    edge_type=edge_type,
                    input_idx=idx,
                    pos=coords,
                    edge_attr=edge_attr,
                    y=y)
    data.pred_mask = torch.Tensor(pred_mask)
    data.input_mask = torch.Tensor(input_mask)
    data.adj = torch.Tensor(adj)

    self.cache[idx] = data
    return data
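# --- Hedged sketch: the dense-to-sparse conversion that get() performs,
# shown on a toy 3-node graph with 2 bond-type channels. Shapes and values
# are illustrative assumptions, not the dataset's real dimensions.
import numpy as np
import torch

adj = torch.zeros(2, 3, 3)
adj[0, 0, 1] = adj[0, 1, 0] = 1.0  # type-0 edge between nodes 0 and 1
adj[1, 1, 2] = adj[1, 2, 1] = 1.0  # type-1 edge between nodes 1 and 2

have_edge, _ = adj.max(dim=0)           # (3, 3) mask: any channel set?
edges = np.argwhere(have_edge.numpy())  # directed pairs, both directions
edge_attr = torch.stack([adj[:, i, j] for i, j in edges])
edge_type = torch.tensor(
    [np.argwhere(adj[:, i, j].numpy())[0, 0] for i, j in edges]).long()
edge_index = torch.tensor(edges.T).long()  # COO layout, shape (2, E)

print(edge_index)  # tensor([[0, 1, 1, 2],
                   #         [1, 0, 2, 1]])
print(edge_type)   # tensor([0, 0, 1, 1])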
def create_and_save_data(scale, fs_subjects_dir_path, atlas_sheet_path,
                         fsl_subjects_dir_path, atlas_dir_path,
                         correlation_measure, r, tmp_dir_path, subject):
    """
    Create torch_geometric :obj:`Data` for one subject with re-sampling,
    then save the list of :obj:`Data` to tmp_dir_path via pickle.

    :param scale: atlas scale (parcellation resolution)
    :param fs_subjects_dir_path: FreeSurfer subjects directory
    :param atlas_sheet_path: path to the atlas sheet
    :param fsl_subjects_dir_path: FSL subjects directory
    :param atlas_dir_path: atlas directory
    :param correlation_measure: connectivity measure exposing fit_transform
    :param r: N choose r, for re-sampling
    :param tmp_dir_path: directory for the per-subject pickle files
    :param subject: subject ID; IDs starting with '2' are labeled class 0
    :return:
    """
    data_list = []
    node_attr_array = to_node_attr_array(subject, fs_subjects_dir_path,
                                         atlas_sheet_path, scale)
    time_series = to_time_series(subject, fsl_subjects_dir_path,
                                 atlas_dir_path, scale)

    # TODO: redundant data augmentation? A previous version re-sampled the
    # time series (resample_mm_N_choose_n) and built one Data per combination
    # of the RESTBLOCK/IPBLOCK/UNKNOWN channels; the current version builds
    # a single graph from the whole time series instead.
    corr = correlation_measure.fit_transform([time_series])[0]

    # Fully-connected graph over the parcellated regions; correlations live
    # in data.adj, so edge_attr is a placeholder to be filled in later.
    G = nx.from_numpy_array(np.ones_like(corr))
    A = nx.to_scipy_sparse_matrix(G)
    adj = A.tocoo()
    edge_index = np.stack([adj.row, adj.col])
    edge_attr = np.ones((len(adj.row), 1))

    data = Data(
        x=torch.tensor(node_attr_array, dtype=torch.float),
        edge_index=torch.tensor(edge_index, dtype=torch.long),
        edge_attr=torch.tensor(edge_attr, dtype=torch.float),
        y=torch.tensor([0]) if subject.startswith('2') else torch.tensor([1]))
    data.adj = torch.tensor(corr, dtype=torch.float)
    data.ts = torch.tensor(time_series)
    data_list.append(data)

    subject_tmp_file_path = osp.join(tmp_dir_path, '{}.pickle'.format(subject))
    print("Saving {} samples on subject {}".format(len(data_list), subject))
    with open(subject_tmp_file_path, 'wb') as pfile:
        pickle.dump(data_list, pfile, protocol=pickle.HIGHEST_PROTOCOL)
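# --- Hedged sketch: reading the per-subject pickles back and merging them
# into one dataset-ready list. load_all_subjects and the example path are
# hypothetical helpers, not part of the original pipeline.
import glob
import os.path as osp
import pickle

def load_all_subjects(tmp_dir_path):
    data_list = []
    # Each file holds the list of Data objects create_and_save_data wrote.
    for pickle_path in sorted(glob.glob(osp.join(tmp_dir_path, '*.pickle'))):
        with open(pickle_path, 'rb') as pfile:
            data_list.extend(pickle.load(pfile))
    return data_list

# e.g. inside an InMemoryDataset.process():
#     data, slices = self.collate(load_all_subjects(tmp_dir_path))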