def gen_features(self, raw_data):
    """Convert a smiles string into graph data.

    Returns None when rdkit fails to parse the smiles.
    """
    smiles = raw_data['smiles']
    mol = AllChem.MolFromSmiles(smiles)
    if mol is None:
        return None
    data = mol_to_graph_data(mol)
    data['smiles'] = smiles
    return data
def gen_features(self, raw_data):
    """Convert a smiles string and its label into graph data.

    Returns None when rdkit fails to parse the smiles.
    """
    smiles, label = raw_data['smiles'], raw_data['label']
    mol = AllChem.MolFromSmiles(smiles)
    if mol is None:
        return None
    data = mol_to_graph_data(mol)
    data['label'] = label.reshape([-1])
    data['smiles'] = smiles
    return data
def gen_features(self, raw_data):
    """Convert smiles to graph data.

    Returns:
        data(dict): a dict of numpy ndarrays consisting of graph features.
    """
    smiles = raw_data['smiles']
    mol = AllChem.MolFromSmiles(smiles)
    if mol is None:
        return None
    data = mol_to_graph_data(mol)
    data['smiles'] = smiles
    return data
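# Minimal usage sketch for gen_features. The featurizer class name below is
# hypothetical; only the gen_features behavior (dict output, None for an
# invalid smiles) comes from the code above, so callers should filter out
# None results.
raw_data_list = [
    {'smiles': 'CCOc1ccc2nc(S(N)(=O)=O)sc2c1'},
    {'smiles': 'not_a_valid_smiles'},  # MolFromSmiles returns None here
]
featurizer = MoleculeFeaturizer()  # hypothetical featurizer exposing gen_features
data_list = [
    d for d in (featurizer.gen_features(r) for r in raw_data_list)
    if d is not None
]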
def test_gen_features(self):
    """Smoke test: a valid smiles should yield non-empty graph data."""
    raw_data_list = [
        {'smiles': 'CCOc1ccc2nc(S(N)(=O)=O)sc2c1'},
        {'smiles': 'CCCCCCCCCCOCC(O)CN'},
    ]
    smiles = raw_data_list[0]['smiles']
    mol = AllChem.MolFromSmiles(smiles)
    data = mol_to_graph_data(mol)
    self.assertTrue(data)
def gen_features(self, raw_data):
    """Convert smiles to graph data and apply the context-prediction transform.

    Returns None when the smiles is invalid or the transform fails.
    """
    smiles = raw_data['smiles']
    mol = AllChem.MolFromSmiles(smiles)
    if mol is None:
        return None
    data = mol_to_graph_data(mol)
    transformed = transform_contextpred(data, self.k, self.l1, self.l2)
    if transformed is None:
        return None
    new_data = {}
    new_data['transformed'] = transformed
    new_data['smiles'] = smiles
    return new_data
def __call__(self, raw_data):
    """
    Gen features according to raw data and return a single graph data.

    Args:
        raw_data: contains the smiles; the smiles is converted to a mol
            by rdkit, then the mol is converted to graph data.

    Returns:
        data: a dict of graph features, or None if the smiles is invalid.
    """
    smiles = raw_data['smiles']
    mol = AllChem.MolFromSmiles(smiles)
    if mol is None:
        return None
    data = mol_to_graph_data(mol)
    return data
def load_mutag_dataset(data_path):
    """Load mutag dataset, process the raw dataset to graph data."""
    smiles_path = os.path.join(data_path, 'mutag_188_data.can')
    labels_path = os.path.join(data_path, 'mutag_188_target.txt')
    smiles_list = pd.read_csv(smiles_path, sep=' ', header=None)[0]
    labels = pd.read_csv(labels_path, header=None)[0].replace(-1, 0).values

    data_list, data_smiles_list = [], []
    for i in range(len(smiles_list)):
        s = smiles_list[i]
        rdkit_mol = AllChem.MolFromSmiles(s)
        if rdkit_mol is not None:  # ignore invalid mol objects
            data = mol_to_graph_data(rdkit_mol)
            data['label'] = labels[i].reshape([-1])
            data_list.append(data)
            data_smiles_list.append(smiles_list[i])
    return data_list, data_smiles_list
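# Minimal usage sketch for load_mutag_dataset. The data path is an assumption;
# the directory must contain 'mutag_188_data.can' and 'mutag_188_target.txt'.
data_list, data_smiles_list = load_mutag_dataset('data/mutag/raw')
print('valid molecules:', len(data_list))
print('first label:', data_list[0]['label'])  # labels are mapped from -1/1 to 0/1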
def gen_features(self, raw_data):
    """
    Gen features according to raw data and return a single graph data.

    Args:
        raw_data: contains the smiles and, unless in inference mode, a label;
            the smiles is converted to a mol by rdkit, then the mol is
            converted to graph data.

    Returns:
        data: a dict of graph features with the smiles and, when available,
            the reshaped label.
    """
    smiles = raw_data['smiles']
    mol = AllChem.MolFromSmiles(smiles)
    if mol is None:
        return None
    data = mol_to_graph_data(mol)
    if not self.is_inference:
        label = raw_data['label']
        data['label'] = label.reshape([-1])
    data['smiles'] = smiles
    return data
def preprocess_dataset(name):
    """
    Preprocess raw datasets.

    Args:
        name (str): name of the dataset.
    """
    data_dir = os.path.join('data', name, 'raw')
    if not os.path.exists(data_dir):
        print('Ignore %s dataset. Cannot find the corresponding folder: %s.' % (name, data_dir))
        return

    can, txt = Datasets[name]
    smiles_path = os.path.join(data_dir, can)
    labels_path = os.path.join(data_dir, txt)
    smiles_list = pd.read_csv(smiles_path, sep=' ', header=None)[0]
    labels = pd.read_csv(labels_path, header=None)[0].replace(-1, 0).values

    data_list, data_smiles_list = [], []
    for i in range(len(smiles_list)):
        s = smiles_list[i]
        mol = AllChem.MolFromSmiles(s)
        if mol is not None:
            data = mol_to_graph_data(mol)
            data['label'] = labels[i].reshape([-1])
            data_list.append(data)
            data_smiles_list.append(smiles_list[i])

    processed_dir = os.path.join('data', name, 'processed')
    if not os.path.exists(processed_dir):
        os.makedirs(processed_dir)
    with open(os.path.join(processed_dir, 'smiles.txt'), 'w') as f:
        for smiles in smiles_list:
            f.write('%s\n' % smiles)
    save_data_list_to_npz(data_list, os.path.join(processed_dir, 'data.npz'))
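# Example of the Datasets lookup that preprocess_dataset relies on. The filenames
# match load_mutag_dataset and load_ptc_mr_dataset; treating this as the complete
# mapping is an assumption.
Datasets = {
    'mutag': ('mutag_188_data.can', 'mutag_188_target.txt'),
    'ptc_mr': ('ptc_MR_data.can', 'ptc_MR_target.txt'),
}
preprocess_dataset('mutag')  # writes data/mutag/processed/smiles.txt and data.npz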
def load_ptc_mr_dataset(data_path):
    """Load PTC-MR dataset."""
    smiles_path = os.path.join(data_path, 'ptc_MR_data.can')
    labels_path = os.path.join(data_path, 'ptc_MR_target.txt')
    if exists(smiles_path) and exists(labels_path):
        # manually downloaded, separate SMILES and label files
        smiles_list = pd.read_csv(smiles_path, sep=' ', header=None)[0]
        labels = pd.read_csv(labels_path, header=None)[0].replace(-1, 0).values

        data_list, data_smiles_list = [], []
        for i in range(len(smiles_list)):
            s = smiles_list[i]
            rdkit_mol = AllChem.MolFromSmiles(s)
            if rdkit_mol is not None:  # ignore invalid mol objects
                data = mol_to_graph_data(rdkit_mol)
                data['label'] = labels[i].reshape([-1])
                data_list.append(data)
                data_smiles_list.append(smiles_list[i])
        return data_list, data_smiles_list
def main():
    """Entry for data preprocessing."""
    tokenizer = ProteinTokenizer()
    for dataset in ['davis', 'kiba']:
        data_dir = os.path.join(args.dataset_root, dataset)
        if not os.path.exists(data_dir):
            print('Cannot find {}'.format(data_dir))
            continue

        train_fold = json.load(
            open(os.path.join(data_dir, 'folds', 'train_fold_setting1.txt')))
        train_fold = [ee for e in train_fold for ee in e]  # flatten
        test_fold = json.load(
            open(os.path.join(data_dir, 'folds', 'test_fold_setting1.txt')))
        ligands = json.load(open(os.path.join(data_dir, 'ligands_can.txt')),
                            object_pairs_hook=OrderedDict)
        proteins = json.load(open(os.path.join(data_dir, 'proteins.txt')),
                             object_pairs_hook=OrderedDict)
        # Use encoding 'latin1' to load py2 pkl from py3
        # pylint: disable=E1123
        affinity = pickle.load(open(os.path.join(data_dir, 'Y'), 'rb'),
                               encoding='latin1')

        smiles_lst, protein_lst = [], []
        for k in ligands.keys():
            smiles = Chem.MolToSmiles(Chem.MolFromSmiles(ligands[k]),
                                      isomericSmiles=True)
            smiles_lst.append(smiles)
        for k in proteins.keys():
            protein_lst.append(proteins[k])

        if dataset == 'davis':
            # Kd data
            affinity = [-np.log10(y / 1e9) for y in affinity]
        affinity = np.asarray(affinity)

        # pylint: disable=E1123
        os.makedirs(os.path.join(data_dir, 'processed'), exist_ok=True)
        for split in ['train', 'test']:
            print('processing {} set of {}'.format(split, dataset))
            split_dir = os.path.join(data_dir, 'processed', split)
            # pylint: disable=E1123
            os.makedirs(split_dir, exist_ok=True)

            fold = train_fold if split == 'train' else test_fold
            rows, cols = np.where(np.isnan(affinity) == False)
            rows, cols = rows[fold], cols[fold]

            data_lst = []
            for idx in range(len(rows)):
                mol = AllChem.MolFromSmiles(smiles_lst[rows[idx]])
                mol_graph = mol_to_graph_data(mol)
                data = {k: v for k, v in mol_graph.items()}

                seqs = []
                for seq in protein_lst[cols[idx]].split('\x01'):
                    seqs.extend(tokenizer.gen_token_ids(seq))
                data['protein_token_ids'] = np.array(seqs)

                af = affinity[rows[idx], cols[idx]]
                if dataset == 'davis':
                    data['Log10_Kd'] = np.array([af])
                elif dataset == 'kiba':
                    data['KIBA'] = np.array([af])
                data_lst.append(data)

            random.shuffle(data_lst)
            npz = os.path.join(split_dir, '{}_{}.npz'.format(dataset, split))
            save_data_list_to_npz(data_lst, npz)

        print('==============================')
        print('dataset:', dataset)
        print('train_fold:', len(train_fold))
        print('test_fold:', len(test_fold))
        print('unique drugs:', len(set(smiles_lst)))
        print('unique proteins:', len(set(protein_lst)))
def test_mol_to_graph_data(self, add_self_loop=True):
    smiles = 'CCOc1ccc2nc(S(N)(=O)=O)sc2c1'
    mol = AllChem.MolFromSmiles(smiles)
    data = mol_to_graph_data(mol)
    self.assertTrue(data)
def load_davis_dataset(data_path, featurizer):
    """Load the DAVIS dataset and return train/test InMemoryDataset objects.

    Note: the featurizer argument is currently unused; graphs are built with
    mol_to_graph_data directly.
    """
    tokenizer = ProteinTokenizer()
    for dataset in ['davis']:
        data_dir = os.path.join(data_path, dataset)
        if not os.path.exists(data_dir):
            print('Cannot find {}'.format(data_dir))
            continue

        train_fold = json.load(
            open(os.path.join(data_dir, 'folds', 'train_fold_setting1.txt')))
        train_fold = [ee for e in train_fold for ee in e]  # flatten
        test_fold = json.load(
            open(os.path.join(data_dir, 'folds', 'test_fold_setting1.txt')))
        ligands = json.load(open(os.path.join(data_dir, 'ligands_can.txt')),
                            object_pairs_hook=OrderedDict)
        proteins = json.load(open(os.path.join(data_dir, 'proteins.txt')),
                             object_pairs_hook=OrderedDict)
        # Use encoding 'latin1' to load py2 pkl from py3
        # pylint: disable=E1123
        affinity = pickle.load(open(os.path.join(data_dir, 'Y'), 'rb'),
                               encoding='latin1')

        smiles_lst, protein_lst = [], []
        for k in ligands.keys():
            smiles = Chem.MolToSmiles(Chem.MolFromSmiles(ligands[k]),
                                      isomericSmiles=True)
            smiles_lst.append(smiles)
        for k in proteins.keys():
            protein_lst.append(proteins[k])

        if dataset == 'davis':
            # Kd data
            affinity = [-np.log10(y / 1e9) for y in affinity]
        affinity = np.asarray(affinity)

        # pylint: disable=E1123
        os.makedirs(os.path.join(data_dir, 'processed'), exist_ok=True)
        train_test_dataset = []
        for split in ['train', 'test']:
            print('processing {} set of {}'.format(split, dataset))
            split_dir = os.path.join(data_dir, 'processed', split)
            # pylint: disable=E1123
            os.makedirs(split_dir, exist_ok=True)

            fold = train_fold if split == 'train' else test_fold
            rows, cols = np.where(np.isnan(affinity) == False)
            rows, cols = rows[fold], cols[fold]

            data_lst = []
            for idx in range(len(rows)):
                mol_graph = mol_to_graph_data(
                    Chem.MolFromSmiles(smiles_lst[rows[idx]]))
                data = {k: v for k, v in mol_graph.items()}

                seqs = []
                for seq in protein_lst[cols[idx]].split('\x01'):
                    seqs.extend(tokenizer.gen_token_ids(seq))
                data['protein_token_ids'] = np.array(seqs)

                af = affinity[rows[idx], cols[idx]]
                if dataset == 'davis':
                    data['Log10_Kd'] = np.array([af])
                elif dataset == 'kiba':
                    data['KIBA'] = np.array([af])
                data_lst.append(data)

            random.shuffle(data_lst)
            # Return the whole split as an in-memory dataset instead of
            # sharding it into npz files.
            train_test_dataset.append(InMemoryDataset(data_lst))

        print('==============================')
        print('dataset:', dataset)
        print('train_fold:', len(train_fold))
        print('test_fold:', len(test_fold))
        print('unique drugs:', len(set(smiles_lst)))
        print('unique proteins:', len(set(protein_lst)))
        return train_test_dataset
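# Minimal usage sketch for load_davis_dataset. Assumptions: the 'data' root below is
# hypothetical and must contain a davis/ folder with folds/, ligands_can.txt,
# proteins.txt and the pickled Y affinity matrix; featurizer is passed as None
# because the loader does not use it.
train_dataset, test_dataset = load_davis_dataset('data', featurizer=None)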