def __init__(self, name: str, split="train", path="./data", mode="cnn_cnn", y_log=True, drug_transform=None, protein_transform=None):
    """Load a TDC DTI dataset and keep a single split of it.

    Args:
        name (str): TDC dataset name.
        split (str): which split to keep ("train", "valid" or "test").
        path (str): dataset download/local load path.
        mode (str): "<drug>_<protein>" encoding mode, lower-cased on store.
        y_log (bool): convert affinity values to log space before splitting.
        drug_transform: optional transform applied to drugs at access time.
        protein_transform: optional transform applied to proteins at access time.
    """
    # Normalise the encoding mode once, so later comparisons are case-insensitive.
    self.mode = mode.lower()
    self.data = DTI(name=name, path=path)
    if y_log:
        # Log-space labels are the usual target for binding-affinity regression.
        self.data.convert_to_log()
    # Replace the full dataset object with just the requested split (a table).
    self.data = self.data.get_split()[split]
    self.drug_transform = drug_transform
    self.protein_transform = protein_transform
class BindingDBDataset(data.Dataset):
    """A custom dataset for loading and processing original TDC data, which is
    used as input data in the DeepDTA model.

    Args:
        name (str): TDC dataset name.
        split (str): Data split type (train, valid or test).
        path (str): dataset download/local load path (default: "./data")
        mode (str): encoding mode "<drug>_<protein>" (default: cnn_cnn)
        drug_transform: Transform operation applied to the drug (default: None)
        protein_transform: Transform operation applied to the protein (default: None)
        y_log (bool): Whether convert y values to log space. (default: True)
    """

    def __init__(
        self,
        name: str,
        split="train",
        path="./data",
        mode="cnn_cnn",
        y_log=True,
        drug_transform=None,
        protein_transform=None,
    ):
        self.data = DTI(name=name, path=path)
        # Lower-case once so mode comparisons in __getitem__ are case-insensitive.
        self.mode = mode.lower()
        if y_log:
            self.data.convert_to_log()
        # Keep only the requested split of the TDC data.
        self.data = self.data.get_split()[split]
        self.drug_transform = drug_transform
        self.protein_transform = protein_transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return one (drug, protein, label) sample, encoded per ``self.mode``."""
        drug = self.data["Drug"][idx]
        protein = self.data["Target"][idx]
        label = self.data["Y"][idx]
        mode_drug, mode_protein = self.mode.split("_")
        if mode_drug == "cnn":
            drug = torch.LongTensor(integer_label_smiles(drug))
        if mode_protein == "cnn":
            protein = torch.LongTensor(integer_label_protein(protein))
        label = torch.Tensor([label])
        # BUG FIX: previously the transform results were discarded
        # (`self.drug_transform(drug)` without assignment), so configured
        # transforms had no effect on the returned sample. Capture them.
        if self.drug_transform is not None:
            drug = self.drug_transform(drug)
        if self.protein_transform is not None:
            protein = self.protein_transform(protein)
        return drug, protein, label
def main():
    """Entry point: train and evaluate a model on a cluster-based BindingDB split."""
    print("Start..")
    args = arg_parse()

    # ---- set configs, logger and device ----
    cfg = get_cfg_defaults()
    cfg.merge_from_file(args.cfg)
    cfg.freeze()
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Fetch the BindingDB dataset named in the config file.
    raw_dataset = DTI(name=cfg.DATASET.NAME)

    # Apply clustering; cluster_type is one of k_means, agglomerative, dbscan.
    clustered_dataset = apply_clustering(
        raw_dataset,
        num_of_clusters=cfg.SOLVER.NUM_OF_CLUSTERS,
        cluster_type=cfg.MODEL.CLUSTER_TYPE,
    )

    # Split the data by cluster membership using the configured fractions.
    train_split, val_split, test_split = get_split_by_clusters(
        clustered_dataset, num_of_clusters=cfg.SOLVER.NUM_OF_CLUSTERS
    )

    train_loader = DataLoader(
        dataset=DTADataset(ds=train_split), shuffle=True, batch_size=cfg.SOLVER.TRAIN_BATCH_SIZE
    )
    val_loader = DataLoader(
        dataset=DTADataset(ds=val_split), shuffle=True, batch_size=cfg.SOLVER.TEST_BATCH_SIZE
    )
    test_loader = DataLoader(
        dataset=DTADataset(ds=test_split), shuffle=True, batch_size=cfg.SOLVER.TEST_BATCH_SIZE
    )

    # ---- set model ----
    model = get_model(cfg)

    # ---- training and evaluation ----
    gpus = 1 if device == "cuda" else 0
    # Keep the checkpoint with the lowest validation loss.
    checkpoint_callback = ModelCheckpoint(monitor='val_loss', mode="min")
    trainer = pl.Trainer(max_epochs=cfg.SOLVER.MAX_EPOCHS, gpus=gpus, callbacks=[checkpoint_callback])
    trainer.fit(model, train_dataloader=train_loader, val_dataloaders=val_loader)
    trainer.test(test_dataloaders=test_loader)
    print("Done!!!")
def test_to_graph(self):
    """Exercise DTI.to_graph for every supported output format."""
    from tdc.multi_pred import DTI
    davis = DTI(name='DAVIS')
    # edge_list output: {'edge_list': (X, 2) array, 'neg_edges': (X, 2) array,
    #                    'split': {'train': df, 'valid': df, 'test': df}}
    davis.to_graph(threshold=30, format='edge_list', split=True,
                   frac=[0.7, 0.1, 0.2], seed=42, order='descending')
    # dgl output: {'dgl_graph': DGL graph, 'index_to_entities': data-ID -> node-ID map,
    #              'split': {'train': df, 'valid': df, 'test': df}}
    davis.to_graph(threshold=30, format='dgl', split=True,
                   frac=[0.7, 0.1, 0.2], seed=42, order='descending')
    davis.to_graph(threshold=30, format='pyg', split=True,
                   frac=[0.7, 0.1, 0.2], seed=42, order='descending')
def test_multi_pred(self):
    """Smoke-test a drug-wise cold split of the DAVIS DTI dataset."""
    from tdc.multi_pred import DTI
    davis = DTI(name='DAVIS')
    # Cold split: every drug appears in exactly one of train/valid/test.
    split = davis.get_split(method='cold_split', column_name='Drug')
def test_DTI(self):
    """Smoke-test loading DAVIS and producing its default split."""
    from tdc.multi_pred import DTI
    davis = DTI(name='DAVIS')
    split = davis.get_split()
from tdc.multi_pred import DTI
import numpy as np
from rdkit.Chem import AllChem
from rdkit import Chem
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from matplotlib import pyplot as plt
import math

# Load the BindingDB Kd affinity dataset from TDC; alternative DTI datasets
# are kept here commented out for quick swapping.
data = DTI(name = 'BindingDB_Kd')
# data = DTI(name = 'DAVIS')
# data = DTI(name = 'KIBA')

def drugTarget2vec(data):
    """Attach Morgan-fingerprint vectors to each drug of a TDC DTI dataset.

    NOTE(review): this function appears truncated — it ends on a planning
    comment about the Target/TargetID mapping, with the corresponding code
    (and any return statement) not visible here.
    """
    # `data` is a TDC DTI object; get_data() presumably returns a pandas
    # DataFrame — TODO confirm against the tdc API.
    data = data.get_data()
    data_selected = data[['Drug_ID','Drug','Target_ID','Target','Y']]
    # Placeholder columns to be filled per-row below.
    # NOTE(review): `data_selected` is a slice of `data`; assigning into it via
    # chained indexing triggers pandas' SettingWithCopyWarning — verify the
    # writes actually land where intended.
    data_selected['Drug2vector'] = ''
    data_selected['TargetId'] = ''
    # use rdkit calculate ECFPs
    for ind, drug in enumerate(data_selected['Drug']):
        mol = Chem.MolFromSmiles(drug)
        # radius=2 Morgan fingerprint (ECFP4-like) as an explicit bit vector.
        Morgan_fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2)
        #Explicitbitvects, which record whether or not a bit exists, are usually faster than SparseBitVects, but take up more memory, similar to fixed-length bit strings.
        # Convert the bit string "0101..." to a numpy array of 0/1 ints.
        intmap = map(int, Morgan_fp.ToBitString())
        data_selected['Drug2vector'][ind] = np.array(list(intmap))
    # create a dict to record the map relation of Target and TargetID
# import modules import numpy as np from tdc.multi_pred import DTI from rdkit import Chem from rdkit.Chem import AllChem from rdkit import DataStructs # load in data data_Kd = DTI(name='BindingDB_Kd') data_Kd.convert_to_log(form='binding') split = data_Kd.get_split(method='random', seed=42, frac=[0.6, 0.05, 0.35]) train = split['train'] test = split['test'] print('Data loaded') train = train.dropna() ID_to_Drug = dict(enumerate(list(dict.fromkeys(train['Drug'])))) Drug_to_ID = dict((v, k) for k, v in ID_to_Drug.items()) print('Drug dictionaries completed') num_drugs = len(Drug_to_ID.keys()) drug_sim = np.zeros((num_drugs, num_drugs)) for i in range(num_drugs): if i % 1000 == 0: print('\n500 drug similarities calculated') drug1 = ID_to_Drug[i] m1 = Chem.MolFromSmiles(drug1) fp1 = AllChem.GetMorganFingerprint(m1, 2) for j in range(num_drugs):
def test_convert_to_log(self):
    """DTI.convert_to_log should run without error on DAVIS labels."""
    from tdc.multi_pred import DTI
    davis = DTI(name='DAVIS')
    davis.convert_to_log()
def test_binarize(self):
    """DTI.binarize should run without error with a descending-order threshold."""
    from tdc.multi_pred import DTI
    davis = DTI(name='DAVIS')
    davis.binarize(threshold=30, order='descending')
# evaluators from tdc import Evaluator evaluator = Evaluator(name='ROC-AUC') print(evaluator([0, 1], [0.5, 0.6])) # Processing Helpers from tdc.single_pred import ADME data = ADME(name='Caco2_Wang') data.label_distribution() from tdc.multi_pred import DTI data = DTI(name='DAVIS') data.binarize(threshold=30, order='descending') from tdc.multi_pred import DTI data = DTI(name='DAVIS') data.convert_to_log() from tdc.multi_pred import DDI from tdc.utils import get_label_map data = DDI(name='DrugBank') split = data.get_split() get_label_map(name='DrugBank', task='DDI') from tdc.multi_pred import GDA data = GDA(name='DisGeNET') data.print_stats() from tdc.single_pred import HTS
def time_split(self):
    """Split BindingDB_Patent chronologically using its Year column."""
    from tdc.multi_pred import DTI
    patent_data = DTI(name='BindingDB_Patent')
    split = patent_data.get_split(method='time', time_column='Year')