def test_PubChemDataSet_creation(self):
    """ compounds fetched for AID 1 carry the expected Activity flag """
    # some cids from PubChem AID 1, paired with the Activity value each
    # group is expected to have (1 for the actives, 0 for the inactives)
    expectations = (
        (1, [11969872, 390525, 394646]),
        (0, [1018, 4775, 219294]),
    )

    compounds = PubChemDataSet(1).get_compounds()

    for flag, cids in expectations:
        observed = compounds.loc[cids, 'Activity']
        assert all(observed == flag)
def test_PubChemDataSet_clean_load(self):
    """ a clean-loaded AID 1 has balanced classes and convertible molecules """
    cleaned = PubChemDataSet(1).clean_load()

    # as many rows flagged 1 as rows flagged 0
    n_flag_one = len(cleaned.Activity[cleaned.Activity == 1])
    n_flag_zero = len(cleaned.Activity[cleaned.Activity == 0])
    assert n_flag_one == n_flag_zero

    # every entry of the rdkit column must be truthy and yield a SMILES
    smiles = []
    for mol in cleaned.rdkit:
        smiles.append(Chem.MolToSmiles(mol) if mol else None)
    assert None not in smiles
def build_models(aid, sub_directory):
    """ grid-search every classifier in SKLearnModels.CLASSIFIERS for one assay

    A folder named after ``aid`` is created inside ``sub_directory``; it
    receives ``training.csv`` (the cleaned dataset), one ``<name>.pkl`` per
    classifier (the best estimator of its grid search) and ``results.csv``
    (one ``name,score`` line per classifier).

    If the folder already exists the assay is treated as done and nothing
    happens.  If loading the data or the descriptors fails, the error is
    printed and the function returns without fitting anything.

    :param aid: assay identifier handed to PubChemDataSet
    :param sub_directory: parent directory for the per-assay folder
    :return: None
    """
    aid_sub_directory = os.path.join(sub_directory, str(aid))

    # if its already been modeled move on
    if os.path.exists(aid_sub_directory):
        return
    # makedirs (not mkdir) so a missing parent directory is created as well
    os.makedirs(aid_sub_directory)

    try:
        ds = PubChemDataSet(aid).clean_load()
        ds.to_csv(os.path.join(aid_sub_directory, 'training.csv'))
        y = ds.Activity
        X = PubChemDataSetDescriptors(ds).load_ECFP6()
    except Exception as err:
        # Exception (not a bare except) so Ctrl-C / SystemExit still stop the
        # run, and the cause is reported instead of being silently dropped.
        # NOTE(review): the folder created above is left in place, so a
        # failed assay is skipped on the next run -- confirm this is intended.
        print("error on aid {0}: {1!r}".format(aid, err))
        return

    print("=======building model for aid {0}=======".format(aid))
    print("======={0} compounds: {1} active, {2} inactive=======".format(
        y.shape[0], (y == 1).sum(), (y == 0).sum()))

    best_scores = {}
    for name, clf in SKLearnModels.CLASSIFIERS:
        pipe = Pipeline([(name, clf)])

        print("=======5-fold CV on {0}=======".format(name))
        parameters = SKLearnModels.PARAMETERS[name]
        cv_search = GridSearchCV(pipe,
                                 parameters,
                                 cv=5,
                                 scoring='accuracy',
                                 n_jobs=1,
                                 verbose=0)
        cv_search.fit(X.values, y.values)

        print("================================")
        print("The best parameters for {0} are :\n{1}".format(
            name, cv_search.best_params_))
        print("The best score is {0}".format(cv_search.best_score_))

        best_scores[name] = cv_search.best_score_
        joblib.dump(cv_search.best_estimator_,
                    os.path.join(aid_sub_directory, '{}.pkl'.format(name)))

    results_path = os.path.join(aid_sub_directory, 'results.csv')
    with open(results_path, 'w') as results_file:
        results_file.writelines(
            '{},{}\n'.format(model, score)
            for model, score in best_scores.items())
def test_PubChemDataSet_load(self):
    """ the raw load of AID 1 contains a couple of known SMILES strings """
    loaded = PubChemDataSet(1).load()

    # some smiles from PubChem AID 1
    expected_smiles = (
        'COC1=CC(=C2C(=C1)OC(=CC2=O)C3(C=CC(=O)C=C3)O)O',
        'C1=CC=C(C=C1)CCCC(=O)O',
    )

    for smi in expected_smiles:
        present = smi in loaded.SMILES.tolist()
        # echo the membership result before asserting on it
        print(present)
        assert present
def setup(self):
    """ prepares classifier names, a raw dataset and its RDKit descriptors """
    # first element of every CLASSIFIERS entry
    self.clfs = [entry[0] for entry in SKLearnModels.CLASSIFIERS]
    self.ds = PubChemDataSet(1224861).load()

    mols = list(map(Chem.MolFromSmiles, self.ds.SMILES))

    # calculator over every descriptor name listed in Descriptors.descList
    descriptor_names = [desc[0] for desc in Descriptors.descList]
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)

    rows = []
    for mol in mols:
        rows.append(list(calc.CalcDescriptors(mol)))

    # one row per molecule, aligned with the dataset's index
    self.X = pd.DataFrame(rows,
                          columns=list(calc.GetDescriptorNames()),
                          index=self.ds.index)
def setup(self):
    """ loads cleaned AID 1 and keeps rows with non-null, finite descriptors """
    self.ds = PubChemDataSet(1).clean_load()
    labels = self.ds.Activity
    features = PubChemDataSetDescriptors(self.ds).load_rdkit()

    # first pass: drop rows that have a null in any descriptor column
    complete = features.notnull().all(axis=1)
    labels = labels[complete]
    features = features[complete]

    # second pass: drop rows that have an infinite value in any column
    finite = ~np.isinf(features.values).any(axis=1)
    self.y = labels[finite]
    self.X = features[finite]
def build_model(aid, model):
    """ grid-search one named classifier on the RDKit descriptors of an assay

    The cleaned dataset for ``aid`` is loaded, rows with null or infinite
    descriptor values are dropped, and a pipeline made of
    SKLearnModels.PREPROCESS followed by the classifier whose name equals
    ``model`` is tuned with a 5-fold, accuracy-scored GridSearchCV.

    :param aid: assay identifier handed to PubChemDataSet
    :param model: classifier name; selects the entry of
        SKLearnModels.CLASSIFIERS and the grid in SKLearnModels.PARAMETERS
    :return: the best estimator found by the grid search (previously the
        value was computed and then discarded)
    :raises Exception: "error on aid <aid>" when loading or filtering the
        data fails; the original error is attached as the cause
    """
    try:
        ds = PubChemDataSet(aid).clean_load()
        y = ds.Activity
        X = PubChemDataSetDescriptors(ds).load_rdkit()

        # TODO: put this into a cleaner step
        # remove null values
        complete = X.notnull().all(axis=1)
        y = y[complete]
        X = X[complete]

        # TODO: put this into a cleaner step
        # remove infinite values
        finite = ~np.isinf(X.values).any(axis=1)
        y = y[finite]
        X = X[finite]
    except Exception as err:
        # Exception rather than a bare except: Ctrl-C / SystemExit are no
        # longer turned into a generic "error on aid" failure
        raise Exception("error on aid {0}".format(aid)) from err

    print("=======building model for aid {0}=======".format(aid))
    print("======={0} compounds: {1} active, {2} inactive=======".format(
        y.shape[0], (y == 1).sum(), (y == 0).sum()))

    classifier_steps = [(name, clf)
                        for name, clf in SKLearnModels.CLASSIFIERS
                        if name == model]
    pipe = Pipeline(list(SKLearnModels.PREPROCESS) + classifier_steps)

    print("=======5-fold CV on {0}=======".format(model))
    parameters = SKLearnModels.PARAMETERS[model]
    cv_search = GridSearchCV(pipe,
                             parameters,
                             cv=5,
                             scoring='accuracy',
                             n_jobs=-1,
                             verbose=0)
    cv_search.fit(X.values, y.values)

    print("================================")
    print("The best parameters for {0} are :".format(model))
    for param, val in cv_search.best_params_.items():
        # strip the leading "<step>__" prefix; a name without "__" is
        # printed unchanged instead of raising IndexError
        print("{}: {}".format(param.split('__', 1)[-1], val))
    print("The best accuracy score is {0:.2f}%".format(
        cv_search.best_score_ * 100))

    # hand the tuned pipeline back so the caller can pickle or reuse it
    return cv_search.best_estimator_
def main():
    """ model the cluster-0 assays and predict the curated REACH compounds

    For every assay id the cleaned dataset is loaded, RDKit descriptors are
    computed and filtered (null / infinite rows dropped), and every
    classifier in SKLearnModels.CLASSIFIERS is tuned with a 5-fold,
    accuracy-scored GridSearchCV.  Each tuned search predicts the compounds
    of ``reach_curated.csv``; the predictions are written to
    ``reach_predictions_cluster_0.csv`` and the kept estimators are pickled
    to ``models.csv``.  All files live in the directory named by the
    QSAR_DATA environment variable.

    NOTE(review): the loop nesting was reconstructed from a flattened
    source; the prediction step is assumed to run once per classifier.

    :raises EnvironmentError: when QSAR_DATA is not set
    """
    import os

    # assays from cluster 0
    aids = [
        119, 103, 99, 133, 71, 145, 5, 33, 113, 43, 139, 115, 55, 31, 67, 81,
        143, 87, 109, 129, 39, 49, 137, 65, 79, 93, 57, 15, 107, 59, 37, 101,
        123, 41, 45, 7, 83, 91, 53, 13, 21, 95, 105, 9, 131, 125, 97, 29, 121,
        3, 25, 141, 23, 77, 19, 1, 47, 73, 35, 89, 85,
    ]

    # fail early and clearly instead of a TypeError from ``None + str``
    data_dir = os.getenv('QSAR_DATA')
    if data_dir is None:
        raise EnvironmentError("QSAR_DATA environment variable is not set")

    ds_test = pd.read_csv(os.path.join(data_dir, 'reach_curated.csv'))
    ds_test.index = ds_test.ECNumber

    # the test set never changes, so its descriptors are computed once here
    # rather than once per fitted model
    X_test = PubChemDataSetDescriptors(ds_test).load_rdkit()
    # remove null and inf values
    X_test = X_test[X_test.notnull().all(axis=1)]
    X_test = X_test[~np.isinf(X_test.values).any(axis=1)]

    best_models = {}
    predictions = []

    for aid in aids:
        try:
            ds = PubChemDataSet(aid).clean_load()
            y = ds.Activity
            X = PubChemDataSetDescriptors(ds).load_rdkit()

            # TODO: put this into a cleaner step
            # remove null values
            complete = X.notnull().all(axis=1)
            y = y[complete]
            X = X[complete]

            # TODO: put this into a cleaner step
            # remove infinite values
            finite = ~np.isinf(X.values).any(axis=1)
            y = y[finite]
            X = X[finite]
        except Exception as err:
            # Exception (not a bare except) keeps Ctrl-C working and lets
            # the reason for the skip show up in the output
            print("error on aid {0}: {1!r}".format(aid, err))
            continue

        print("=======building model for aid {0}=======".format(aid))
        print("======={0} compounds: {1} active, {2} inactive=======".format(
            y.shape[0], (y == 1).sum(), (y == 0).sum()))

        for name, clf in SKLearnModels.CLASSIFIERS:
            pipe = Pipeline(list(SKLearnModels.PREPROCESS) + [(name, clf)])

            print("=======5-fold CV on {0}=======".format(name))
            parameters = SKLearnModels.PARAMETERS[name]
            cv_search = GridSearchCV(pipe,
                                     parameters,
                                     cv=5,
                                     scoring='accuracy',
                                     n_jobs=-1,
                                     verbose=0)
            cv_search.fit(X.values, y.values)

            print("================================")
            print("The best parameters for {0} are :\n{1}".format(
                name, cv_search.best_params_))
            print("The best score is {0}".format(cv_search.best_score_))

            # keyed by aid only: a later classifier replaces an earlier one
            best_models[aid] = cv_search.best_estimator_

            print("Making predictions on {0} compounds".format(
                X_test.shape[0]))
            # the column is labelled with the aid, so several classifiers
            # yield repeated column labels in the combined table
            preds = pd.DataFrame(cv_search.predict(X_test),
                                 index=X_test.index,
                                 columns=[aid])
            predictions.append(preds)

    if not predictions:
        # pd.concat raises on an empty list; report instead of crashing
        print("no models were built; nothing to save")
        return

    combined = pd.concat(predictions, axis=1)
    print(combined)
    combined.to_csv(os.path.join(data_dir, 'reach_predictions_cluster_0.csv'))

    # NOTE(review): written with to_pickle although the file is named .csv
    m = pd.DataFrame(best_models, index=aids)
    m.to_pickle(os.path.join(data_dir, 'models.csv'))
def main():
    """ model selected assays and predict their missing profile entries

    For every assay id the cleaned dataset is loaded, RDKit descriptors are
    computed and filtered (null / infinite rows dropped), and every
    classifier in SKLearnModels.CLASSIFIERS is tuned with a 5-fold,
    accuracy-scored GridSearchCV.  Each tuned search then predicts the
    compounds returned by ``profile.get_subprofile([aid]).get_nulls()``.
    The predictions are written to ``missing_data_predictions.csv`` in the
    directory named by the QSAR_DATA environment variable.

    NOTE(review): the loop nesting was reconstructed from a flattened
    source; the prediction step is assumed to run once per classifier.

    :raises EnvironmentError: when QSAR_DATA is not set
    """
    import os

    # checked before any training so a missing variable does not waste a
    # whole modelling run
    data_dir = os.getenv('QSAR_DATA')
    if data_dir is None:
        raise EnvironmentError("QSAR_DATA environment variable is not set")

    aids = [119, 79, 83, 7, 37, 99, 129, 59, 41]
    profile = DS.profile_3.load()

    best_models = {}
    predictions = []

    for aid in aids:
        try:
            ds = PubChemDataSet(aid).clean_load()
            y = ds.Activity
            X = PubChemDataSetDescriptors(ds).load_rdkit()

            # TODO: put this into a cleaner step
            # remove null values
            complete = X.notnull().all(axis=1)
            y = y[complete]
            X = X[complete]

            # TODO: put this into a cleaner step
            # remove infinite values
            finite = ~np.isinf(X.values).any(axis=1)
            y = y[finite]
            X = X[finite]
        except Exception as err:
            # Exception (not a bare except) keeps Ctrl-C working and lets
            # the reason for the skip show up in the output
            print("error on aid {0}: {1!r}".format(aid, err))
            continue

        print("=======building model for aid {0}=======".format(aid))
        print("======={0} compounds: {1} active, {2} inactive=======".format(
            y.shape[0], (y == 1).sum(), (y == 0).sum()))

        # the test set depends on the assay only, so build it once per aid
        # instead of once per classifier
        ds_test = profile.get_subprofile([aid]).get_nulls().as_ds()
        X_test = PubChemDataSetDescriptors(ds_test).load_rdkit()
        # remove null and inf values
        X_test = X_test[X_test.notnull().all(axis=1)]
        X_test = X_test[~np.isinf(X_test.values).any(axis=1)]

        for name, clf in SKLearnModels.CLASSIFIERS:
            pipe = Pipeline(list(SKLearnModels.PREPROCESS) + [(name, clf)])

            print("=======5-fold CV on {0}=======".format(name))
            parameters = SKLearnModels.PARAMETERS[name]
            cv_search = GridSearchCV(pipe,
                                     parameters,
                                     cv=5,
                                     scoring='accuracy',
                                     n_jobs=-1,
                                     verbose=0)
            cv_search.fit(X.values, y.values)

            print("================================")
            print("The best parameters for {0} are :\n{1}".format(
                name, cv_search.best_params_))
            print("The best score is {0}".format(cv_search.best_score_))

            # keyed by aid only: a later classifier replaces an earlier one
            best_models[aid] = cv_search.best_estimator_

            print("Making predictions on {0} compounds".format(
                X_test.shape[0]))
            # the column is labelled with the aid, so several classifiers
            # yield repeated column labels in the combined table
            preds = pd.DataFrame(cv_search.predict(X_test),
                                 index=X_test.index,
                                 columns=[aid])
            predictions.append(preds)

    if not predictions:
        # pd.concat raises on an empty list; report instead of crashing
        print("no models were built; nothing to save")
        return

    combined = pd.concat(predictions, axis=1)
    print(combined)
    combined.to_csv(os.path.join(data_dir, 'missing_data_predictions.csv'))
def setup(self):
    """ stores the clean-loaded dataset of PubChem AID 1 on ``self.ds`` """
    # NOTE(review): the earlier docstring described AID 1224861 (25 actives,
    # 109 inactives) while the code loads AID 1 -- confirm which is intended
    cleaned = PubChemDataSet(1).clean_load()
    self.ds = cleaned
def test_StructureChecker_cleaner(self):
    """ running StructureCleaner leaves no None in the rdkit column """
    raw = PubChemDataSet(1).load()

    # a cleaning pipeline consisting of the structure cleaner alone
    cleaner = PubChemDataSetCleaner(steps=[StructureCleaner()])
    cleaned = cleaner.run(raw)

    assert None not in cleaned.rdkit.values