Ejemplo n.º 1
0
    def test_PubChemDataSet_creation(self):
        """Check the Activity labels PubChemDataSet(1) reports for known CIDs."""
        compounds = PubChemDataSet(1).get_compounds()

        # some cids from PubChem AID 1, paired with the label they must carry
        expected_labels = (
            ([11969872, 390525, 394646], 1),  # active compounds
            ([1018, 4775, 219294], 0),  # inactive compounds
        )
        for cids, label in expected_labels:
            assert all(compounds.loc[cids, 'Activity'] == label)
Ejemplo n.º 2
0
    def test_PubChemDataSet_clean_load(self):
        """Check the frame returned by clean_load() for AID 1.

        The frame must hold as many rows with Activity 1 as rows with
        Activity 0, and no entry of its ``rdkit`` column may be falsy or
        yield ``None`` from ``Chem.MolToSmiles``.
        """
        cleaned = PubChemDataSet(1).clean_load()

        labels = cleaned.Activity
        n_active = len(labels[labels == 1])
        n_inactive = len(labels[labels == 0])
        assert n_active == n_inactive

        smiles = []
        for mol in cleaned.rdkit:
            smiles.append(Chem.MolToSmiles(mol) if mol else None)
        assert None not in smiles
Ejemplo n.º 3
0
def build_models(aid, sub_directory):
    """Grid-search every configured classifier for one PubChem assay.

    The cleaned data set of the assay is written to ``training.csv`` inside
    ``<sub_directory>/<aid>``. Each ``(name, clf)`` entry of
    ``SKLearnModels.CLASSIFIERS`` is then tuned with a 5-fold
    ``GridSearchCV`` on the features returned by ``load_ECFP6()``; the best
    estimator is dumped to ``<name>.pkl`` and the best scores of all
    classifiers are written to ``results.csv``.

    Args:
        aid: PubChem assay identifier; also the name of the output folder.
        sub_directory: Existing directory that receives the per-assay folder.

    Returns:
        None. Returns early when the per-assay folder already exists or when
        loading the assay data fails.
    """
    best_scores = {}
    aid_sub_directory = os.path.join(sub_directory, str(aid))

    # if its already been modeled move on
    if os.path.exists(aid_sub_directory):
        return
    os.mkdir(aid_sub_directory)

    try:
        ds = PubChemDataSet(aid).clean_load()
        ds.to_csv(os.path.join(aid_sub_directory, 'training.csv'))
        y = ds.Activity
        X = PubChemDataSetDescriptors(ds).load_ECFP6()
        print("=======building model for aid {0}=======".format(aid))
        print("======={0} compounds: {1} active, {2} inactive=======".format(
            y.shape[0], (y == 1).sum(), (y == 0).sum()))
    except Exception:
        # Best effort: a failing assay is reported and skipped. Only
        # ``Exception`` is caught so that Ctrl-C and SystemExit still stop
        # the run instead of being reported as a data error.
        # NOTE(review): the folder created above is left behind, so this
        # assay is treated as already modeled on the next run - confirm
        # that this is intended.
        print("error on aid {0}".format(aid))
        return

    for name, clf in SKLearnModels.CLASSIFIERS:
        pipe = Pipeline([(name, clf)])
        print("=======5-fold CV on {0}=======".format(name))
        parameters = SKLearnModels.PARAMETERS[name]
        cv_search = GridSearchCV(pipe,
                                 parameters,
                                 cv=5,
                                 scoring='accuracy',
                                 n_jobs=1,
                                 verbose=0)
        cv_search.fit(X.values, y.values)
        print("================================")
        print("The best parameters for {0} are :\n{1}".format(
            name, cv_search.best_params_))
        print("The best score is {0}".format(cv_search.best_score_))
        best_scores[name] = cv_search.best_score_
        joblib.dump(cv_search.best_estimator_,
                    os.path.join(aid_sub_directory, '{}.pkl'.format(name)))

    # One "<classifier>,<best score>" line per tuned classifier.
    results_path = os.path.join(aid_sub_directory, 'results.csv')
    with open(results_path, 'w') as results_file:
        for model, score in best_scores.items():
            results_file.write(model + ',' + str(score) + '\n')
Ejemplo n.º 4
0
    def test_PubChemDataSet_load(self):
        """Check that load() for AID 1 contains the expected SMILES strings."""
        # some smiles from PubChem AID 1
        expected = (
            'COC1=CC(=C2C(=C1)OC(=CC2=O)C3(C=CC(=O)C=C3)O)O',
            'C1=CC=C(C=C1)CCCC(=O)O',
        )

        loaded = PubChemDataSet(1).load()
        for smi in expected:
            found = smi in loaded.SMILES.tolist()
            print(found)
            assert found
Ejemplo n.º 5
0
    def setup(self):
        """Prepare classifier names, a loaded data set and a descriptor table.

        Stores the first element of every ``SKLearnModels.CLASSIFIERS`` entry
        in ``self.clfs``, the loaded data set of AID 1224861 in ``self.ds``
        and, in ``self.X``, one row of descriptor values per SMILES string of
        that data set, indexed like ``self.ds``.
        """
        self.clfs = [entry[0] for entry in SKLearnModels.CLASSIFIERS]
        self.ds = PubChemDataSet(1224861).load()

        molecules = [Chem.MolFromSmiles(smi) for smi in self.ds.SMILES]
        descriptor_names = [desc[0] for desc in Descriptors.descList]
        calculator = MoleculeDescriptors.MolecularDescriptorCalculator(
            descriptor_names)

        rows = []
        for mol in molecules:
            rows.append(list(calculator.CalcDescriptors(mol)))

        self.X = pd.DataFrame(rows,
                              columns=list(calculator.GetDescriptorNames()),
                              index=self.ds.index)
Ejemplo n.º 6
0
    def setup(self):
        """Load AID 1 with its descriptors and drop the unusable rows.

        Rows of the ``load_rdkit()`` table that contain a null or an infinite
        value are removed; the Activity labels are filtered with the same
        masks. The results are stored in ``self.X`` and ``self.y``.
        """
        self.ds = PubChemDataSet(1).clean_load()
        self.y = self.ds.Activity
        self.X = PubChemDataSetDescriptors(self.ds).load_rdkit()

        # Keep only the rows in which every descriptor is non-null.
        complete_rows = self.X.notnull().all(axis=1)
        self.y = self.y[complete_rows]
        self.X = self.X[complete_rows]

        # Then discard the rows holding an infinite descriptor value.
        finite_rows = ~np.isinf(self.X.values).any(axis=1)
        self.y = self.y[finite_rows]
        self.X = self.X[finite_rows]
Ejemplo n.º 7
0
def build_model(aid, model):
    """Tune one named classifier on a PubChem assay with 5-fold grid search.

    The cleaned data set of the assay is loaded, rows whose ``load_rdkit()``
    descriptors contain a null or infinite value are dropped, and a pipeline
    made of ``SKLearnModels.PREPROCESS`` plus the classifier called ``model``
    is tuned with ``GridSearchCV`` (accuracy scoring, all cores). The best
    parameters and the best accuracy are printed.

    Args:
        aid: PubChem assay identifier passed to ``PubChemDataSet``.
        model: Classifier name; looked up in ``SKLearnModels.CLASSIFIERS``
            and used as key into ``SKLearnModels.PARAMETERS``.

    Returns:
        ``best_estimator_`` of the fitted grid search, so the caller can use
        or persist the tuned pipeline.

    Raises:
        Exception: If loading or cleaning the assay data fails; the
            underlying error is attached as the cause.
    """
    try:
        ds = PubChemDataSet(aid).clean_load()
        y = ds.Activity
        X = PubChemDataSetDescriptors(ds).load_rdkit()

        # TODO: put this into a cleaner step
        # remove rows with null values
        complete_rows = X.notnull().all(axis=1)
        y = y[complete_rows]
        X = X[complete_rows]

        # TODO: put this into a cleaner step
        # remove rows with infinite values
        finite_rows = ~np.isinf(X.values).any(axis=1)
        y = y[finite_rows]
        X = X[finite_rows]
        print("=======building model for aid {0}=======".format(aid))
        print("======={0} compounds: {1} active, {2} inactive=======".format(
            y.shape[0], (y == 1).sum(), (y == 0).sum()))
    except Exception as err:
        # Only ``Exception`` is caught so Ctrl-C and SystemExit are not
        # rewritten into a data error; chaining keeps the root cause visible.
        raise Exception("error on aid {0}".format(aid)) from err

    pipe = Pipeline(
        list(SKLearnModels.PREPROCESS) +
        [(name, clf)
         for name, clf in SKLearnModels.CLASSIFIERS if name == model])
    print("=======5-fold CV on {0}=======".format(model))
    parameters = SKLearnModels.PARAMETERS[model]

    cv_search = GridSearchCV(pipe,
                             parameters,
                             cv=5,
                             scoring='accuracy',
                             n_jobs=-1,
                             verbose=0)
    cv_search.fit(X.values, y.values)
    print("================================")
    print("The best parameters for {0} are :".format(model))
    for param, val in cv_search.best_params_.items():
        # Strip only the leading "<step>__" prefix so nested parameter names
        # are printed in full and a name without a prefix does not crash.
        print("{}: {}".format(param.split('__', 1)[-1], val))
    print("The best accuracy score is {0:.2f}%".format(cv_search.best_score_ *
                                                       100))
    # The original ended on a bare ``cv_search.best_estimator_`` expression,
    # which discarded the tuned model; hand it back to the caller instead.
    return cv_search.best_estimator_
Ejemplo n.º 8
0
def main():
    """Model the cluster-0 PubChem assays and predict the curated REACH set.

    For every assay the cleaned data set is loaded, each classifier in
    ``SKLearnModels.CLASSIFIERS`` is tuned with a 5-fold grid search, and the
    search with the highest cross-validated accuracy predicts the compounds
    read from ``reach_curated.csv``. The predictions (one column per assay)
    go to ``reach_predictions_cluster_0.csv`` and the selected estimators to
    ``models.csv``, all inside the directory named by the ``QSAR_DATA``
    environment variable. Assays whose data cannot be loaded are reported
    and skipped.

    Raises:
        RuntimeError: If ``QSAR_DATA`` is not set.
    """
    import os

    # Fail fast with a clear message instead of a TypeError from None + str.
    data_dir = os.getenv('QSAR_DATA')
    if data_dir is None:
        raise RuntimeError("QSAR_DATA environment variable is not set")

    # assays from cluster 0
    aids = [
        119, 103, 99, 133, 71, 145, 5, 33, 113, 43, 139,
        115, 55, 31, 67, 81, 143, 87, 109, 129, 39, 49,
        137, 65, 79, 93, 57, 15, 107, 59, 37, 101, 123,
        41, 45, 7, 83, 91, 53, 13, 21, 95, 105, 9, 131,
        125, 97, 29, 121, 3, 25, 141, 23, 77, 19, 1,
        47, 73, 35, 89, 85,
    ]

    ds_test = pd.read_csv(os.path.join(data_dir, 'reach_curated.csv'))
    ds_test.index = ds_test.ECNumber

    # The test descriptors do not depend on the assay, so they are computed
    # and cleaned once instead of once per assay.
    X_test = PubChemDataSetDescriptors(ds_test).load_rdkit()
    # remove null and inf values
    X_test = X_test[X_test.notnull().all(axis=1)]
    X_test = X_test[~np.isinf(X_test.values).any(axis=1)]

    best_models = {}
    predictions = []
    for aid in aids:
        try:
            ds = PubChemDataSet(aid).clean_load()
            y = ds.Activity
            X = PubChemDataSetDescriptors(ds).load_rdkit()

            # TODO: put this into a cleaner step
            # remove rows with null values
            complete_rows = X.notnull().all(axis=1)
            y = y[complete_rows]
            X = X[complete_rows]

            # TODO: put this into a cleaner step
            # remove rows with infinite values
            finite_rows = ~np.isinf(X.values).any(axis=1)
            y = y[finite_rows]
            X = X[finite_rows]
            print("=======building model for aid {0}=======".format(aid))
            print(
                "======={0} compounds: {1} active, {2} inactive=======".format(
                    y.shape[0], (y == 1).sum(), (y == 0).sum()))
        except Exception:
            # Best effort per assay; Ctrl-C and SystemExit still propagate.
            print("error on aid {0}".format(aid))
            continue

        best_search = None
        for name, clf in SKLearnModels.CLASSIFIERS:
            pipe = Pipeline(list(SKLearnModels.PREPROCESS) + [(name, clf)])
            print("=======5-fold CV on {0}=======".format(name))
            parameters = SKLearnModels.PARAMETERS[name]

            cv_search = GridSearchCV(pipe,
                                     parameters,
                                     cv=5,
                                     scoring='accuracy',
                                     n_jobs=-1,
                                     verbose=0)
            cv_search.fit(X.values, y.values)
            print("================================")
            print("The best parameters for {0} are :\n{1}".format(
                name, cv_search.best_params_))
            print("The best score is {0}".format(cv_search.best_score_))

            # Keep the search with the highest CV accuracy. Previously the
            # last classifier in the list overwrote all earlier ones, so
            # "best_models" and the predictions ignored the scores.
            if (best_search is None
                    or cv_search.best_score_ > best_search.best_score_):
                best_search = cv_search

        if best_search is None:
            # No classifier is configured, so there is nothing to predict with.
            continue
        best_models[aid] = best_search.best_estimator_

        print("Making predictions on {0} compounds".format(X_test.shape[0]))
        # Predict on the raw array, matching how the pipeline was fitted.
        preds = pd.DataFrame(best_search.predict(X_test.values),
                             index=X_test.index,
                             columns=[aid])
        predictions.append(preds)

    if not predictions:
        # pd.concat raises on an empty list; report the situation instead.
        print("no assay could be modeled; nothing to save")
        return

    all_predictions = pd.concat(predictions, axis=1)
    print(all_predictions)
    filename = os.path.join(data_dir, 'reach_predictions_cluster_0.csv')
    all_predictions.to_csv(filename)

    # NOTE(review): this file is a pickle despite its '.csv' name; the name
    # is kept so existing readers still find it.
    m = pd.DataFrame(best_models, index=aids)
    m.to_pickle(os.path.join(data_dir, 'models.csv'))
Ejemplo n.º 9
0
def main():
    """Predict the missing entries of profile 3 for a fixed set of assays.

    For every assay the cleaned PubChem data set is loaded, each classifier
    in ``SKLearnModels.CLASSIFIERS`` is tuned with a 5-fold grid search, and
    the search with the highest cross-validated accuracy predicts the
    compounds returned by ``get_subprofile([aid]).get_nulls()`` (presumably
    the compounds without a result for that assay - confirm against the
    profile API). The predictions, one column per assay, are written to
    ``missing_data_predictions.csv`` inside the directory named by the
    ``QSAR_DATA`` environment variable. Assays whose data cannot be loaded
    are reported and skipped.

    Raises:
        RuntimeError: If ``QSAR_DATA`` is not set.
    """
    import os

    # Check the output location first so a missing variable does not surface
    # only after all models have been trained.
    data_dir = os.getenv('QSAR_DATA')
    if data_dir is None:
        raise RuntimeError("QSAR_DATA environment variable is not set")

    aids = [119, 79, 83, 7, 37, 99, 129, 59, 41]
    profile = DS.profile_3.load()
    predictions = []
    for aid in aids:
        try:
            ds = PubChemDataSet(aid).clean_load()
            y = ds.Activity
            X = PubChemDataSetDescriptors(ds).load_rdkit()

            # TODO: put this into a cleaner step
            # remove rows with null values
            complete_rows = X.notnull().all(axis=1)
            y = y[complete_rows]
            X = X[complete_rows]

            # TODO: put this into a cleaner step
            # remove rows with infinite values
            finite_rows = ~np.isinf(X.values).any(axis=1)
            y = y[finite_rows]
            X = X[finite_rows]
            print("=======building model for aid {0}=======".format(aid))
            print(
                "======={0} compounds: {1} active, {2} inactive=======".format(
                    y.shape[0], (y == 1).sum(), (y == 0).sum()))
        except Exception:
            # Best effort per assay; Ctrl-C and SystemExit still propagate.
            print("error on aid {0}".format(aid))
            continue

        best_search = None
        for name, clf in SKLearnModels.CLASSIFIERS:
            pipe = Pipeline(list(SKLearnModels.PREPROCESS) + [(name, clf)])
            print("=======5-fold CV on {0}=======".format(name))
            parameters = SKLearnModels.PARAMETERS[name]

            cv_search = GridSearchCV(pipe,
                                     parameters,
                                     cv=5,
                                     scoring='accuracy',
                                     n_jobs=-1,
                                     verbose=0)
            cv_search.fit(X.values, y.values)
            print("================================")
            print("The best parameters for {0} are :\n{1}".format(
                name, cv_search.best_params_))
            print("The best score is {0}".format(cv_search.best_score_))

            # Keep the search with the highest CV accuracy. Previously the
            # last classifier in the list was used for the predictions
            # regardless of its score.
            if (best_search is None
                    or cv_search.best_score_ > best_search.best_score_):
                best_search = cv_search

        if best_search is None:
            # No classifier is configured, so there is nothing to predict with.
            continue

        ds_test = profile.get_subprofile([aid]).get_nulls().as_ds()
        X_test = PubChemDataSetDescriptors(ds_test).load_rdkit()

        # remove null and inf values
        X_test = X_test[X_test.notnull().all(axis=1)]
        X_test = X_test[~np.isinf(X_test.values).any(axis=1)]
        print("Making predictions on {0} compounds".format(X_test.shape[0]))
        # Predict on the raw array, matching how the pipeline was fitted.
        preds = pd.DataFrame(best_search.predict(X_test.values),
                             index=X_test.index,
                             columns=[aid])
        predictions.append(preds)

    if not predictions:
        # pd.concat raises on an empty list; report the situation instead.
        print("no assay could be modeled; nothing to save")
        return

    all_predictions = pd.concat(predictions, axis=1)
    print(all_predictions)
    filename = os.path.join(data_dir, 'missing_data_predictions.csv')
    all_predictions.to_csv(filename)
Ejemplo n.º 10
0
 def setup(self):
     """ Store the clean-loaded PubChem AID 1 data set in ``self.ds``.

         NOTE(review): the previous docstring described AID 1224861 with
         25 actives and 109 inactives, but the code loads AID 1 - confirm
         which assay is intended.
     """
     dataset = PubChemDataSet(1)
     self.ds = dataset.clean_load()
Ejemplo n.º 11
0
 def test_StructureChecker_cleaner(self):
     """ After a StructureCleaner-only pipeline, no ``rdkit`` value is None """
     loaded = PubChemDataSet(1).load()
     cleaner = PubChemDataSetCleaner(steps=[StructureCleaner()])
     cleaned = cleaner.run(loaded)
     assert None not in cleaned.rdkit.values