Esempio n. 1
0
def doMolIsoF(molNdx):
    """Cross-validate, evaluate and persist an IsolationForest for one molecule.

    For every entry of the module-level ``datasetPortion`` the function

    1. loads a held-out test set (``portion * 0.2``) and a cross-validation
       set (``portion * 0.8``, excluding the test paths) of "es5"
       descriptors through ``cu.loadDescriptors``;
    2. runs a 5-fold cross-validation for every ``n_estimators`` candidate
       in the module-level ``params``;
    3. retrains with the best candidate on the whole cross-validation set,
       scores the test set and plots the ROC curves.

    After all portions, the collected results are written to
    ``results/results_isoF_<molName>.txt`` and a model fitted on the test
    and cross-validation records of the last completed portion is pickled
    to ``results/<molName>_IsoForest.pkl``.

    Args:
        molNdx: Index into the module-level ``molfiles``.  Element ``[0]``
            of the entry is handed to ``cu.loadDescriptors`` and element
            ``[1]`` is used as the molecule name.

    Returns:
        None.  The function returns immediately when the molecule name is
        already listed in the module-level ``done``.
    """
    import pickle
    import traceback

    xvalResults = []
    portionResults = []

    molName = molfiles[molNdx][1]  # [molfiles[molNdx].rfind("/", 0, -1)+1:-1]
    if molName in done:
        return

    descType = "es5"
    folds = 5

    # (test_ds, n_fold_ds, best_estimators, numcols) of the most recent
    # portion that ran to completion; used for the final model export.
    last_completed = None

    for portion in datasetPortion:
        # Reset per portion so a failure never re-reports rows that belong
        # to the previous portion.
        componentResults = []
        # Pre-bound so the failure row below can always be built, even when
        # loading fails before the parameter loop is reached.
        param = None

        try:
            if portion <= 1:
                print("Loading " + str(portion * 100) + "% of " +
                      molfiles[molNdx][1])
            else:
                print("Loading " + str(portion) + " actives from " +
                      molfiles[molNdx][1])

            # NOTE(review): this call passes "SEPERATE" while the next one
            # passes "SEPARATE" -- confirm which spelling cu.loadDescriptors
            # expects.
            (test_ds,
             test_paths) = cu.loadDescriptors(molfiles[molNdx][0],
                                              portion * 0.2,
                                              dtype=descType,
                                              active_decoy_ratio=-1,
                                              selection_policy="RANDOM",
                                              return_type="SEPERATE")
            numcols = test_ds[0][0].shape[1] - 2

            (n_fold_ds,
             n_fold_paths) = cu.loadDescriptors(molfiles[molNdx][0],
                                                portion * 0.8,
                                                dtype=descType,
                                                active_decoy_ratio=-1,
                                                selection_policy="RANDOM",
                                                return_type="SEPARATE",
                                                exclusion_list=test_paths)

            (folds_list, excl_list) = cu.split(n_fold_ds,
                                               folds,
                                               policy="RANDOM")

            for param in params:
                foldResults = []

                for fold in range(folds):
                    try:
                        val_ds = folds_list[fold]

                        # First row of every record only ("Take only LEC").
                        # NOTE(review): the first training fold contributes
                        # its frames directly while later folds are appended
                        # as nested lists -- presumably cu.joinDataframes
                        # flattens them; verify.
                        train_parts = None
                        for i in range(folds):
                            if i == fold:
                                continue
                            lecs = [
                                r[0].iloc[0:1, :] for r in folds_list[i]
                            ]
                            if train_parts is None:
                                train_parts = lecs
                            else:
                                train_parts.append(lecs)

                        train_ds = cu.joinDataframes(train_parts)
                        numcols = train_ds.shape[1] - 2

                        clf = IsolationForest(n_estimators=param, n_jobs=-1)

                        # The forest is fitted on the active records only.
                        train_a = train_ds[train_ds["active"] == True]
                        clf.fit(train_a.iloc[:, 0:numcols], None)

                        results = pd.DataFrame()
                        # A record is scored by its best-scoring row.
                        results["score"] = [
                            max(
                                clf.decision_function(
                                    x[0].iloc[:, 0:numcols]).ravel())
                            for x in val_ds
                        ]
                        results["truth"] = [x[2] for x in val_ds]

                        auc = eval.plotSimROC(np.array(results["truth"]),
                                              np.array([results["score"]]),
                                              "", None)
                        mean_ef = eval.getMeanEFs(
                            np.array(results["truth"]),
                            np.array([results["score"]]),
                            eval_method="sim")
                        foldResults.append((auc, mean_ef))
                    except Exception:
                        # Best effort: a failed fold counts as all-zero
                        # scores, but the reason is reported instead of
                        # being swallowed silently.
                        traceback.print_exc()
                        foldResults.append((0, {0.01: 0, 0.05: 0}))

                print("X-Validation results: ")
                print(foldResults)

                if len(foldResults) > 0:
                    fold_aucs = [x[0] for x in foldResults]
                    fold_ef_1pc = [x[1][0.01] for x in foldResults]
                    fold_ef_5pc = [x[1][0.05] for x in foldResults]

                    mean_auc_sim = np.mean(fold_aucs)
                    # Spread across folds; the std of the already-averaged
                    # scalar would always be 0.
                    std_auc_sim = np.std(fold_aucs)
                    mean_mean_ef_1pc = np.mean(fold_ef_1pc)
                    std_mean_ef_1pc = np.std(fold_ef_1pc)
                    mean_mean_ef_5pc = np.mean(fold_ef_5pc)
                    std_mean_ef_5pc = np.std(fold_ef_5pc)

                    print("mean AUC=" + str(mean_auc_sim) + ", std=" +
                          str(std_auc_sim) + ", mean EF(1%)=" +
                          str(mean_mean_ef_1pc) + ", std=" +
                          str(std_mean_ef_1pc) + ", mean EF(5%)=" +
                          str(mean_mean_ef_5pc) + ", std=" +
                          str(std_mean_ef_5pc))

                    componentResults.append(
                        (molName, portion, param, mean_auc_sim, std_auc_sim,
                         mean_mean_ef_1pc, std_mean_ef_1pc, mean_mean_ef_5pc,
                         std_mean_ef_5pc))
                else:
                    print(
                        "X-Validation returned no results. Skipping training..."
                    )
                    componentResults.append(
                        (molName, portion, param, 0, 0, 0, 0, 0, 0))

        except Exception:
            # Loading or cross-validation failed for this portion: record an
            # all-zero row and move on.  Continuing into the final training
            # would run on undefined or stale data.
            traceback.print_exc()
            componentResults.append(
                (molName, portion, param, 0, 0, 0, 0, 0, 0))
            xvalResults.extend(componentResults)
            continue

        xvalResults.extend(componentResults)

        if not componentResults:
            # Nothing to choose from (empty ``params``).
            print("No cross-validation results for portion " + str(portion))
            continue

        # Find best score.  Index 5 of a result row is the mean EF(1%).
        ef_1pc_scores = [x[5] for x in componentResults]

        best_estimators = params[np.argmax(ef_1pc_scores)]
        print("Best-score estimators no.: " + str(best_estimators))

        #train_ds = cu.lumpRecords(n_fold_ds)
        train_ds = cu.joinDataframes([r[0].iloc[0:1, :] for r in n_fold_ds])

        t0 = time.time()
        clf = IsolationForest(n_estimators=best_estimators, n_jobs=-1)

        train_a = train_ds[train_ds["active"] == True]

        clf.fit(train_a.iloc[:, 0:numcols], None)

        results = pd.DataFrame()

        results["score"] = [
            max(clf.decision_function((x[0].iloc[:, 0:numcols])))
            for x in test_ds
        ]
        results["truth"] = [x[2] for x in test_ds]  #np.array(test_ds)[:, 2]

        plot_title = molName + "[IsoForest, " + str(portion * 100) + "%]"

        auc = eval.plotSimROC(
            results["truth"], [results["score"]], plot_title, "results/" +
            molName + "_IsoForest_sim_" + str(portion * 100) + ".pdf")
        auc_rank = eval.plotRankROC(
            results["truth"], [results["score"]], plot_title, "results/" +
            molName + "_IsoForest_rank_" + str(portion * 100) + ".pdf")

        # NOTE(review): same similarity ROC as above, written a second time
        # without the "results/" prefix -- kept to preserve the existing
        # outputs, but it looks like a leftover.
        auc = eval.plotSimROC(
            results["truth"], [results["score"]], plot_title,
            molName + "_IsoForest_sim_" + str(portion * 100) + ".pdf")
        mean_ef = eval.getMeanEFs(np.array(results["truth"]),
                                  np.array([results["score"]]),
                                  eval_method="sim")

        print("AUC(Sim)=" + str(auc))
        print("EF: ", mean_ef)

        t1 = time.time()
        print("Time taken = " + str(t1 - t0))

        portionResults.append((molName, portion, best_estimators, auc,
                               auc_rank, mean_ef, t1 - t0))
        last_completed = (test_ds, n_fold_ds, best_estimators, numcols)

    print(xvalResults)
    print(portionResults)

    with open("results/results_isoF_" + molName + ".txt", 'w') as f1:
        print(xvalResults, file=f1)
        print(portionResults, file=f1)

    if last_completed is None:
        # No portion produced usable data, so there is nothing to export.
        print("No portion completed for " + molName + "; model not saved")
        return

    (test_ds, n_fold_ds, best_estimators, numcols) = last_completed

    full_train_dss = [x[0].iloc[0:1, :] for x in test_ds]
    full_train_dss.append([x[0].iloc[0:1, :] for x in n_fold_ds])
    full_train_ds = cu.joinDataframes(full_train_dss)
    clf = IsolationForest(n_estimators=best_estimators, n_jobs=-1)

    G_a = clf.fit(full_train_ds.iloc[:, 0:numcols],
                  full_train_ds.iloc[:, numcols])

    with open("results/" + molName + "_IsoForest.pkl", "wb") as mdlf:
        pickle.dump(G_a, mdlf)

    print("Saved model for " + molName + " to disk")
Esempio n. 2
0
# Script fragment: for every molecule from index 2 onwards and every dataset
# portion, load a test set and a cross-validation set of "esh" descriptors.
# The snippet ends right after loading; no model is trained in the visible
# part.
for molNdx in range(2, len(molfiles)):
    for portion in datasetPortion:
        #try:
        t0 = time.time()
        descTypes = ["usr", "esh", "es5"]
        descType = descTypes[1]
        # Values up to 1 are reported as a percentage, larger values as a
        # number of actives.
        if portion <= 1:
            print("Loading " + str(portion * 100) + "% of " +
                  molfiles[molNdx][1])
        else:
            print("Loading " + str(portion) + " actives from " +
                  molfiles[molNdx][1])

        # Test set, requested with portion * 0.2.
        # NOTE(review): return_type is spelled "SEPERATE" here but "SEPARATE"
        # in the second call below -- confirm which one cu.loadDescriptors
        # expects.
        (test_ds, test_paths) = cu.loadDescriptors(molfiles[molNdx][0],
                                                   portion * 0.2,
                                                   dtype=descType,
                                                   active_decoy_ratio=-1,
                                                   selection_policy="RANDOM",
                                                   return_type="SEPERATE")
        # All columns of the first record's first element except the last two.
        numcols = test_ds[0][0].shape[1] - 2

        folds = 3
        componentResults = []

        # Cross-validation set, requested with portion * 0.8; the paths
        # already used for the test set are passed as exclusion_list.
        (n_fold_ds,
         n_fold_paths) = cu.loadDescriptors(molfiles[molNdx][0],
                                            portion * 0.8,
                                            dtype=descType,
                                            active_decoy_ratio=-1,
                                            selection_policy="RANDOM",
                                            return_type="SEPARATE",
                                            exclusion_list=test_paths)
Esempio n. 3
0
                  molfiles[molNdx][1])
        else:
            print("Loading " + str(portion) + " actives from " +
                  molfiles[molNdx][1])

        auc_esh = 0
        auc_rank_esh = 0
        mean_ef_esh = 0

        try:
            print("Processing Electroshape 5-d - " + str(portion))
            t0 = time.time()
            (sim_es5_ds,
             sim_paths_es5) = cu.loadDescriptors(molfiles[molNdx][0],
                                                 portion,
                                                 dtype="es5",
                                                 active_decoy_ratio=-1,
                                                 selection_policy="SEQUENTIAL",
                                                 return_type="SEPARATE")
            simobj_es5 = scls.USRMoleculeSimParallel(sim_es5_ds, sim_paths_es5)
            usr_results_es5 = np.array(simobj_es5.runScreening(50)).transpose()

            #plotSimROC(sim_es5_ds, usr_results_es5, "es5_plot_"+molfiles[molNdx][1]+".pdf")
            # (auc_es5, mean_ef_es5) = eval.plotSimROC([l[2] for l in sim_ds], usr_results_es5,
            #                                  molName + "ElectroShape 5-d results",
            #                                  "es5_plot_"+molName + ".pdf")

            auc_es5 = eval.plotSimROC([l[2] for l in sim_ds], usr_results_es5,
                                      molName + " ElectroShape 5-d Sim ROC",
                                      "results/es5_sim_" + molName + ".pdf")
            auc_rank_es5 = eval.plotRankROC(
                [l[2] for l in sim_ds], usr_results_es5,
Esempio n. 4
0
def doMolGMM(molNdx):
    """Cross-validate, evaluate and persist a Gaussian mixture for one molecule.

    For every entry of the module-level ``datasetPortion`` the function

    1. loads a held-out test set (``portion * 0.2``) and a cross-validation
       set (``portion * 0.8``, excluding the test paths) of "esh"
       descriptors through ``cu.loadDescriptors``;
    2. runs a 5-fold cross-validation for each candidate number of mixture
       components, fitting on the first row of every active training record;
    3. refits with the best component count on a freshly loaded training
       set, scores the test set and plots the ROC curves;
    4. rewrites ``results/results_gmm_<molName>_LEC.txt`` and pickles a
       model fitted on the test and cross-validation records to
       ``results/<molName>_GMM_LEC.pkl``.

    Args:
        molNdx: Index into the module-level ``molfiles``.  Element ``[0]``
            of the entry is handed to ``cu.loadDescriptors`` and element
            ``[1]`` is used as the molecule name.

    Returns:
        None.
    """
    import pickle

    portionResults = []
    xvalResults = []
    molName = molfiles[molNdx][1]  # [molfiles[molNdx].rfind("/", 0, -1)+1:-1]

    descType = "esh"
    componentsValues = [1, 10, 50, 100, 1000]
    folds = 5

    for portion in datasetPortion:
        if portion <= 1:
            print("Loading " + str(portion * 100) + "% of " + molfiles[molNdx][1])
        else:
            print("Loading " + str(portion) + " actives from " + molfiles[molNdx][1])

        # NOTE(review): this call passes "SEPERATE" while the later ones pass
        # "SEPARATE" -- confirm which spelling cu.loadDescriptors expects.
        (test_ds, test_paths) = cu.loadDescriptors(molfiles[molNdx][0], portion * 0.2, dtype=descType,
                                                   active_decoy_ratio=-1, selection_policy="RANDOM",
                                                   return_type="SEPERATE")
        numcols = test_ds[0][0].shape[1] - 2

        (n_fold_ds, n_fold_paths) = cu.loadDescriptors(molfiles[molNdx][0], portion * 0.8, dtype=descType,
                                                       active_decoy_ratio=-1,
                                                       selection_policy="RANDOM", return_type="SEPARATE",
                                                       exclusion_list=test_paths)

        (folds_list, excl_list) = cu.split(n_fold_ds, folds, policy="RANDOM")

        componentResults = []

        for components in componentsValues:
            foldResults = []

            for fold in range(folds):
                val_ds = folds_list[fold]

                # First row of every record only.
                # NOTE(review): the first training fold contributes its
                # frames directly while later folds are appended as nested
                # lists -- presumably cu.joinDataframes flattens them; verify.
                train_parts = None
                for i in range(folds):
                    if i == fold:
                        continue
                    lecs = [r[0].iloc[0:1, :] for r in folds_list[i]]
                    if train_parts is None:
                        train_parts = lecs
                    else:
                        train_parts.append(lecs)

                train_ds = cu.joinDataframes(train_parts)

                numcols = train_ds.shape[1] - 2

                train_a = train_ds[train_ds["active"] == True]

                if len(train_a) <= components:
                    # Not enough active samples for this component count;
                    # the remaining folds are skipped as well.
                    print("Training samples(" + str(len(train_a)) + ") < GMM components(" + str(
                        components) + ") -> cannot train.")
                    break

                G_a = GaussianMixture(n_components=components, covariance_type="full").fit(
                    train_a.iloc[:, 0:numcols], train_a.iloc[:, numcols])

                results = pd.DataFrame()

                print(numcols)
                results["a_score"] = [G_a.score(x[0].iloc[:, 0:numcols]) for x in val_ds]
                results["truth"] = [x[2] for x in val_ds]

                auc = eval.plotSimROC(np.array(results["truth"]), np.array([results["a_score"]]), "", None)
                mean_ef = eval.getMeanEFs(np.array(results["truth"]), np.array([results["a_score"]]))

                foldResults.append((auc, mean_ef))

            print("X-Validation results, num components = " + str(components) + ": ")
            print(foldResults)

            if len(foldResults) > 0:
                fold_aucs = [x[0] for x in foldResults]
                fold_ef_1pc = [x[1][0.01] for x in foldResults]
                fold_ef_5pc = [x[1][0.05] for x in foldResults]

                mean_auc_sim = np.mean(fold_aucs)
                # Spread across folds; the std of the already-averaged scalar
                # would always be 0.
                std_auc_sim = np.std(fold_aucs)
                mean_mean_ef_1pc = np.mean(fold_ef_1pc)
                std_mean_ef_1pc = np.std(fold_ef_1pc)
                mean_mean_ef_5pc = np.mean(fold_ef_5pc)
                std_mean_ef_5pc = np.std(fold_ef_5pc)

                print("mean AUC=" + str(mean_auc_sim) +
                      ", std=" + str(std_auc_sim) +
                      ", mean EF(1%)=" + str(mean_mean_ef_1pc) +
                      ", std=" + str(std_mean_ef_1pc) +
                      ", mean EF(5%)=" + str(mean_mean_ef_5pc) +
                      ", std=" + str(std_mean_ef_5pc))

                componentResults.append((molName, portion, components, mean_auc_sim, std_auc_sim, mean_mean_ef_1pc,
                                         std_mean_ef_1pc, mean_mean_ef_5pc, std_mean_ef_5pc))
            else:
                print(
                    "X-Validation returned no results for " + str(components) + " components. Skipping training...")
                componentResults.append((molName, portion, components, 0, 0, 0, 0, 0, 0))

        xvalResults.extend(componentResults)

        # Find best score.  Index 5 of a result row is the mean EF(1%).
        ef_1pc_scores = [x[5] for x in componentResults]

        best_components = componentsValues[np.argmax(ef_1pc_scores)]
        print("Best-score compnents no.: " + str(best_components))

        (train_ds, train_paths) = cu.loadDescriptors(molfiles[molNdx][0], portion * 0.8, dtype=descType,
                                                     active_decoy_ratio=0,
                                                     selection_policy="RANDOM", return_type="SEPARATE",
                                                     exclusion_list=test_paths)

        # Extract the first row of every record and join them into a single
        # frame.  Plain ``frame[0:1, :]`` is not valid DataFrame indexing and
        # the ``.iloc`` calls below need a frame rather than a list.
        train_ds = cu.joinDataframes([x[0].iloc[0:1, :] for x in train_ds])

        if len(train_ds) > best_components:
            t0 = time.time()
            G_a = GaussianMixture(n_components=best_components, covariance_type="full").fit(
                train_ds.iloc[:, 0:numcols], train_ds.iloc[:, numcols])
            results = pd.DataFrame()
            results["a_score"] = [G_a.score(x[0].iloc[:, 0:numcols]) for x in test_ds]
            results["truth"] = [x[2] for x in test_ds]  # np.array(test_ds)[:, 2]

            # Labels and file names carry the component count actually used
            # (best_components), not the last value of the search loop.
            file_tag = str(best_components) + "_" + str(portion * 100) + "_LEC.pdf"
            auc = eval.plotSimROC(results["truth"], [results["a_score"]],
                                  molName + "[GMM-" + str(best_components) + " components(Similarity), " + str(
                                      portion * 100) + "%]",
                                  "results/" + molName + "_GMM_sim_" + file_tag)
            # Written to its own file so it no longer overwrites the
            # similarity plot.
            auc_rank = eval.plotRankROC(results["truth"], [results["a_score"]],
                                        molName + "[GMM-" + str(best_components) + " components(Rank), " + str(
                                            portion * 100) + "%]",
                                        "results/" + molName + "_GMM_rank_" + file_tag)
            mean_ef = eval.getMeanEFs(np.array(results["truth"]), np.array([results["a_score"]]))
            elapsed = time.time() - t0
        else:
            # Too few samples to fit: report zeros for every metric so the
            # result row below can still be built.
            auc = 0
            auc_rank = 0
            mean_ef = 0
            elapsed = 0

        print("Final results, num components = ", str(best_components) + ": ")
        print("AUC=" + str(auc))
        print("EF: ", mean_ef)

        portionResults.append((molName, portion, best_components, auc, auc_rank, mean_ef, elapsed))

        with open("results/results_gmm_" + molName + "_LEC.txt", 'w') as f1:
            print(xvalResults, file=f1)
            print(portionResults, file=f1)

        full_train_dss = [x[0].iloc[0:1, :] for x in test_ds]
        full_train_dss.append([x[0].iloc[0:1, :] for x in n_fold_ds])
        full_train_ds = cu.joinDataframes(full_train_dss)
        G_a = GaussianMixture(n_components=best_components, covariance_type="full").fit(
            full_train_ds.iloc[:, 0:numcols],
            full_train_ds.iloc[:, numcols])

        with open("results/" + molName + "_GMM_LEC.pkl", "wb") as mdlf:
            pickle.dump(G_a, mdlf)
        print("Saved model for " + molName + " to disk")
Esempio n. 5
0
def doMolGMM(molNdx):
    """Cross-validate and evaluate an MLPRegressor for one molecule.

    Despite its name this variant trains a neural-network regressor.  For
    every entry of the module-level ``datasetPortion`` it

    1. loads a held-out test set (``portion * 0.2``) and a cross-validation
       set (``portion * 0.8``, excluding the test paths) of "esh"
       descriptors through ``cu.loadDescriptors``;
    2. runs a 3-fold cross-validation, regressing on the "active" column
       scaled to 0/100;
    3. retrains on the whole cross-validation set, scores the test set and
       plots the ROC curves;
    4. rewrites ``results/results_ann_<molName>.txt`` with everything
       collected so far.

    Args:
        molNdx: Index into the module-level ``molfiles``.  Element ``[0]``
            of the entry is handed to ``cu.loadDescriptors`` and element
            ``[1]`` is used as the molecule name.

    Returns:
        None.
    """
    componentResults = []
    portionResults = []

    molName = molfiles[molNdx][1]  # [molfiles[molNdx].rfind("/", 0, -1)+1:-1]

    descType = "esh"
    folds = 3

    for portion in datasetPortion:
        t0 = time.time()
        if portion <= 1:
            print("Loading " + str(portion * 100) + "% of " +
                  molfiles[molNdx][1])
        else:
            print("Loading " + str(portion) + " actives from " +
                  molfiles[molNdx][1])

        # NOTE(review): this call passes "SEPERATE" while the next one passes
        # "SEPARATE" -- confirm which spelling cu.loadDescriptors expects.
        (test_ds, test_paths) = cu.loadDescriptors(molfiles[molNdx][0],
                                                   portion * 0.2,
                                                   dtype=descType,
                                                   active_decoy_ratio=-1,
                                                   selection_policy="RANDOM",
                                                   return_type="SEPERATE")
        numcols = test_ds[0][0].shape[1] - 2

        (n_fold_ds,
         n_fold_paths) = cu.loadDescriptors(molfiles[molNdx][0],
                                            portion * 0.8,
                                            dtype=descType,
                                            active_decoy_ratio=-1,
                                            selection_policy="RANDOM",
                                            return_type="SEPARATE",
                                            exclusion_list=test_paths)

        (folds_list, excl_list) = cu.split(n_fold_ds, folds, policy="RANDOM")

        foldResults = []

        for fold in range(folds):
            val_ds = folds_list[fold]

            # NOTE(review): the first training fold contributes its frames
            # directly while later folds are appended as nested lists --
            # presumably cu.joinDataframes flattens them; verify.
            train_parts = None
            for i in range(folds):
                if i == fold:
                    continue
                frames = [r[0] for r in folds_list[i]]
                if train_parts is None:
                    train_parts = frames
                else:
                    train_parts.append(frames)

            train_ds = cu.joinDataframes(train_parts)

            numcols = train_ds.shape[1] - 2

            ann = MLPRegressor(max_iter=1000, early_stopping=True)

            # Regression target: 100 for actives, 0 otherwise.
            ann.fit(train_ds.iloc[:, 0:numcols],
                    ((train_ds["active"])).astype(int) * 100)

            results = pd.DataFrame()

            # A record is scored by its best-scoring row.
            results["score"] = [
                max(ann.predict(x[0].iloc[:, 0:numcols])) for x in val_ds
            ]
            results["truth"] = [x[2] for x in val_ds]
            auc = eval.plotSimROC(np.array(results["truth"]),
                                  np.array([results["score"]]), "", None)
            mean_ef = eval.getMeanEFs(np.array(results["truth"]),
                                      np.array([results["score"]]))
            foldResults.append((auc, mean_ef))

        print("X-Validation results: ")
        print(foldResults)

        if len(foldResults) > 0:
            fold_aucs = [x[0] for x in foldResults]
            fold_ef_1pc = [x[1][0.01] for x in foldResults]
            fold_ef_5pc = [x[1][0.05] for x in foldResults]

            mean_auc_sim = np.mean(fold_aucs)
            # Spread across folds; the std of the already-averaged scalar
            # would always be 0.
            std_auc_sim = np.std(fold_aucs)
            mean_mean_ef_1pc = np.mean(fold_ef_1pc)
            std_mean_ef_1pc = np.std(fold_ef_1pc)
            mean_mean_ef_5pc = np.mean(fold_ef_5pc)
            std_mean_ef_5pc = np.std(fold_ef_5pc)

            print("mean AUC=" + str(mean_auc_sim) + ", std=" +
                  str(std_auc_sim) + ", mean EF(1%)=" + str(mean_mean_ef_1pc) +
                  ", std=" + str(std_mean_ef_1pc) + ", mean EF(5%)=" +
                  str(mean_mean_ef_5pc) + ", std=" + str(std_mean_ef_5pc))

            componentResults.append(
                (molName, portion, mean_auc_sim, std_auc_sim, mean_mean_ef_1pc,
                 std_mean_ef_1pc, mean_mean_ef_5pc, std_mean_ef_5pc))
        else:
            print("X-Validation returned no results. Skipping training...")
            componentResults.append((molName, portion, 0, 0, 0, 0, 0, 0))

        train_ds = cu.lumpRecords(n_fold_ds)
        ann = MLPRegressor(max_iter=1000, early_stopping=True)
        ann.fit(train_ds.iloc[:, 0:numcols],
                ((train_ds["active"])).astype(int) * 100)

        results = pd.DataFrame()

        results["score"] = [
            max(ann.predict(x[0].iloc[:, 0:numcols])) for x in test_ds
        ]
        results["truth"] = [x[2] for x in test_ds]  #np.array(test_ds)[:, 2]

        auc_sim = eval.plotSimROC(
            results["truth"], [results["score"]],
            molName + "[ANN, " + str(portion * 100) + "%]",
            "results/" + molName + "_ANN_sim_" + str(portion * 100) + ".pdf")
        auc_rank = eval.plotRankROC(
            results["truth"], [results["score"]],
            molName + "[ANN-" + str(portion * 100) + "%]",
            "results/" + molName + "_ANN_rank_" + str(portion * 100) + ".pdf")

        mean_ef = eval.getMeanEFs(np.array(results["truth"]),
                                  np.array([results["score"]]))

        # Report the test-set AUC, not the AUC left over from the last
        # cross-validation fold.
        print("AUC(Sim)=" + str(auc_sim))
        print("EF: ", mean_ef)
        t1 = time.time()

        portionResults.append(
            (molName, portion, auc_sim, auc_rank, mean_ef, (t1 - t0)))

        print("Time taken = " + str(t1 - t0))

        print(componentResults)
        print(portionResults)

        with open("results/results_ann_" + molName + ".txt", 'w') as f1:
            print(componentResults, file=f1)
            print(portionResults, file=f1)
Esempio n. 6
0
# Script fragment: runs USR similarity screening on Spark for every molecule
# in ``molfiles`` and evaluates it with ROC / enrichment metrics.  The snippet
# is cut off inside the ``except`` handler below.
molNdx=0

#(sim_ds, sim_paths) = cu.loadDescriptors(molfiles[molNdx][0], numActives, dtype="usr", active_decoy_ratio=-1, selection_policy="SEQUENTIAL", return_type="SEPARATE")
#(sim_es_ds, sim_paths_es) = cu.loadDescriptors(molfiles[molNdx][0], numActives, dtype="esh", active_decoy_ratio=-1, selection_policy="SEQUENTIAL", return_type="SEPARATE")
#(sim_es5_ds, sim_paths_es5) = cu.loadDescriptors(molfiles[molNdx][0], numActives, dtype="es5", active_decoy_ratio=-1, selection_policy="SEQUENTIAL", return_type="SEPARATE")

results = []
for molNdx in range(0, len(molfiles)):

    molName = molfiles[molNdx][1]
    try:
        print("Processing "+molfiles[molNdx][0])
        print("Processing USR")
        # A Spark context is created per molecule and stopped again right
        # after the screening run.
        # NOTE(review): if loading or screening raises, sc.stop() is never
        # reached -- consider try/finally.
        sc = initSpark()
        (sim_ds, sim_paths) = cu.loadDescriptors(molfiles[molNdx][0], numActives, dtype="usr", active_decoy_ratio=-1, selection_policy="SEQUENTIAL", return_type="SEPARATE")
        simobj = scls.USRMoleculeSim(sim_ds, sim_paths)
        usr_results = np.array(simobj.runSparkScreening(sc)).transpose()
        sc.stop()
        #plotSimROC(sim_ds, usr_results, "usr_plot_"+molfiles[molNdx][1]+".pdf")
        # Element [2] of every record is passed as the ground truth.
        auc_usr = eval.plotSimROC([l[2] for l in sim_ds], usr_results,
                                         molName + " USR Sim ROC",
                                         "usr_sim_"+molName + ".pdf")
        auc_rank_usr = eval.plotRankROC([l[2] for l in sim_ds], usr_results,
                                         molName + " USR Rank ROC",
                                         "usr_rank_"+molName + ".pdf")
        mean_ef_usr = eval.getMeanEFs([l[2] for l in sim_ds], usr_results)
    # NOTE(review): the bare except hides the actual error (and also catches
    # KeyboardInterrupt); the metrics are reset to 0 instead.
    except:
        print("Error processing USR for " + molfiles[molNdx][1])
        auc_usr=0
        auc_rank_usr=0