Exemple #1
0
def macro_bedroc(y_true, y_pred, a=20):
    """
    Compute the macro-averaged BEDROC score across all target labels.

    Every label column is scored independently with
    ML.Scoring.Scoring.CalcBEDROC from rdkit and the per-label scores are
    averaged. Labels whose known values sum to zero are left out of the
    average.

    Args:
        y_true: DataFrame of known labels; one row per compound and one
            column per target label
        y_pred: DataFrame of predicted label probabilities, indexable by the
            same column labels as y_true
        a: alpha value handed to CalcBEDROC. NOTE: scores are only comparable
            when they were computed with the same alpha
    Returns:
        The mean of the per-label BEDROC scores

    """
    per_label_scores = []
    for label in y_true:
        # Labels whose known values sum to zero are not scored
        if np.sum(y_true[label]) == 0:
            continue

        # Column order matters: CalcBEDROC reads the activity from column 1
        ranked = pd.DataFrame(
            {
                'proba': np.array(y_pred[label]),
                'active': np.array(y_true[label]),
            },
            columns=['proba', 'active'],
        ).sort_values(by='proba', ascending=False)

        per_label_scores.append(
            Scoring.CalcBEDROC(np.array(ranked), col=1, alpha=a))

    return np.mean(per_label_scores)
Exemple #2
0
 def test4(self):
     """ BEDROC: extreme rankings, invalid inputs and single-class lists """
     # best-case score list is expected to score ~1
     best = Scoring.CalcBEDROC(self.scoreBestCase, self.index, self.alpha)
     self.assertAlmostEqual(best, 1.0, self.acc)

     # worst-case score list is expected to score ~0
     worst = Scoring.CalcBEDROC(self.scoreWorstCase, self.index, self.alpha)
     self.assertAlmostEqual(worst, 0.0, self.acc)

     # an empty score list must be rejected
     with self.assertRaises(ValueError):
         Scoring.CalcBEDROC(self.scoreEmptyList, self.index, self.alpha)

     # alpha == 0.0 must be rejected
     with self.assertRaises(ValueError):
         Scoring.CalcBEDROC(self.scoreBestCase, self.index, 0.0)

     # only actives: exactly 1
     only_actives = Scoring.CalcBEDROC(self.scoreAllActives, self.index,
                                       self.alpha)
     self.assertEqual(only_actives, 1.0)

     # only decoys: exactly 0
     only_decoys = Scoring.CalcBEDROC(self.scoreAllDecoys, self.index,
                                      self.alpha)
     self.assertEqual(only_decoys, 0.0)
Exemple #3
0
def metrics_for_target(pred, actual, mask):
    """
    Compute ranking metrics for one target over the mask-selected entries.

    Args:
        pred: array of predicted scores; it is squeezed before the mask is
            applied
        actual: array of known values, indexed with the same boolean mask;
            the scoring functions are told to read column 0 of it
        mask: per-entry flags, converted to a boolean array; only entries
            whose flag is truthy take part in the metrics
    Returns:
        A list holding the CalcEnrichment results for the fractions
        0.001, 0.005, 0.01 and 0.05, followed by the CalcAUC value and the
        CalcBEDROC value (alpha = 20)
    """
    # The np.bool alias was removed in NumPy 1.24; builtin bool is the same
    # dtype and works on every NumPy version
    mask = np.array(mask, dtype=bool)
    masked_preds = pred.squeeze()[mask]

    # argsort is ascending, so flipping it ranks the highest score first
    order = np.flipud(np.argsort(masked_preds))
    masked_ordered_actual = actual[mask][order]

    enrichment = Scoring.CalcEnrichment(
        masked_ordered_actual, 0, [.001, .005, .01, .05])
    return enrichment + [
        Scoring.CalcAUC(masked_ordered_actual, 0),
        Scoring.CalcBEDROC(masked_ordered_actual, 0, 20)
    ]
def evaluate(activity_arr):
    """
    Print AUC, 1% and 5% enrichment, RIE and BEDROC for a ranked list.

    Args:
        activity_arr: ranked data handed unchanged to the rdkit Scoring
            functions, which are told to read column 0
    """
    print("AUC: ", Scoring.CalcAUC(activity_arr, 0))

    # One enrichment call per fraction; only the first returned value is shown
    for caption, fraction in (("EF for 1%: ", 0.01), ("EF for 5%: ", 0.05)):
        enrichment = Scoring.CalcEnrichment(activity_arr, 0, [fraction])
        print(caption, enrichment[0])

    print("RIE for 100: ", Scoring.CalcRIE(activity_arr, 0, 100))
    print("BEDROC for 100: ", Scoring.CalcBEDROC(activity_arr, 0, 100))
def evaluation(activity_arr: list, output_file: str):
    """
    Score a ranked activity list and store the metrics as a JSON object.

    Args:
        activity_arr: ranked data handed unchanged to the rdkit Scoring
            functions, which are told to read column 0
        output_file: path of the JSON file to write; its parent directory is
            created first through inputoutput_utils
    """
    inputoutput_utils.create_parent_directory(output_file)

    # Keys are evaluated top to bottom: AUC, 1% / 5% enrichment, RIE, BEDROC
    metrics = {
        "AUC": Scoring.CalcAUC(activity_arr, 0),
        "EF1": Scoring.CalcEnrichment(activity_arr, 0, [0.01])[0],
        "EF5": Scoring.CalcEnrichment(activity_arr, 0, [0.05])[0],
        "RIE": Scoring.CalcRIE(activity_arr, 0, 100),
        "BEDROC": Scoring.CalcBEDROC(activity_arr, 0, 100),
    }

    with open(output_file, "w", encoding="utf-8") as stream:
        json.dump(metrics, stream)
def evaluation(activity_arr: list, output_file: str):
    """
    Score a ranked activity list and write the metrics as a text report.

    The report has one "<label>: <value>" line each for AUC, the 1% and 5%
    enrichment factors, RIE (alpha = 100) and BEDROC (alpha = 100), with no
    trailing newline.

    Args:
        activity_arr: ranked data handed unchanged to the rdkit Scoring
            functions, which are told to read column 0
        output_file: path of the text file to (over)write
    """
    # Score before touching the file, so a scoring error cannot leave a
    # truncated or half-written report behind
    auc = Scoring.CalcAUC(activity_arr, 0)
    ef1 = Scoring.CalcEnrichment(activity_arr, 0, [0.01])
    ef5 = Scoring.CalcEnrichment(activity_arr, 0, [0.05])
    rie = Scoring.CalcRIE(activity_arr, 0, 100)
    bedroc = Scoring.CalcBEDROC(activity_arr, 0, 100)

    report = "\n".join([
        "AUC: " + str(auc),
        "EF for 1%: " + str(ef1[0]),
        "EF for 5%: " + str(ef5[0]),
        "RIE for 100: " + str(rie),
        "BEDROC for 100: " + str(bedroc),
    ])

    # Explicit encoding keeps the output independent of the platform default
    with open(output_file, "w", encoding="utf-8") as stream:
        stream.write(report)
Exemple #7
0
 def process_results(target):
     """
     Build the per-target result rows (AUC, enrichment factors, BEDROC).

     Args:
         target: target identifier; passed to build_rocs and stored as the
             first cell of every row
     Returns:
         None when build_rocs returns neither labels nor ROC objects,
         otherwise the tuple
         (header, auc_row, ef1_row, ef1r_row, bedroc_row, target), where
         header is the label list returned by build_rocs and each row is
         [target, value for first ROC, value for second ROC, ...]
     """
     # Single-argument print(...) behaves the same on Python 2 and Python 3
     print("###Checking target %s..." % target)

     # Plotting is requested only when the module-level writefile flag is off
     labels, rocs, _ = build_rocs(target, plot=not writefile)
     if labels is None and rocs is None:
         return None
     print("Data loaded")

     auc_row = [target]
     ef1_row = [target]
     ef1r_row = [target]
     bedroc_row = [target]
     for roc in rocs:
         auc_row.append(roc.auc())
         # NOTE(review): ef1_row is filled with get_EF(roc, 10) and ef1r_row
         # with get_EF(roc, 1) - confirm the row names match the intended
         # enrichment levels
         ef1_row.append(get_EF(roc, 10, relative=True))
         ef1r_row.append(get_EF(roc, 1, relative=True))
         bedroc_row.append(Scoring.CalcBEDROC(roc.data, 0, 20))
     return labels, auc_row, ef1_row, ef1r_row, bedroc_row, target
Exemple #8
0
def main():
    """
    Grid-search KNeighborsClassifier settings and save the test-set metrics.

    Command line:
        -X / --X_data: two feature files (.csv format)
        -y / --y_data: two label files (.csv format)
        -i / --input_directory: directory holding the input files
        -o / --output_directory: directory that receives KNN_opt_results.csv

    Each pair of file names is sorted; the first name is then read as the
    test split and the second as the training split. One row of metrics is
    written per parameter combination, followed by the parameter values.
    """
    # Local import: this entry point is the only place that builds file paths
    import os

    parser = argparse.ArgumentParser(description='Tune KNeighborsClassifier')
    # -X and -y are required: without them the sort below would fail on None
    parser.add_argument('-X',
                        '--X_data',
                        action='store',
                        nargs=2,
                        dest='X',
                        required=True,
                        help='Input features for the model (.csv format)')
    parser.add_argument('-y',
                        '--y_data',
                        action='store',
                        nargs=2,
                        dest='y',
                        required=True,
                        help='Target outputs for the model (.csv format)')
    parser.add_argument('-i',
                        '--input_directory',
                        action='store',
                        nargs=1,
                        dest='input',
                        default=['./'],
                        help='Directory where input files are stored')
    parser.add_argument('-o',
                        '--output_directory',
                        action='store',
                        nargs=1,
                        dest='output',
                        default=['./'],
                        help='Directory where output files should be written')
    args = vars(parser.parse_args())

    #Sort so that training and test data are in a predictable order
    args['X'].sort()
    args['y'].sort()

    # os.path.join also works when the directory lacks a trailing separator
    input_dir = args['input'][0]
    X_train = pd.read_csv(os.path.join(input_dir, args['X'][1])) \
        .drop(columns=['smiles'])
    y_train = pd.read_csv(os.path.join(input_dir, args['y'][1]))

    X_test = pd.read_csv(os.path.join(input_dir, args['X'][0])) \
        .drop(columns=['smiles'])
    y_test = pd.read_csv(os.path.join(input_dir, args['y'][0]))

    # use a full grid over all parameters
    param_grid = {
        'n_neighbors': [1, 5, 10],
        'metric': ['minkowski', 'jaccard']
    }
    # Fixed, explicit parameter order shared by the result rows and the
    # column names (no reliance on a loop variable surviving the loop)
    param_names = sorted(param_grid)
    metric_names = ['micro_AUROC', 'macro_AUROC', 'Frac_1_in_top10',
                    'Frac_all_in_top10', 'micro_AP', 'macro_AP',
                    'micro_BEDROC', 'macro_BEDROC', 'coverage']

    results = []
    for params in ParameterGrid(param_grid):
        clf = KNeighborsClassifier(n_jobs=-1,
                                   n_neighbors=params['n_neighbors'],
                                   metric=params['metric'])

        time_start = time.time()
        clf.fit(X_train, y_train)
        pred = clf.predict_proba(X_test)
        # Keep column 1 of each per-label probability array and lay the
        # labels out as columns
        pred = pd.DataFrame([proba_pair[:, 1] for proba_pair in pred]).T
        print('Training and prediction done! Time elapsed: {} seconds'
              .format(time.time() - time_start))

        ranking = get_ranking(y_test, pred)
        # NOTE(review): index 9 presumably selects the top-10 cut-off (the
        # values are stored under the *_in_top10 columns) - confirm
        tp_cmpd = true_positive_per_compound(ranking)[9]
        tp_all = true_positives_recovered(ranking)[9]

        micro_ap_score = skm.average_precision_score(y_test,
                                                     pred,
                                                     average='micro')
        macro_ap_score = macro_ap(y_test, pred)

        coverage = skm.coverage_error(y_test, pred)

        micro_auroc_score = skm.roc_auc_score(y_test, pred, average='micro')
        macro_auroc_score = macro_auroc(y_test, pred)

        # Micro BEDROC: pool every (probability, label) pair into one ranking
        scores = pd.DataFrame()
        scores['proba'] = np.array(pred).flatten()
        scores['active'] = np.array(y_test).flatten()
        scores.sort_values(by='proba', ascending=False, inplace=True)

        micro_bedroc_score = Scoring.CalcBEDROC(np.array(scores),
                                                col=1,
                                                alpha=20)
        macro_bedroc_score = macro_bedroc(y_test, pred)

        results.append([
            micro_auroc_score, macro_auroc_score, tp_cmpd, tp_all,
            micro_ap_score, macro_ap_score, micro_bedroc_score,
            macro_bedroc_score, coverage
        ] + [params[name] for name in param_names])

    results = pd.DataFrame(results, columns=metric_names + param_names)
    results.to_csv(os.path.join(args['output'][0], 'KNN_opt_results.csv'),
                   index=False)
 def calculate(self, score, index):
     """ Return one BEDROC value per entry of self.params, in the same order """
     # Each entry of self.params is passed as the third CalcBEDROC argument
     return [Scoring.CalcBEDROC(score, index, alpha) for alpha in self.params]
Exemple #10
0
def main():
    """
    Score prediction files against known labels, one result table per
    dataset.

    Command line:
        -P / --pred_data: prediction files (.csv format)
        -y / --y_data: known-label files (.csv format)
        -i / --input_directory: directory holding the input files
        -o / --output_directory: directory that receives the result tables

    For every label file the dataset name is the second '_'-separated token
    of the file name with the extension removed. Prediction files whose name
    contains that dataset name are evaluated, plus the mean of all of them
    except the one named 'stack'. The table is printed and written to
    <output>/<name>_results.csv.
    """
    # Local import: this entry point is the only place that builds file paths
    import os

    parser = argparse.ArgumentParser(description='Evaluate prediction results')
    # -P and -y are required: without them the code below would fail on None
    parser.add_argument('-P',
                        '--pred_data',
                        action='store',
                        nargs='*',
                        dest='P',
                        required=True,
                        help='Predicted targets for model (.csv format)')
    parser.add_argument('-y',
                        '--y_data',
                        action='store',
                        nargs='*',
                        dest='y',
                        required=True,
                        help='Known target values (.csv format)')
    parser.add_argument('-i',
                        '--input_directory',
                        action='store',
                        nargs=1,
                        dest='input',
                        default=['./'],
                        help='Directory where input files are stored')
    parser.add_argument('-o',
                        '--output_directory',
                        action='store',
                        nargs=1,
                        dest='output',
                        default=['./'],
                        help='Directory where output files should be written')
    args = vars(parser.parse_args())

    #Sort P arguments passed to keep result order consistent
    args['P'].sort()

    input_dir = args['input'][0]
    result_columns = [
        'Model', 'micro_AUROC', 'macro_AUROC', 'Frac_1_in_top10',
        'Frac_all_in_top10', 'micro_AP', 'macro_AP', 'micro_BEDROC',
        'macro_BEDROC', 'coverage'
    ]

    #Loop through all predictions to evaluate
    for y_file in args['y']:
        name = y_file.split('_')[1].split('.')[0]
        # os.path.join also works when the directory lacks a trailing
        # separator
        y = pd.read_csv(os.path.join(input_dir, y_file))

        #Collect predictions for corresponding dataset - e.g. train, test
        #Both sides are lower-cased so a dataset name containing capitals
        #can still match
        matching_files = [
            p_file for p_file in args['P'] if name.lower() in p_file.lower()
        ]

        #Generate dictionary to store predictions
        predictions = {}
        for p_file in matching_files:
            pred = pd.read_csv(os.path.join(input_dir, p_file))

            #Get classifier from file name
            clf_name = p_file.split('.')[0].split('_')[1]

            #Check for predictions which don't have the correct dimensions
            #This handles cases in which feature dimensions were used in
            #stacking that have different dimensions - e.g. MLP hidden layer
            if len(pred.columns) == len(y.columns):
                #Store classifier name and values in dict
                predictions[clf_name] = pred

        #Get values of base classifier predictions and compute mean predictions
        pred_base = [
            df.values for key, df in predictions.items() if key != 'stack'
        ]
        if pred_base:
            average_values = sum(pred_base) / len(pred_base)
            predictions['ConsensusAverage'] = pd.DataFrame(average_values)

        results = []
        for clf, pred in predictions.items():
            ranking = get_ranking(y, pred)
            # NOTE(review): index 9 presumably selects the top-10 cut-off
            # (the values are stored under the *_in_top10 columns) - confirm
            tp_cmpd = true_positive_per_compound(ranking)[9]
            tp_all = true_positives_recovered(ranking)[9]

            micro_ap_score = skm.average_precision_score(y,
                                                         pred,
                                                         average='micro')
            macro_ap_score = macro_ap(y, pred)

            coverage = skm.coverage_error(y, pred)

            micro_auroc_score = skm.roc_auc_score(y, pred, average='micro')
            macro_auroc_score = macro_auroc(y, pred)

            # Micro BEDROC: pool every (probability, label) pair into one
            # ranking
            scores = pd.DataFrame()
            scores['proba'] = np.array(pred).flatten()
            scores['active'] = np.array(y).flatten()
            scores.sort_values(by='proba', ascending=False, inplace=True)

            micro_bedroc_score = Scoring.CalcBEDROC(np.array(scores),
                                                    col=1,
                                                    alpha=20)
            macro_bedroc_score = macro_bedroc(y, pred)

            results.append([
                clf, micro_auroc_score, macro_auroc_score, tp_cmpd, tp_all,
                micro_ap_score, macro_ap_score, micro_bedroc_score,
                macro_bedroc_score, coverage
            ])

        # Naming the columns at construction also covers the case where no
        # prediction file qualified (header-only table instead of an error)
        results = pd.DataFrame(results, columns=result_columns)
        print(results)
        results.to_csv(os.path.join(args['output'][0], name + '_results.csv'),
                       index=False)
Exemple #11
0
def calcularBEDROC(llistaTuplesOrdenada):
    """
    Compute the BEDROC value (alpha = 20) for an ordered list of tuples.

    Each input tuple is reduced to the pair (1 - element 1, element 2);
    CalcBEDROC is then told to read column 1 of those pairs, i.e. element 2
    of the original tuples.

    Args:
        llistaTuplesOrdenada: ordered sequence of tuples with at least three
            elements each
    Returns:
        The value returned by Scoring.CalcBEDROC
    """
    scored_pairs = []
    for entry in llistaTuplesOrdenada:
        scored_pairs.append((1 - entry[1], entry[2]))

    return Scoring.CalcBEDROC(scored_pairs, 1, 20)