def macro_bedroc(y_true, y_pred, a=20):
    """
    Helper function which calculates the macro averaged BEDROC score using
    ML.Scoring.Scoring.CalcBEDROC from rdkit

    Args:
        y_true: DataFrame of known labels, each row corresponds to a compound
            and each column corresponds to a target label
        y_pred: DataFrame of predicted label probabilities; rows and columns
            should match the known DataFrame
        a: alpha value for the BEDROC calculation. NOTE: only scores computed
            using the same alpha value can be compared

    Returns:
        The macro averaged BEDROC score for the predicted labels
    """
    bedroc_scores = []
    for column in y_true:
        # Skip targets with no known actives
        if np.sum(y_true[column]) != 0:
            scores = pd.DataFrame()
            scores['proba'] = np.array(y_pred[column])
            scores['active'] = np.array(y_true[column])
            # CalcBEDROC expects rows ranked by decreasing score
            scores.sort_values(by='proba', ascending=False, inplace=True)
            bedroc_scores.append(
                Scoring.CalcBEDROC(np.array(scores), col=1, alpha=a))
        else:
            continue
    macro_bedroc_score = np.mean(bedroc_scores)
    return macro_bedroc_score
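# A minimal usage sketch for the helper above, assuming the usual imports
# (numpy as np, pandas as pd, rdkit.ML.Scoring.Scoring). The two DataFrames
# are hypothetical: three compounds scored against two target labels.
import numpy as np
import pandas as pd
from rdkit.ML.Scoring import Scoring

y_true = pd.DataFrame({'target_A': [1, 0, 0], 'target_B': [0, 1, 1]})
y_pred = pd.DataFrame({'target_A': [0.9, 0.2, 0.1],
                       'target_B': [0.3, 0.8, 0.6]})

# Macro average of the per-target BEDROC scores at alpha = 20
print(macro_bedroc(y_true, y_pred, a=20))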
def test4(self):
    """ test BEDROC """
    # best case
    bedroc = Scoring.CalcBEDROC(self.scoreBestCase, self.index, self.alpha)
    self.assertAlmostEqual(bedroc, 1.0, self.acc)
    # worst case
    bedroc = Scoring.CalcBEDROC(self.scoreWorstCase, self.index, self.alpha)
    self.assertAlmostEqual(bedroc, 0.0, self.acc)
    # empty list
    self.assertRaises(ValueError, Scoring.CalcBEDROC, self.scoreEmptyList,
                      self.index, self.alpha)
    # alpha == 0.0
    self.assertRaises(ValueError, Scoring.CalcBEDROC, self.scoreBestCase,
                      self.index, 0.0)
    # all actives
    bedroc = Scoring.CalcBEDROC(self.scoreAllActives, self.index, self.alpha)
    self.assertEqual(bedroc, 1.0)
    # all decoys
    bedroc = Scoring.CalcBEDROC(self.scoreAllDecoys, self.index, self.alpha)
    self.assertEqual(bedroc, 0.0)
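# The fixtures referenced above come from the test case's setUp; a plausible,
# hypothetical version is sketched here. Each score list is a list of rows
# already ordered by decreasing score, and self.index points at the column
# holding the 0/1 activity flag.
def setUp(self):
    self.alpha = 20.0
    self.index = 0    # column that stores the activity label
    self.acc = 4      # decimal places for assertAlmostEqual
    actives = [[1]] * 10
    decoys = [[0]] * 90
    self.scoreBestCase = actives + decoys    # all actives ranked on top
    self.scoreWorstCase = decoys + actives   # all actives ranked at the bottom
    self.scoreEmptyList = []
    self.scoreAllActives = list(actives)
    self.scoreAllDecoys = list(decoys)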
def metrics_for_target(pred, actual, mask):
    # np.bool is deprecated in recent NumPy releases; use the builtin bool
    mask = np.array(mask, dtype=bool)
    masked_preds = pred.squeeze()[mask]
    # Rank the masked compounds by decreasing predicted score
    order = np.flipud(np.argsort(masked_preds))
    masked_ordered_actual = actual[mask][order]
    return Scoring.CalcEnrichment(
        masked_ordered_actual, 0, [.001, .005, .01, .05]) + [
            Scoring.CalcAUC(masked_ordered_actual, 0),
            Scoring.CalcBEDROC(masked_ordered_actual, 0, 20)
        ]
def evaluate(activity_arr):
    auc = Scoring.CalcAUC(activity_arr, 0)
    print("AUC: ", auc)
    ef = Scoring.CalcEnrichment(activity_arr, 0, [0.01])
    print("EF for 1%: ", ef[0])
    ef = Scoring.CalcEnrichment(activity_arr, 0, [0.05])
    print("EF for 5%: ", ef[0])
    rie = Scoring.CalcRIE(activity_arr, 0, 100)
    print("RIE for 100: ", rie)
    bedroc = Scoring.CalcBEDROC(activity_arr, 0, 100)
    print("BEDROC for 100: ", bedroc)
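# The activity_arr passed to the RDKit scoring functions is a list of rows
# ranked by decreasing score, where column 0 holds the 0/1 activity flag.
# A minimal sketch, assuming hypothetical (score, label) pairs from a
# virtual screen of 100 compounds:
import random
from rdkit.ML.Scoring import Scoring

random.seed(0)
# 10 hypothetical actives with generally higher scores, 90 decoys
screen = [(random.uniform(0.5, 1.0), 1) for _ in range(10)] + \
         [(random.uniform(0.0, 0.7), 0) for _ in range(90)]
ranked = sorted(screen, key=lambda pair: pair[0], reverse=True)
activity_arr = [[label] for _, label in ranked]  # keep only the label column

evaluate(activity_arr)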
def evaluation(activity_arr: list, output_file: str):
    inputoutput_utils.create_parent_directory(output_file)
    auc = Scoring.CalcAUC(activity_arr, 0)
    ef1 = Scoring.CalcEnrichment(activity_arr, 0, [0.01])
    ef5 = Scoring.CalcEnrichment(activity_arr, 0, [0.05])
    rie = Scoring.CalcRIE(activity_arr, 0, 100)
    bedroc = Scoring.CalcBEDROC(activity_arr, 0, 100)
    output = {
        "AUC": auc,
        "EF1": ef1[0],
        "EF5": ef5[0],
        "RIE": rie,
        "BEDROC": bedroc
    }
    with open(output_file, "w", encoding="utf-8") as stream:
        json.dump(output, stream)
def evaluation(activity_arr: list, output_file: str):
    with open(output_file, "w") as stream:
        auc = Scoring.CalcAUC(activity_arr, 0)
        stream.write("AUC: ")
        stream.write(str(auc))
        ef = Scoring.CalcEnrichment(activity_arr, 0, [0.01])
        stream.write("\nEF for 1%: ")
        stream.write(str(ef[0]))
        ef = Scoring.CalcEnrichment(activity_arr, 0, [0.05])
        stream.write("\nEF for 5%: ")
        stream.write(str(ef[0]))
        rie = Scoring.CalcRIE(activity_arr, 0, 100)
        stream.write("\nRIE for 100: ")
        stream.write(str(rie))
        bedroc = Scoring.CalcBEDROC(activity_arr, 0, 100)
        stream.write("\nBEDROC for 100: ")
        stream.write(str(bedroc))
def process_results(target):
    header = []
    print("###Checking target %s..." % target)
    labels, rocs, results_dict = build_rocs(target, plot=not writefile)
    if labels is None and rocs is None:
        return None
    print("Data loaded")
    if not header:
        header = labels
    auc_row = [target]
    ef1_row = [target]
    ef1r_row = [target]
    bedroc_row = [target]
    for roc in rocs:
        auc_row.append(roc.auc())
        ef1_row.append(get_EF(roc, 10, relative=True))
        ef1r_row.append(get_EF(roc, 1, relative=True))
        bedroc_row.append(Scoring.CalcBEDROC(roc.data, 0, 20))
    return header, auc_row, ef1_row, ef1r_row, bedroc_row, target
def main():
    parser = argparse.ArgumentParser(description='Tune KNeighborsClassifier')
    parser.add_argument('-X', '--X_data', action='store', nargs=2, dest='X',
                        help='Input features for the model (.csv format)')
    parser.add_argument('-y', '--y_data', action='store', nargs=2, dest='y',
                        help='Target outputs for the model (.csv format)')
    parser.add_argument('-i', '--input_directory', action='store', nargs=1,
                        dest='input', default=['./'],
                        help='Directory where input files are stored')
    parser.add_argument('-o', '--output_directory', action='store', nargs=1,
                        dest='output', default=['./'],
                        help='Directory where output files should be written')
    args = vars(parser.parse_args())

    # Sort so that training and test data are in a predictable order
    args['X'].sort()
    args['y'].sort()

    X_train = pd.read_csv(args['input'][0] + args['X'][1]) \
        .drop(columns=['smiles'])
    y_train = pd.read_csv(args['input'][0] + args['y'][1])
    X_test = pd.read_csv(args['input'][0] + args['X'][0]) \
        .drop(columns=['smiles'])
    y_test = pd.read_csv(args['input'][0] + args['y'][0])

    # Use a full grid over all parameters
    param_grid = {
        'n_neighbors': [1, 5, 10],
        'metric': ['minkowski', 'jaccard']
    }

    results = []
    for params in list(ParameterGrid(param_grid)):
        clf = KNeighborsClassifier(n_jobs=-1,
                                   n_neighbors=params['n_neighbors'],
                                   metric=params['metric'])
        time_start = time.time()
        clf.fit(X_train, y_train)
        pred = clf.predict_proba(X_test)
        # predict_proba returns one (n_samples, 2) array per label;
        # keep only the probability of the positive class
        pred = pd.DataFrame([proba_pair[:, 1] for proba_pair in pred]).T
        print('Training and prediction done! Time elapsed: '
              '{} seconds'.format(time.time() - time_start))

        ranking = get_ranking(y_test, pred)
        tp_cmpd = true_positive_per_compound(ranking)[9]
        tp_all = true_positives_recovered(ranking)[9]
        micro_ap_score = skm.average_precision_score(y_test, pred,
                                                     average='micro')
        macro_ap_score = macro_ap(y_test, pred)
        coverage = skm.coverage_error(y_test, pred)
        micro_auroc_score = skm.roc_auc_score(y_test, pred, average='micro')
        macro_auroc_score = macro_auroc(y_test, pred)

        # Micro BEDROC: pool all (probability, label) pairs into one ranking
        scores = pd.DataFrame()
        scores['proba'] = np.array(pred).flatten()
        scores['active'] = np.array(y_test).flatten()
        scores.sort_values(by='proba', ascending=False, inplace=True)
        micro_bedroc_score = Scoring.CalcBEDROC(np.array(scores), col=1,
                                                alpha=20)
        macro_bedroc_score = macro_bedroc(y_test, pred)

        results.append([
            micro_auroc_score, macro_auroc_score, tp_cmpd, tp_all,
            micro_ap_score, macro_ap_score, micro_bedroc_score,
            macro_bedroc_score, coverage
        ] + list(params.values()))

    results = pd.DataFrame(results)
    results.columns = ['micro_AUROC', 'macro_AUROC', 'Frac_1_in_top10',
                       'Frac_all_in_top10', 'micro_AP', 'macro_AP',
                       'micro_BEDROC', 'macro_BEDROC', 'coverage'] \
        + list(params)
    results.to_csv(args['output'][0] + '/' + 'KNN_opt_results.csv',
                   index=False)
def calculate(self, score, index):
    # Compute BEDROC once for each alpha value stored in self.params
    tmp = []
    for p in self.params:
        tmp.append(Scoring.CalcBEDROC(score, index, p))
    return tmp
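# A short standalone sketch of the same idea: scanning BEDROC over several
# alpha values for one ranked list. The ranked list and alpha values here are
# hypothetical; BEDROC scores are only comparable when computed with the same
# alpha.
from rdkit.ML.Scoring import Scoring

ranked_actives = [[1], [1], [0], [1], [0], [0], [0], [0], [0], [0]]
for alpha in (5.0, 20.0, 100.0):
    print(alpha, Scoring.CalcBEDROC(ranked_actives, 0, alpha))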
def main():
    parser = argparse.ArgumentParser(description='Evaluate prediction results')
    parser.add_argument('-P', '--pred_data', action='store', nargs='*',
                        dest='P',
                        help='Predicted targets for model (.csv format)')
    parser.add_argument('-y', '--y_data', action='store', nargs='*', dest='y',
                        help='Known target values (.csv format)')
    parser.add_argument('-i', '--input_directory', action='store', nargs=1,
                        dest='input', default=['./'],
                        help='Directory where input files are stored')
    parser.add_argument('-o', '--output_directory', action='store', nargs=1,
                        dest='output', default=['./'],
                        help='Directory where output files should be written')
    args = vars(parser.parse_args())

    # Sort P arguments passed to keep result order consistent
    args['P'].sort()

    # Loop through all predictions to evaluate
    for i in range(len(args['y'])):
        name = args['y'][i].split('_')[1]
        name = name.split('.')[0]
        y = pd.read_csv(args['input'][0] + args['y'][i])

        # Collect predictions for the corresponding dataset - e.g. train, test
        name_index = [j for j, s in enumerate(args['P']) if name in s.lower()]

        # Generate dictionary to store predictions
        predictions = {}
        for j in name_index:
            pred = pd.read_csv(args['input'][0] + args['P'][j])
            # Get classifier from file name
            clf_name = args['P'][j].split('.')[0]
            clf_name = clf_name.split('_')[1]
            # Check for predictions which don't have the correct dimensions.
            # This handles cases in which feature dimensions were used in
            # stacking that have different dimensions - e.g. MLP hidden layer
            if len(pred.columns) == len(y.columns):
                # Store classifier name and values in dict
                predictions[clf_name] = pred

        # Get values of base classifier predictions and compute mean predictions
        pred_base = [
            df.values for key, df in predictions.items()
            if key not in ['stack']
        ]
        if pred_base:
            average_values = sum(pred_base) / len(pred_base)
            predictions['ConsensusAverage'] = pd.DataFrame(average_values)

        results = []
        for clf in predictions:
            pred = predictions[clf]
            ranking = get_ranking(y, pred)
            tp_cmpd = true_positive_per_compound(ranking)[9]
            tp_all = true_positives_recovered(ranking)[9]
            micro_ap_score = skm.average_precision_score(y, pred,
                                                         average='micro')
            macro_ap_score = macro_ap(y, pred)
            coverage = skm.coverage_error(y, pred)
            micro_auroc_score = skm.roc_auc_score(y, pred, average='micro')
            macro_auroc_score = macro_auroc(y, pred)
            scores = pd.DataFrame()
            scores['proba'] = np.array(pred).flatten()
            scores['active'] = np.array(y).flatten()
            scores.sort_values(by='proba', ascending=False, inplace=True)
            micro_bedroc_score = Scoring.CalcBEDROC(np.array(scores), col=1,
                                                    alpha=20)
            macro_bedroc_score = macro_bedroc(y, pred)
            results.append([
                clf, micro_auroc_score, macro_auroc_score, tp_cmpd, tp_all,
                micro_ap_score, macro_ap_score, micro_bedroc_score,
                macro_bedroc_score, coverage
            ])

        results = pd.DataFrame(results)
        results.columns = [
            'Model', 'micro_AUROC', 'macro_AUROC', 'Frac_1_in_top10',
            'Frac_all_in_top10', 'micro_AP', 'macro_AP', 'micro_BEDROC',
            'macro_BEDROC', 'coverage'
        ]
        print(results)
        results.to_csv(args['output'][0] + '/' + name + '_results.csv',
                       index=False)
def calcularBEDROC(llistaTuplesOrdenada):
    # Build (score, activity) pairs: the score is 1 - el[1] and column 1 of
    # each pair holds the 0/1 activity flag used by CalcBEDROC (alpha = 20)
    llista_scores = [(1 - el[1], el[2]) for el in llistaTuplesOrdenada]
    bedroc = Scoring.CalcBEDROC(llista_scores, 1, 20)
    return bedroc
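# Usage sketch for the function above (the identifiers are Catalan:
# "calcular" = calculate, "llista de tuples ordenada" = ordered list of
# tuples). The tuple layout (id, distance, activity) is an assumption made
# for illustration: a distance-like value in [0, 1] at index 1 is converted
# to a score as 1 - distance, and index 2 holds the 0/1 activity flag. The
# list is assumed to be sorted by increasing distance, i.e. decreasing score.
llista = [('mol1', 0.05, 1), ('mol2', 0.10, 1), ('mol3', 0.40, 0),
          ('mol4', 0.80, 0), ('mol5', 0.95, 0)]  # hypothetical data
print(calcularBEDROC(llista))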