def metrics_for_target(pred, actual, mask):
    # np.bool was removed in newer NumPy releases; plain bool behaves identically here.
    mask = np.array(mask, dtype=bool)
    masked_preds = pred.squeeze()[mask]
    order = np.flipud(np.argsort(masked_preds))
    masked_ordered_actual = actual[mask][order]
    return Scoring.CalcEnrichment(masked_ordered_actual, 0, [.001, .005, .01, .05]) + [
        Scoring.CalcAUC(masked_ordered_actual, 0),
        Scoring.CalcBEDROC(masked_ordered_actual, 0, 20)
    ]
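# --- Usage sketch (hypothetical toy arrays, not from the original sources).
# metrics_for_target expects per-compound predictions, the matching activity rows and a
# boolean mask selecting the compounds with known labels for this target; it returns
# [EF0.1%, EF0.5%, EF1%, EF5%, AUC, BEDROC(alpha=20)].
import numpy as np
from rdkit.ML.Scoring import Scoring

pred = np.array([0.9, 0.1, 0.8, 0.4, 0.3])          # predicted scores
actual = np.array([[1], [0], [1], [0], [0]])         # activity flag in column 0
mask = [True, True, True, True, False]               # compounds with known labels
print(metrics_for_target(pred, actual, mask))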
def evaluate(activity_arr):
    auc = Scoring.CalcAUC(activity_arr, 0)
    print("AUC: ", auc)
    ef = Scoring.CalcEnrichment(activity_arr, 0, [0.01])
    print("EF for 1%: ", ef[0])
    ef = Scoring.CalcEnrichment(activity_arr, 0, [0.05])
    print("EF for 5%: ", ef[0])
    rie = Scoring.CalcRIE(activity_arr, 0, 100)
    print("RIE for 100: ", rie)
    bedroc = Scoring.CalcBEDROC(activity_arr, 0, 100)
    print("BEDROC for 100: ", bedroc)
def evaluation(activity_arr: list, output_file: str):
    inputoutput_utils.create_parent_directory(output_file)
    auc = Scoring.CalcAUC(activity_arr, 0)
    ef1 = Scoring.CalcEnrichment(activity_arr, 0, [0.01])
    ef5 = Scoring.CalcEnrichment(activity_arr, 0, [0.05])
    rie = Scoring.CalcRIE(activity_arr, 0, 100)
    bedroc = Scoring.CalcBEDROC(activity_arr, 0, 100)
    output = {
        "AUC": auc,
        "EF1": ef1[0],
        "EF5": ef5[0],
        "RIE": rie,
        "BEDROC": bedroc
    }
    with open(output_file, "w", encoding="utf-8") as stream:
        json.dump(output, stream)
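# --- Usage sketch (hypothetical toy ranking, not from the original sources). The RDKit
# Scoring functions used by evaluate()/evaluation() above expect the rows to be ordered
# best-scored-first, with the activity flag (1 = active, 0 = decoy) in the column given
# by the second argument (0 in these helpers).
from rdkit.ML.Scoring import Scoring

toy_ranking = [[1], [1], [0], [1], [0], [0], [0], [0]]   # activity flag in column 0
print(Scoring.CalcAUC(toy_ranking, 0))                   # area under the ROC curve
print(Scoring.CalcEnrichment(toy_ranking, 0, [0.25]))    # enrichment factor at the top 25%
print(Scoring.CalcBEDROC(toy_ranking, 0, 20))            # BEDROC with alpha = 20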
def test3(self):
    """ test area under the curve (AUC) of ROC """
    # best case
    auc = Scoring.CalcAUC(self.scoreBestCase, self.index)
    self.assertAlmostEqual(auc, 1.0, self.acc)
    # worst case
    auc = Scoring.CalcAUC(self.scoreWorstCase, self.index)
    self.assertAlmostEqual(auc, 0.0, self.acc)
    # empty list
    self.assertRaises(ValueError, Scoring.CalcAUC, self.scoreEmptyList, self.index)
    # all actives
    auc = Scoring.CalcAUC(self.scoreAllActives, self.index)
    self.assertAlmostEqual(auc, 0.0, self.acc)
    # all decoys
    auc = Scoring.CalcAUC(self.scoreAllDecoys, self.index)
    self.assertAlmostEqual(auc, 0.0, self.acc)
def macro_bedroc(y_true, y_pred, a=20):
    """
    Helper function which calculates the macro-averaged BEDROC score using
    ML.Scoring.Scoring.CalcBEDROC from RDKit.

    Args:
        y_true: DataFrame of known labels; each row corresponds to a compound
            and each column corresponds to a target label
        y_pred: DataFrame of predicted label probabilities; rows and columns
            should match the known DataFrame
        a: alpha value for the BEDROC calculation. NOTE: only scores computed
            using the same alpha value can be compared

    Returns:
        The macro-averaged BEDROC score for the predicted labels
    """
    bedroc_scores = []
    for column in y_true:
        if np.sum(y_true[column]) != 0:
            scores = pd.DataFrame()
            scores['proba'] = np.array(y_pred[column])
            scores['active'] = np.array(y_true[column])
            scores.sort_values(by='proba', ascending=False, inplace=True)
            bedroc_scores.append(
                Scoring.CalcBEDROC(np.array(scores), col=1, alpha=a))
        else:
            continue
    macro_bedroc_score = np.mean(bedroc_scores)
    return macro_bedroc_score
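# --- Usage sketch for macro_bedroc (hypothetical toy data, not from the original
# sources): two compounds scored against three targets. Columns with no known actives
# are skipped, so target 't3' below contributes nothing to the mean.
import pandas as pd

y_true_toy = pd.DataFrame({'t1': [1, 0], 't2': [0, 1], 't3': [0, 0]})
y_pred_toy = pd.DataFrame({'t1': [0.9, 0.2], 't2': [0.1, 0.8], 't3': [0.3, 0.4]})
print(macro_bedroc(y_true_toy, y_pred_toy, a=20))   # mean of the per-target BEDROC scores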
def evaluation(activity_arr: list, output_file: str):
    with open(output_file, "w") as stream:
        auc = Scoring.CalcAUC(activity_arr, 0)
        stream.write("AUC: ")
        stream.write(str(auc))
        ef = Scoring.CalcEnrichment(activity_arr, 0, [0.01])
        stream.write("\nEF for 1%: ")
        stream.write(str(ef[0]))
        ef = Scoring.CalcEnrichment(activity_arr, 0, [0.05])
        stream.write("\nEF for 5%: ")
        stream.write(str(ef[0]))
        rie = Scoring.CalcRIE(activity_arr, 0, 100)
        stream.write("\nRIE for 100: ")
        stream.write(str(rie))
        bedroc = Scoring.CalcBEDROC(activity_arr, 0, 100)
        stream.write("\nBEDROC for 100: ")
        stream.write(str(bedroc))
def test4(self):
    """ test BEDROC """
    # best case
    bedroc = Scoring.CalcBEDROC(self.scoreBestCase, self.index, self.alpha)
    self.assertAlmostEqual(bedroc, 1.0, self.acc)
    # worst case
    bedroc = Scoring.CalcBEDROC(self.scoreWorstCase, self.index, self.alpha)
    self.assertAlmostEqual(bedroc, 0.0, self.acc)
    # empty list
    self.assertRaises(ValueError, Scoring.CalcBEDROC, self.scoreEmptyList, self.index,
                      self.alpha)
    # alpha == 0.0
    self.assertRaises(ValueError, Scoring.CalcBEDROC, self.scoreBestCase, self.index, 0.0)
    # all actives
    bedroc = Scoring.CalcBEDROC(self.scoreAllActives, self.index, self.alpha)
    self.assertEqual(bedroc, 1.0)
    # all decoys
    bedroc = Scoring.CalcBEDROC(self.scoreAllDecoys, self.index, self.alpha)
    self.assertEqual(bedroc, 0.0)
def test2(self):
    """ test RIE """
    ratio = float(self.numActives) / self.numMol
    # best case
    RIEmax = ((1 - math.exp(-self.alpha * ratio)) / (1 - math.exp(-self.alpha))) / ratio
    rie = Scoring.CalcRIE(self.scoreBestCase, self.index, self.alpha)
    self.assertAlmostEqual(rie, RIEmax, self.acc)
    # worst case
    RIEmin = ((1 - math.exp(self.alpha * ratio)) / (1 - math.exp(self.alpha))) / ratio
    rie = Scoring.CalcRIE(self.scoreWorstCase, self.index, self.alpha)
    self.assertAlmostEqual(rie, RIEmin, self.acc)
    # empty list
    self.assertRaises(ValueError, Scoring.CalcRIE, self.scoreEmptyList, self.index,
                      self.alpha)
    # alpha == 0
    self.assertRaises(ValueError, Scoring.CalcRIE, self.scoreBestCase, self.index, 0.0)
    # all decoys
    rie = Scoring.CalcRIE(self.scoreAllDecoys, self.index, self.alpha)
    self.assertEqual(rie, 0.0)
def main():
    args = parse_arguments()
    unp_id_list = [row[0] for row in read_csv(args.target_ids)]
    category_list = [row[1] for row in read_csv(args.target_ids)]
    plt.figure(figsize=(5, 5))
    for unp_id, category in zip(unp_id_list, category_list):
        print(unp_id)
        sub_dir_list = next(os.walk(os.path.join(args.result_dir)))[1]
        tprs = []
        mean_fpr = np.linspace(0, 1, 100)
        for sub_dir in sub_dir_list:
            scores_dir = os.path.join(args.result_dir, sub_dir, '{}'.format(unp_id))
            print(scores_dir)
            if os.path.isdir(scores_dir):
                os.chdir(scores_dir)
                for filename in os.listdir(scores_dir):
                    if filename.endswith('.csv'):
                        print(filename)
                        rows = read_csv(filename)
                        scores = []
                        for row in rows:
                            scores.append([row[0], int(row[1])])
                        fpr, tpr = Scoring.CalcROC(scores, 1)
                        tpr = np.array(tpr)
                        tprs.append(interp(mean_fpr, fpr, tpr))
        if tprs:
            mean_tpr = np.mean(tprs, axis=0)
            mean_tpr[-1] = 1.0
            if category == 'easy':
                plot_curve(fpr=mean_fpr, tpr=mean_tpr, color='blue')
            elif category == 'moderate':
                plot_curve(fpr=mean_fpr, tpr=mean_tpr, color='orange')
            elif category == 'hard':
                plot_curve(fpr=mean_fpr, tpr=mean_tpr, color='green')
            elif category == 'unfeasible':
                plot_curve(fpr=mean_fpr, tpr=mean_tpr, color='magenta')
        else:
            print("no info for target {}".format(unp_id))
    plt.savefig(os.path.join(args.output_dir, 'avg_roc.png'))
def test1(self):
    """ test enrichment factor """
    # best case
    enrich = Scoring.CalcEnrichment(self.scoreBestCase, self.index, self.fractions)
    self.assertAlmostEqual(enrich[0], float(self.numActives), self.acc)
    # worst case
    enrich = Scoring.CalcEnrichment(self.scoreWorstCase, self.index, self.fractions)
    self.assertAlmostEqual(enrich[0], 0.0, self.acc)
    # empty list
    self.assertRaises(ValueError, Scoring.CalcEnrichment, self.scoreEmptyList, self.index,
                      self.fractions)
    # all actives
    enrich = Scoring.CalcEnrichment(self.scoreAllActives, self.index, self.fractions)
    self.assertAlmostEqual(enrich[0], 1.0, self.acc)
    # all decoys
    enrich = Scoring.CalcEnrichment(self.scoreAllDecoys, self.index, self.fractions)
    self.assertEqual(enrich[0], 0.0)
    # fraction * numMol is smaller than 1
    enrich = Scoring.CalcEnrichment(self.scoreBestCase, self.index, self.fracSmall)
    self.assertAlmostEqual(enrich[0], float(self.numActives), self.acc)
    # fraction list is empty
    self.assertRaises(ValueError, Scoring.CalcEnrichment, self.scoreBestCase, self.index, [])
    # fraction == 0.0
    enrich = Scoring.CalcEnrichment(self.scoreBestCase, self.index, [0.0])
    self.assertAlmostEqual(enrich[0], float(self.numActives), self.acc)
    # fraction < 0
    self.assertRaises(ValueError, Scoring.CalcEnrichment, self.scoreBestCase, self.index,
                      [-0.05])
    # fraction > 1
    self.assertRaises(ValueError, Scoring.CalcEnrichment, self.scoreBestCase, self.index,
                      [1.5])
def main():
    args = parse_arguments()
    unp_id_list = [row[0] for row in read_csv(args.target_ids)]
    category_list = [row[1] for row in read_csv(args.target_ids)]
    plt.figure(figsize=(5, 5))
    for unp_id, category in zip(unp_id_list, category_list):
        scores_dir = os.path.join(args.result_dir, '{}'.format(unp_id))
        if os.path.isdir(scores_dir):
            os.chdir(scores_dir)
            for filename in os.listdir(scores_dir):
                if filename.endswith('.csv'):
                    print(filename)
                    rows = read_csv(filename)
                    scores = []
                    for row in rows:
                        scores.append([row[0], int(row[1])])
                    print(scores)
                    fpr, tpr = Scoring.CalcROC(scores, 1)
                    tpr = np.array(tpr)
                    print(unp_id)
                    if category == 'easy':
                        plot_curve(fpr=fpr, tpr=tpr, color='blue')
                    elif category == 'moderate':
                        plot_curve(fpr=fpr, tpr=tpr, color='orange')
                    elif category == 'hard':
                        plot_curve(fpr=fpr, tpr=tpr, color='green')
                    elif category == 'unfeasible':
                        plot_curve(fpr=fpr, tpr=tpr, color='magenta')
    plt.text(0.57, 0.05, args.label)
    plt.savefig(os.path.join(args.output_dir, 'mlt_roc.png'))
def process_results(target):
    header = []
    print("###Checking target %s..." % target)
    labels, rocs, results_dict = build_rocs(target, plot=not writefile)
    if labels is None and rocs is None:
        return None
    print("Data loaded")
    if not header:
        header = labels
    auc_row = [target]
    ef1_row = [target]
    ef1r_row = [target]
    bedroc_row = [target]
    for roc in rocs:
        auc_row.append(roc.auc())
        ef1_row.append(get_EF(roc, 10, relative=True))
        ef1r_row.append(get_EF(roc, 1, relative=True))
        bedroc_row.append(Scoring.CalcBEDROC(roc.data, 0, 20))
    return header, auc_row, ef1_row, ef1r_row, bedroc_row, target
def main():
    rows = read_csv('/home/amukhopadhyay/ligand_screener_testing/screening_scores.csv')
    scores = []
    for row in rows:
        scores.append([row[0], int(row[1])])
    # print(scores)

    # rdkit methods
    # fractions = [0.01, 0.05, 0.1]
    # print(Scoring.CalcAUC(scores, 1))
    # print(Scoring.CalcBEDROC(scores, 1, 20))
    # print(Scoring.CalcEnrichment(scores, 1, fractions))
    # print(Scoring.CalcRIE(scores, 1, 20))
    # print((Scoring.CalcAUC(scores, 1)))
    # print((Scoring.CalcROC(scores, 1)))

    rank_stats = StatisticalDescriptors.RankStatistics(
        scores, activity_column=operator.itemgetter(1))
    print(round(rank_stats.EF(0.01), 1))
    print(round(rank_stats.EF(0.02), 1))
    print(round(rank_stats.EF(0.05), 1))
    print(round(rank_stats.EF(0.1), 1))
    print(round(rank_stats.AUC(), 1))
    print(round(rank_stats.BEDROC(alpha=20), 1))
    print(round(rank_stats.RIE(alpha=20), 1))

    fpr, tpr = Scoring.CalcROC(scores, 1)
    roc_auc = metrics.auc(fpr, tpr)
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.savefig('test_roc.png')
def main():
    parser = argparse.ArgumentParser(description='Evaluate prediction results')
    parser.add_argument('-P', '--pred_data', action='store', nargs='*', dest='P',
                        help='Predicted targets for model (.csv format)')
    parser.add_argument('-y', '--y_data', action='store', nargs='*', dest='y',
                        help='Known target values (.csv format)')
    parser.add_argument('-i', '--input_directory', action='store', nargs=1, dest='input',
                        default=['./'], help='Directory where input files are stored')
    parser.add_argument('-o', '--output_directory', action='store', nargs=1, dest='output',
                        default=['./'], help='Directory where output files should be written')
    args = vars(parser.parse_args())

    # Sort P arguments passed to keep result order consistent
    args['P'].sort()

    # Loop through all predictions to evaluate
    for i in range(len(args['y'])):
        name = args['y'][i].split('_')[1]
        name = name.split('.')[0]
        y = pd.read_csv(args['input'][0] + args['y'][i])

        # Collect predictions for corresponding dataset - e.g. train, test
        name_index = [j for j, s in enumerate(args['P']) if name in s.lower()]

        # Generate dictionary to store predictions
        predictions = {}
        for j in name_index:
            pred = pd.read_csv(args['input'][0] + args['P'][j])
            # Get classifier from file name
            clf_name = args['P'][j].split('.')[0]
            clf_name = clf_name.split('_')[1]
            # Check for predictions which don't have the correct dimensions.
            # This handles cases in which feature dimensions were used in
            # stacking that have different dimensions - e.g. MLP hidden layer
            if len(pred.columns) == len(y.columns):
                # Store classifier name and values in dict
                predictions[clf_name] = pred

        # Get values of base classifier predictions and compute mean predictions
        pred_base = [
            df.values for key, df in predictions.items() if key not in ['stack']
        ]
        if pred_base:
            average_values = sum(pred_base) / len(pred_base)
            predictions['ConsensusAverage'] = pd.DataFrame(average_values)

        results = []
        for clf in predictions:
            pred = predictions[clf]
            ranking = get_ranking(y, pred)
            tp_cmpd = true_positive_per_compound(ranking)[9]
            tp_all = true_positives_recovered(ranking)[9]
            micro_ap_score = skm.average_precision_score(y, pred, average='micro')
            macro_ap_score = macro_ap(y, pred)
            coverage = skm.coverage_error(y, pred)
            micro_auroc_score = skm.roc_auc_score(y, pred, average='micro')
            macro_auroc_score = macro_auroc(y, pred)
            scores = pd.DataFrame()
            scores['proba'] = np.array(pred).flatten()
            scores['active'] = np.array(y).flatten()
            scores.sort_values(by='proba', ascending=False, inplace=True)
            micro_bedroc_score = Scoring.CalcBEDROC(np.array(scores), col=1, alpha=20)
            macro_bedroc_score = macro_bedroc(y, pred)
            results.append([
                clf, micro_auroc_score, macro_auroc_score, tp_cmpd, tp_all,
                micro_ap_score, macro_ap_score, micro_bedroc_score,
                macro_bedroc_score, coverage
            ])

        results = pd.DataFrame(results)
        results.columns = [
            'Model', 'micro_AUROC', 'macro_AUROC', 'Frac_1_in_top10',
            'Frac_all_in_top10', 'micro_AP', 'macro_AP', 'micro_BEDROC',
            'macro_BEDROC', 'coverage'
        ]
        print(results)
        results.to_csv(args['output'][0] + '/' + name + '_results.csv', index=False)
def run_ted(input_path, input_directory, prop, output_path):
    """
    Loads an .sdf file, converts the molecules into trees with graph annotations,
    runs the TED, evaluates the results and saves them into a file.

    :param input_path:
    :param input_directory:
    :param prop:
    :param output_path:
    :return:
    """
    with open(input_path) as input_stream:
        input_data = json.load(input_stream)

    # Load molecules and convert them to tree graphs.
    logging.info('Loading molecules ...')
    molecules = {}
    sizes = {}
    bondSizes = {}
    for file_item in input_data['files']:
        path = input_directory + file_item + '.sdf'
        logging.debug(path)
        if not os.path.exists(path):
            logging.error('Missing file: %s' % file_item)
            raise Exception('Missing file.')
        molecules.update(_load_molecules(path, sizes, bondSizes, prop))

    # Screening.
    logging.info('Screening ...')
    scores = []
    counter = 0
    counter_max = len(input_data['data']['test'])
    counter_step = math.floor(counter_max / 100.0) + 1
    time_begin = time.clock()
    for item in input_data['data']['test']:
        if item['name'] not in molecules:
            continue
        query = molecules[item['name']]
        query_size = sizes[item['name']]
        query_bonds = bondSizes[item['name']]
        # Count pairwise similarity with all actives and choose the maximum.
        maxsim = 0
        for active in input_data['data']['train']['ligands']:
            if active['name'] not in molecules:
                continue
            active_graph = molecules[active['name']]
            active_size = sizes[active['name']]
            active_bonds = bondSizes[active['name']]
            ted = _ted(query, active_graph, prop)
            sim = 1.00 - ted / float(query_size + active_size + query_bonds + active_bonds)
            if sim > maxsim:
                maxsim = sim
                minted = ted
        scores.append({
            'name': item['name'],
            'similarity': maxsim,
            'activity': item['activity'],
            'ted': minted
        })
        if counter % counter_step == 0:
            logging.debug('%d/%d', counter, counter_max)
            _flush_results(output_path, scores)
        counter += 1
        logging.debug('counter: ' + str(counter))
    time_end = time.clock()
    logging.debug("Reached the end.")

    # Evaluate screening.
    scores = sorted(scores, key=lambda m: m['similarity'], reverse=True)
    auc = Scoring.CalcAUC(scores, 'activity')
    ef = Scoring.CalcEnrichment(scores, 'activity', [0.005, 0.01, 0.02, 0.05])

    # Print results.
    print('AUC : ', auc)
    print('EF (0.5%, 1.0%, 2.0%, 5.0%) : ', ef)
    print('Execution time : %.2fs' % (time_end - time_begin))

    # Write result to a file.
    if output_path is not None and output_path != '':
        if not os.path.exists(os.path.dirname(output_path)):
            os.makedirs(os.path.dirname(output_path))
        with open(output_path, 'w') as output_stream:
            json.dump(
                {
                    'data': scores,
                    'metadata': {
                        'auc': auc,
                        'ef': {
                            '0.005': ef[0],
                            '0.01': ef[1],
                            '0.02': ef[2],
                            '0.05': ef[3]
                        },
                        'fileName': os.path.basename(__file__),
                        'executionTime': time_end - time_begin,
                        'definition': {
                            'selection': input_data['info']['selection'],
                            'molecules': input_data['info']['molecules'],
                            'index': input_data['info']['index'],
                            'dataset': input_data['info']['dataset'],
                            'method': input_data['info']['method'],
                            'config': 'config_file'
                        }
                    }
                },
                output_stream, indent=2)
def calculate(self, score, index):
    tmp = []
    for p in self.params:
        tmp.append(Scoring.CalcRIE(score, index, p))
    return tmp
def calculate(self, score, index):
    return Scoring.CalcEnrichment(score, index, self.params)
def calculate(self, score, index):
    return Scoring.CalcAUC(score, index)
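# --- Sketch of how the three calculate() methods above could be hosted. The enclosing
# classes are not shown in the original snippets; the names RIEMetric and AUCMetric below
# are hypothetical. Each wrapper carries its own parameters and exposes the same
# calculate(score, index) interface, so a caller can apply a list of metrics to one
# ranked score list.
from rdkit.ML.Scoring import Scoring

class RIEMetric:
    def __init__(self, params):
        self.params = params  # list of alpha values

    def calculate(self, score, index):
        return [Scoring.CalcRIE(score, index, p) for p in self.params]

class AUCMetric:
    def calculate(self, score, index):
        return Scoring.CalcAUC(score, index)

# toy ranked list, activity flag in column 0
ranked = [[1], [0], [1], [0], [0]]
for metric in (RIEMetric([20, 100]), AUCMetric()):
    print(metric.calculate(ranked, 0))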
train_fps += [fps_inact[j] for j in train_indices_inact]
ys_fit = [1] * len(train_indices_act) + [0] * len(train_indices_inact)
# train the model
ml = BernoulliNB()
ml.fit(train_fps, ys_fit)
# chemical similarity
simil = cPickle.load(infile)
# ranking
test_fps = [fps_act[j] for j in test_indices_act[i]]
test_fps += [fps_inact[j] for j in test_indices_inact[i]]
scores = [[pp[1], s[0], s[1]]
          for pp, s in zip(ml.predict_proba(test_fps), simil)]
# write ranks for actives
cf.writeActiveRanks(scores, rankfile, num_actives)
scores.sort(reverse=True)
# evaluation
auc = Scoring.CalcAUC(scores, -1)
ef = Scoring.CalcEnrichment(scores, -1, [0.05])
# write out
outfile.write("%i\t%.10f\t%.10f\n" % (i, auc, ef[0]))

infile.close()
rankfile.close()
outfile.close()
test_fps = [fps_act_morgan2[j] for j in test_indices_act[i]]
test_fps += [fps_inact_morgan2[j] for j in test_indices_inact[i]]
scores_rf_morgan2 = [[pp[1], s[0], s[1]]
                     for pp, s in zip(rf_morgan2.predict_proba(test_fps), simil)]
# assign ranks
scores_rf_rdk5 = cf.assignRanksWithInfo(scores_rf_rdk5)
scores_lr_rdk5 = cf.assignRanksWithInfo(scores_lr_rdk5)
scores_rf_morgan2 = cf.assignRanksWithInfo(scores_rf_morgan2)
# fusion
fusion_scores = []
for m1, m2, m3 in zip(scores_rf_rdk5, scores_lr_rdk5, scores_rf_morgan2):
    rank = max([m1[0], m2[0], m3[0]])   # max. rank
    proba = max([m1[1], m2[1], m3[1]])  # max. probability
    # store: [max rank, max proba, simil, info]
    fusion_scores.append([rank, proba, m1[2], m1[3]])
fusion_scores.sort(reverse=True)
# evaluation
auc = Scoring.CalcAUC(fusion_scores, -1)
ef = Scoring.CalcEnrichment(fusion_scores, -1, [0.05])
# write out
outfile.write("%i\t%.10f\t%.10f\n" % (i, auc, ef[0]))

infile1.close()
infile2.close()
outfile.close()
def calcularBEDROC(llistaTuplesOrdenada):
    llista_scores = [(1 - el[1], el[2]) for el in llistaTuplesOrdenada]
    bedroc = Scoring.CalcBEDROC(llista_scores, 1, 20)
    return bedroc
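# --- Usage sketch (hypothetical data; the original calling code is not shown).
# calcularBEDROC appears to expect a list of tuples already sorted by rank, where el[1]
# is a distance-like value that is converted to a similarity via 1 - el[1] and el[2] is
# the activity flag picked up by CalcBEDROC (column 1 after the rewrite).
from rdkit.ML.Scoring import Scoring

toy_tuples = [('mol1', 0.05, 1), ('mol2', 0.20, 1), ('mol3', 0.60, 0), ('mol4', 0.90, 0)]
print(calcularBEDROC(toy_tuples))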
def screening(input_dir, input_directory, config_file, output_path=None):
    """Perform a virtual screening.

    :param input_dir: path to input data (training and test in .json)
    :param input_directory: path to sdf files
    :param config_file: configuration file of mcs
    :param output_path: directory to save the results
    :return:
    """
    with open(input_dir) as input_stream:
        input_data = json.load(input_stream)

    # Load molecules.
    logging.info('Loading molecules ...')
    molecules = {}
    for file_item in input_data['files']:
        path = input_directory + file_item + '.sdf'
        if not os.path.exists(path):
            logging.error('Missing file: %s' % file_item)
            raise Exception('Missing file.')
        molecules.update(_load_molecules(path))

    # Create representation of active molecules.
    actives = []
    for active in input_data['data']['train']['ligands']:
        if active['name'] not in molecules:
            continue
        actives.append(molecules[active['name']])

    # Screening.
    logging.info('Screening ...')
    scores = []
    counter = 0
    inexact = 0
    counter_max = len(input_data['data']['test'])
    counter_step = math.floor(counter_max / 100.0) + 1
    params = mcsutils._parse_config(config_file)
    time_begin = time.clock()
    for item in input_data['data']['test']:
        if item['name'] not in molecules:
            continue
        query = molecules[item['name']]
        similarity = max([mcsutils._similarity(query, active, inexact,
                                               input_data['info'], params)
                          for active in actives])
        scores.append({
            'name': item['name'],
            'similarity': similarity,
            'activity': item['activity']
        })
        if counter % counter_step == 0:
            logging.debug('%d/%d', counter, counter_max)
            # _flush_results(output_path, scores)
        counter += 1
        # logging.debug('counter: ' + str(counter))
    time_end = time.clock()

    # Evaluate screening.
    scores = sorted(scores, key=lambda m: m['similarity'], reverse=True)
    auc = Scoring.CalcAUC(scores, 'activity')
    ef = Scoring.CalcEnrichment(scores, 'activity', [0.005, 0.01, 0.02, 0.05])

    # Print results.
    print('Input file: ', input_dir)
    print('Difficulty: ', input_directory)
    print('AUC : ', auc)
    print('EF (0.5%, 1.0%, 2.0%, 5.0%) : ', ef)
    print('Execution time : %.2fs' % (time_end - time_begin))

    # Write result to a file.
    if output_path is not None and output_path != '':
        if not os.path.exists(os.path.dirname(output_path)):
            os.makedirs(os.path.dirname(output_path))
        with open(output_path, 'w') as output_stream:
            json.dump({
                'data': scores,
                'metadata': {
                    'auc': auc,
                    'ef': {
                        '0.005': ef[0],
                        '0.01': ef[1],
                        '0.02': ef[2],
                        '0.05': ef[3]
                    },
                    'fileName': os.path.basename(__file__),
                    'executionTime': time_end - time_begin,
                    'inexactMolecules': inexact,
                    'definition': {
                        'selection': input_data['info']['selection'],
                        'molecules': input_data['info']['molecules'],
                        'index': input_data['info']['index'],
                        'dataset': input_data['info']['dataset'],
                        'method': input_data['info']['method'],
                        'config': 'config_file'
                    }
                }
            }, output_stream, indent=2)
def main():
    parser = argparse.ArgumentParser(description='Tune KNeighborsClassifier')
    parser.add_argument('-X', '--X_data', action='store', nargs=2, dest='X',
                        help='Input features for the model (.csv format)')
    parser.add_argument('-y', '--y_data', action='store', nargs=2, dest='y',
                        help='Target outputs for the model (.csv format)')
    parser.add_argument('-i', '--input_directory', action='store', nargs=1, dest='input',
                        default=['./'], help='Directory where input files are stored')
    parser.add_argument('-o', '--output_directory', action='store', nargs=1, dest='output',
                        default=['./'], help='Directory where output files should be written')
    args = vars(parser.parse_args())

    # Sort so that training and test data are in a predictable order
    args['X'].sort()
    args['y'].sort()

    X_train = pd.read_csv(args['input'][0] + args['X'][1]).drop(columns=['smiles'])
    y_train = pd.read_csv(args['input'][0] + args['y'][1])
    X_test = pd.read_csv(args['input'][0] + args['X'][0]).drop(columns=['smiles'])
    y_test = pd.read_csv(args['input'][0] + args['y'][0])

    # use a full grid over all parameters
    param_grid = {
        'n_neighbors': [1, 5, 10],
        'metric': ['minkowski', 'jaccard']
    }

    results = []
    for params in list(ParameterGrid(param_grid)):
        clf = KNeighborsClassifier(n_jobs=-1,
                                   n_neighbors=params['n_neighbors'],
                                   metric=params['metric'])
        time_start = time.time()
        clf.fit(X_train, y_train)
        pred = clf.predict_proba(X_test)
        pred = pd.DataFrame([proba_pair[:, 1] for proba_pair in pred]).T
        print('Training and prediction done! Time elapsed: '
              '{} seconds'.format(time.time() - time_start))

        ranking = get_ranking(y_test, pred)
        tp_cmpd = true_positive_per_compound(ranking)[9]
        tp_all = true_positives_recovered(ranking)[9]
        micro_ap_score = skm.average_precision_score(y_test, pred, average='micro')
        macro_ap_score = macro_ap(y_test, pred)
        coverage = skm.coverage_error(y_test, pred)
        micro_auroc_score = skm.roc_auc_score(y_test, pred, average='micro')
        macro_auroc_score = macro_auroc(y_test, pred)
        scores = pd.DataFrame()
        scores['proba'] = np.array(pred).flatten()
        scores['active'] = np.array(y_test).flatten()
        scores.sort_values(by='proba', ascending=False, inplace=True)
        micro_bedroc_score = Scoring.CalcBEDROC(np.array(scores), col=1, alpha=20)
        macro_bedroc_score = macro_bedroc(y_test, pred)
        results.append([
            micro_auroc_score, macro_auroc_score, tp_cmpd, tp_all, micro_ap_score,
            macro_ap_score, micro_bedroc_score, macro_bedroc_score, coverage
        ] + list(params.values()))

    results = pd.DataFrame(results)
    results.columns = ['micro_AUROC', 'macro_AUROC', 'Frac_1_in_top10',
                       'Frac_all_in_top10', 'micro_AP', 'macro_AP', 'micro_BEDROC',
                       'macro_BEDROC', 'coverage'] + list(params)
    results.to_csv(args['output'][0] + '/' + 'KNN_opt_results.csv', index=False)
def screening(input_path, input_directory, ged_results_file, output_path=None):
    """Perform a virtual screening.

    :param input_path: input .json file with basic screening params
    :param input_directory: directory with .sdf files
    :param ged_results_file: .json file with GED results and parameters
    :param output_path:
    :return:
    """
    with open(input_path) as input_stream:
        input_data = json.load(input_stream)

    # Load molecules.
    logging.info('Loading molecules ...')
    molecules = {}
    for file_item in input_data['files']:
        path = input_directory + file_item + '.sdf'
        if not os.path.exists(path):
            logging.error('Missing file: %s' % path)
            raise Exception('Missing file.')
        molecules.update(_load_molecules(path))

    # Parse GED results file.
    with open(ged_results_file) as ged_stream:
        ged_data = json.load(ged_stream)

    # Create representation of active molecules.
    actives = []
    for active in input_data['data']['train']['ligands']:
        if active['name'] not in molecules:
            continue
        actives.append(molecules[active['name']])

    # Screening.
    logging.info('Screening ...')
    scores = []
    counter = 0
    counter_max = len(input_data['data']['test'])
    counter_step = math.floor(counter_max / 100.0) + 1
    time_begin = time.clock()
    for item in input_data['data']['test']:
        if item['name'] not in molecules:
            continue
        query = molecules[item['name']]
        # Compute the similarity and search for the most similar active molecule.
        similarity = 0
        similarMol = query
        for active in actives:
            currentSimilarity = _ged_similarity(query, active, ged_data)
            if currentSimilarity > similarity:
                similarity = currentSimilarity
                similarMol = active
        scores.append({
            'name': item['name'],
            'similarity': similarity,
            'activity': item['activity'],
            'most-similar-active': similarMol.GetProp("_Name")
        })
        # if (item['activity'] == 1) create_picture(query, similar-active)
        if counter % counter_step == 0:
            logging.debug('%d/%d', counter, counter_max)
        counter += 1
        logging.debug('counter: ' + str(counter))
    time_end = time.clock()

    # Evaluate screening.
    scores = sorted(scores, key=lambda m: m['similarity'], reverse=True)
    auc = Scoring.CalcAUC(scores, 'activity')
    ef = Scoring.CalcEnrichment(scores, 'activity', [0.005, 0.01, 0.02, 0.05])

    # Print results.
    print('AUC : ', auc)
    print('EF (0.5%, 1.0%, 2.0%, 5.0%) : ', ef)
    total_time = float(ged_data["properties"]["time"]) / 1000
    total_time += (time_end - time_begin)
    print('Execution time : %.2fs' % total_time)

    # Write result to a file.
    if output_path is not None and output_path != '':
        if not os.path.exists(os.path.dirname(output_path)):
            os.makedirs(os.path.dirname(output_path))
        with open(output_path, 'w') as output_stream:
            json.dump(
                {
                    'properties': ged_data["properties"],
                    'data': scores,
                    'metadata': {
                        'auc': auc,
                        'ef': {
                            '0.005': ef[0],
                            '0.01': ef[1],
                            '0.02': ef[2],
                            '0.05': ef[3]
                        },
                        'fileName': os.path.basename(__file__),
                        'executionTime': total_time,
                        'definition': {
                            'selection': input_data['info']['selection'],
                            'molecules': input_data['info']['molecules'],
                            'index': input_data['info']['index'],
                            'dataset': input_data['info']['dataset'],
                            'method': input_data['info']['method'],
                            'config': 'config_file'
                        }
                    }
                },
                output_stream, indent=2)