def optimizor(molecules, sort_order, ensemble_size, options):
    """
    Evaluate the performance of all ensembles of fixed size.

    Every ensemble returned by make_ensemble_list is scored with evaluate,
    either in this process or split across worker processes. The ensemble
    picked by screener.find_best_ensemble is written out with
    output.write_ensemble.

    :param molecules: forwarded to make_ensemble_list and evaluate
    :param sort_order: forwarded to evaluate
    :param ensemble_size: forwarded to make_ensemble_list
    :param options: must provide ncpu and score_field; also forwarded to
        evaluate, screener.find_best_ensemble and output.write_ensemble.
        A falsy ncpu means "use multiprocessing.cpu_count()".
    :return: (auc_list, ef_list), each holding one entry per evaluated
        ensemble, both in the same (dictionary iteration) order
    """

    # set variables
    ncpu = options.ncpu
    score_field = options.score_field

    # generate an exhaustive list of all possible ensembles
    ensemble_list = make_ensemble_list(molecules, score_field, ensemble_size)

    # set number of processors.
    if not ncpu:
        ncpu = multiprocessing.cpu_count()

    if ncpu > 1:
        # never ask for more workers than there are ensembles; cap before
        # printing so the message reports the count that is actually used
        ncpu = min(ncpu, len(ensemble_list))
        print("Determining the performance of {d} ensembles using {n} processors".format(d=len(ensemble_list), n=ncpu))

        jobs = []
        output_queue = multiprocessing.Queue()
        for ensemble_chunk in chunker(ensemble_list, ncpu):
            p = multiprocessing.Process(target=evaluate,
                                        args=(molecules, ensemble_chunk, sort_order, options, output_queue))
            jobs.append(p)
            p.start()

        # collect results into a dictionary: one queue item is read per
        # started worker (assumes each worker puts exactly one dict - TODO
        # confirm against evaluate). The queue is drained before join()
        # because joining a process that still has queued data can block.
        results = {}
        for _ in jobs:
            results.update(output_queue.get())

        # stop jobs
        for job in jobs:
            job.join()
    else:
        print("Determining the performance of {d} ensembles using {n} processor".format(d=len(ensemble_list), n=ncpu))
        results = evaluate(molecules, ensemble_list, sort_order, options)

    # peel away the best performing ensemble
    ensemble = screener.find_best_ensemble(results, options)

    # write out the best performing ensemble
    output.write_ensemble(list(ensemble), options)

    # temp 2/9/15 generate and return a list of auc values and ef at fpf = 0.001 to build up a histogram
    # Snapshot the stored results once so both lists share the same order.
    stored = list(results.values())
    # Largest 'ef' key of the first stored result (presumably the decoy
    # count - verify against EnsembleStorage); 0.1% of it selects the ef entry.
    nd = max(stored[0].ef.keys())
    n = int(round(0.001 * nd))
    ef_list = [es.get_prop(n, 'ef') for es in stored]
    auc_list = [es.get_prop('auc') for es in stored]
    return auc_list, ef_list
def optimizor(molecules, sort_order, ensemble_size, options):
    """
    Evaluate the performance of all ensembles of fixed size.

    Every ensemble returned by make_ensemble_list is scored with evaluate,
    either in this process or split across worker processes. The ensemble
    picked by screener.find_best_ensemble is written out with
    output.write_ensemble.

    :param molecules: forwarded to make_ensemble_list and evaluate
    :param sort_order: forwarded to evaluate
    :param ensemble_size: forwarded to make_ensemble_list
    :param options: must provide ncpu and score_field; also forwarded to
        evaluate, screener.find_best_ensemble and output.write_ensemble.
        A falsy ncpu means "use multiprocessing.cpu_count()".
    :return: (auc_list, ef_list), each holding one entry per evaluated
        ensemble, both in the same (dictionary iteration) order
    """

    # set variables
    ncpu = options.ncpu
    score_field = options.score_field

    # generate an exhaustive list of all possible ensembles
    ensemble_list = make_ensemble_list(molecules, score_field, ensemble_size)

    # set number of processors.
    if not ncpu:
        ncpu = multiprocessing.cpu_count()

    if ncpu > 1:
        # never ask for more workers than there are ensembles; cap before
        # printing so the message reports the count that is actually used
        ncpu = min(ncpu, len(ensemble_list))
        print("Determining the performance of {d} ensembles using {n} processors".format(d=len(ensemble_list), n=ncpu))

        jobs = []
        output_queue = multiprocessing.Queue()
        for ensemble_chunk in chunker(ensemble_list, ncpu):
            worker = multiprocessing.Process(target=evaluate,
                                             args=(molecules, ensemble_chunk, sort_order, options, output_queue))
            jobs.append(worker)
            worker.start()

        # collect results into a dictionary: one queue item is read per
        # started worker (assumes each worker puts exactly one dict - TODO
        # confirm against evaluate). The queue is drained before join()
        # because joining a process that still has queued data can block.
        results = {}
        for _ in jobs:
            results.update(output_queue.get())

        # stop jobs
        for worker in jobs:
            worker.join()
    else:
        print("Determining the performance of {d} ensembles using {n} processor".format(d=len(ensemble_list), n=ncpu))
        results = evaluate(molecules, ensemble_list, sort_order, options)

    # peel away the best performing ensemble
    ensemble = screener.find_best_ensemble(results, options)

    # write out the best performing ensemble
    output.write_ensemble(list(ensemble), options)

    # temp 2/9/15 generate and return a list of auc values and ef at fpf = 0.001 to build up a histogram
    # Snapshot the stored results once so both lists share the same order.
    stored = list(results.values())
    # Largest 'ef' key of the first stored result (presumably the decoy
    # count - verify against EnsembleStorage); 0.1% of it selects the ef entry.
    nd = max(stored[0].ef.keys())
    n = int(round(0.001 * nd))
    ef_list = [es.get_prop(n, 'ef') for es in stored]
    auc_list = [es.get_prop('auc') for es in stored]
    return auc_list, ef_list
def rank_queries(molecules, ensemble, sort_order, options):
    """
    Try adding each unused query to the ensemble and return the best result.

    :param molecules: sequence of molecules; the keys of molecules[0].scores
        supply the candidate queries, and the whole sequence is forwarded to
        classification.make_score_structure
    :param ensemble: queries already selected; it is copied, not modified
    :param sort_order: forwarded to classification.calculate_auc and
        classification.make_ef_structure
    :param options: forwarded to classification.make_fpfList and
        screener.find_best_ensemble
    :return: the ensemble chosen by screener.find_best_ensemble, as a list
    """

    # queries that are not part of the ensemble yet
    candidates = [name for name in molecules[0].scores.keys() if name not in ensemble]

    scored = {}
    for candidate in candidates:
        storage = EnsembleStorage()

        # the trial ensemble is the current ensemble plus this one candidate
        trial = ensemble[:]
        trial.append(candidate)
        trial = tuple(trial)
        storage.set_prop('ensemble', trial)

        # score the trial ensemble
        score_structure = classification.make_score_structure(molecules, trial)

        # area under the curve
        auc_structure = classification.make_auc_structure(score_structure)
        storage.set_prop('auc', classification.calculate_auc(auc_structure, sort_order, 'no stats'))

        # one enrichment factor per fpf value, skipped when no ef structure
        # could be built for that fpf
        for raw_fpf in classification.make_fpfList(options, score_structure):
            fpf = float(raw_fpf)
            ef_structure = classification.make_ef_structure(score_structure, fpf, sort_order)
            if not ef_structure:
                continue
            ef = classification.calculate_ef(ef_structure, fpf)
            storage.set_prop(ef[0], ef[1], 'ef')

        # keep the metrics, keyed by the trial ensemble
        scored[trial] = storage

    # peel away the best performing ensemble
    winner = screener.find_best_ensemble(scored, options)
    return list(winner)
def construct_ensemble(results, ensemble, options):
    """
    Move the best entry of results into the ensemble.

    Both arguments are modified in place: the first element of the winning
    key is appended to ensemble, and that key is removed from results.

    :param results: {'(query)' : EB.builder.utilities.ensemble_storage.EnsembleStorage object}
    :param ensemble: list of queries selected so far
    :param options: forwarded to screener.find_best_ensemble
    :return: a new list with the contents of the updated ensemble
    """

    winner = screener.find_best_ensemble(results, options)

    # the winning key is indexable; only its first element joins the ensemble
    ensemble.append(winner[0])

    # drop the winner so that it is not picked again from these results
    del results[winner]

    return list(ensemble)
def rank_queries(molecules, ensemble, sort_order, options):
    """
    Try adding each unused query to the ensemble and return the best result.

    :param molecules: sequence of molecules; the keys of molecules[0].scores
        supply the candidate queries, and the whole sequence is forwarded to
        classification.make_score_structure
    :param ensemble: queries already selected; it is copied, not modified
    :param sort_order: forwarded to classification.calculate_auc and
        classification.make_ef_structure
    :param options: forwarded to classification.make_fpfList and
        screener.find_best_ensemble
    :return: the ensemble chosen by screener.find_best_ensemble, as a list
    """

    # queries that are not part of the ensemble yet
    candidates = [name for name in molecules[0].scores.keys() if name not in ensemble]

    scored = {}
    for candidate in candidates:
        storage = EnsembleStorage()

        # the trial ensemble is the current ensemble plus this one candidate
        trial = ensemble[:]
        trial.append(candidate)
        trial = tuple(trial)
        storage.set_prop('ensemble', trial)

        # score the trial ensemble
        score_structure = classification.make_score_structure(molecules, trial)

        # area under the curve
        auc_structure = classification.make_auc_structure(score_structure)
        storage.set_prop('auc', classification.calculate_auc(auc_structure, sort_order, 'no stats'))

        # one enrichment factor per fpf value, skipped when no ef structure
        # could be built for that fpf
        for raw_fpf in classification.make_fpfList(options, score_structure):
            fpf = float(raw_fpf)
            ef_structure = classification.make_ef_structure(score_structure, fpf, sort_order)
            if not ef_structure:
                continue
            ef = classification.calculate_ef(ef_structure, fpf)
            storage.set_prop(ef[0], ef[1], 'ef')

        # keep the metrics, keyed by the trial ensemble
        scored[trial] = storage

    # peel away the best performing ensemble
    winner = screener.find_best_ensemble(scored, options)
    return list(winner)