def logreg_results_to_pandas(common_molids_cache=False):
    """Gather the logistic regression results found on disk into a pandas DataFrame.

    Every result found under MALARIA_LOGREGS_EXPERIMENT_ROOT becomes one row,
    indexed by its ``root_key()``. A row holds a shallow copy of ``result.info()``
    plus a few ad-hoc columns: the result object itself, fold/AUC/enrichment
    summaries and the seed and size of the fingerprint folder.

    Parameters
    ----------
    common_molids_cache : bool, default False
        If True, the molecule ids are read once from the fingerprint providers
        and the very same dictionary is assigned to ``ids_cache`` of each result.

    Returns
    -------
    DataFrame
        One row per distinct ``root_key()``; a later result with a repeated key
        replaces the earlier one.
    """
    logreg_results = ResultInDisk.collect_results_under_dir(MALARIA_LOGREGS_EXPERIMENT_ROOT,
                                                            factory=malaria_result_factory)

    # --- Molids cache: read the ids only once and share them among all the results
    shared_molids = None
    if common_molids_cache:
        lab_provider, amb_provider, unl_provider, scr_provider = malaria_logreg_fpt_providers(None)
        shared_molids = {
            # Labelled molids
            'lab': lab_provider.ids(),
            'amb': amb_provider.ids(),  # To prioritize confirmatory tests on labelled data
            # Unlabelled molids
            'unl': unl_provider.ids(),
            'scr': scr_provider.ids(),
        }

    # --- One record per result, keyed by the result root key
    records = {}
    for res in logreg_results:
        if common_molids_cache:
            # Dodgy: mutates the result in place, rework with a copying constructor
            res.ids_cache = shared_molids

        record = copy(res.info())
        record['result'] = res
        if record['class_weight'] is None:
            record['class_weight'] = 'uniform'

        # Ad-hoc keys describing the model performance
        record['num_present_folds'] = res.num_present_folds()
        record['auc_mean'] = res.auc_mean()
        record['enrichement5_mean'] = res.enrichement5_mean()

        # Ad-hoc keys describing the fingerprint folder (sentinels when there is none)
        fpt_folder = res.fingerprint_folder()
        if fpt_folder is None:
            record['folder_seed'] = -1
            record['folder_size'] = 0
        else:
            record['folder_seed'] = int(fpt_folder.seed)
            record['folder_size'] = int(fpt_folder.fold_size)

        records[res.root_key()] = record

    # The outer keys become columns, so transpose to get one row per result
    return DataFrame(records).T
def trees_results_to_pandas(common_molids_cache=False):
    """Collects all the tree-ensemble results in disk into a pandas dataframe.

    Every result found under MALARIA_TREES_EXPERIMENT_ROOT becomes one row,
    indexed by its ``root_key()``. A row holds a shallow copy of ``result.info()``
    plus the result object itself and a few model columns parsed from
    ``result.model_setup_id()`` (number of trees, seed and model type).
    Allows convenient reporting, grouping and filtering of results.

    Parameters
    ----------
    common_molids_cache : bool, default False
        If True, the molecule ids are read once from the first result and the
        very same dictionary is assigned to ``ids_cache`` of each result.
        When no results are found there is nothing to cache and the flag is a no-op.

    Returns
    -------
    DataFrame
        One row per distinct ``root_key()``; empty if no results were found.
    """
    results = ResultInDisk.collect_results_under_dir(MALARIA_TREES_EXPERIMENT_ROOT,
                                                     factory=malaria_result_factory)

    # --- molids cache
    molids_cache = None
    # Guard against an empty collection: results[0] would raise IndexError otherwise
    if common_molids_cache and len(results) > 0:
        a_result = results[0]
        # Let's avoid the need to reread them...
        # 'lab' and 'amb' are the labelled molids, 'unl' and 'scr' the unlabelled ones
        molids_cache = {dset: a_result.ids(dset) for dset in ('lab', 'amb', 'unl', 'scr')}

    results_dict_of_dicts = {}
    for result in results:
        if molids_cache is not None:
            result.ids_cache = molids_cache    # dodgy, rework with a copying constructor
        rdict = copy(result.info())
        rdict['result'] = result
        # Some more ad-hoc keys for the model
        model_params = split_by(result.model_setup_id())[1]
        rdict['model_num_trees'] = int(model_params['num_trees'])
        rdict['model_seed'] = int(model_params['seed'])
        # NOTE(review): this relies on the truthiness of model_params['etc'];
        # if split_by yields strings, 'False' would also be truthy - confirm.
        rdict['model_type'] = 'ExtraTrees' if model_params['etc'] else 'RandomForest'
        # Add this result to the data frame
        results_dict_of_dicts[result.root_key()] = rdict

    # The outer keys become columns, so transpose to get one row per result
    return DataFrame(results_dict_of_dicts).T
def clean_results_pre_infojson_bug_fix():
    """Report the logreg results whose evaluation directory has no 'info.json' file.

    Each offending evaluation directory is passed to ``info`` in a 'Bye ...' message.
    Nothing is removed from disk by this function as written; it only reports.
    """
    logreg_results = ResultInDisk.collect_results_under_dir(MALARIA_LOGREGS_EXPERIMENT_ROOT,
                                                            factory=malaria_result_factory)

    # First pass: find every result lacking its info.json
    results_without_info = []
    for res in logreg_results:
        info_json = op.join(res.eval_dir, 'info.json')
        if not op.isfile(info_json):
            results_without_info.append(res)

    # Second pass: report them
    for res in results_without_info:
        info('Bye %s' % res.eval_dir)
def trees_molids(dset='lab'):
    """Return the molecule ids of dataset `dset`, as given by the first trees result in disk.

    Parameters
    ----------
    dset : string, default 'lab'
        Dataset identifier, forwarded as-is to ``ids(dset=...)`` of the result.

    Raises
    ------
    IndexError
        If no result is found under MALARIA_TREES_EXPERIMENT_ROOT.
    """
    # A single result is enough: at the moment the molids are guaranteed
    # to be the same across all the evaluations.
    tree_results = ResultInDisk.collect_results_under_dir(MALARIA_TREES_EXPERIMENT_ROOT)
    first_result = tree_results[0]
    return first_result.ids(dset=dset)