Ejemplo n.º 1
0
def logreg_results_to_pandas(common_molids_cache=False):
    """Gather the logistic-regression results found on disk into a pandas DataFrame.

    Every result collected under MALARIA_LOGREGS_EXPERIMENT_ROOT becomes one
    row, indexed by ``result.root_key()``. The columns are the entries of
    ``result.info()`` plus a handful of derived keys ('result',
    'num_present_folds', 'auc_mean', 'enrichement5_mean', 'folder_seed',
    'folder_size').

    :param common_molids_cache: when true, the molecule ids of the four
        providers returned by ``malaria_logreg_fpt_providers(None)`` are read
        once and that single dictionary is assigned to the ``ids_cache``
        attribute of every result.
    :return: a DataFrame with one row per result.
    """
    results = ResultInDisk.collect_results_under_dir(MALARIA_LOGREGS_EXPERIMENT_ROOT,
                                                     factory=malaria_result_factory)

    # --- Shared molids cache: read the ids once so that results need not reread them
    shared_ids = None
    if common_molids_cache:
        rf_lab, rf_amb, rf_unl, rf_scr = malaria_logreg_fpt_providers(None)
        shared_ids = {
            'lab': rf_lab.ids(),   # labelled molids
            'amb': rf_amb.ids(),   # to prioritize confirmatory tests on labelled data
            'unl': rf_unl.ids(),   # unlabelled molids
            'scr': rf_scr.ids(),
        }

    records = {}
    for res in results:
        if common_molids_cache:
            # dodgy: the very same dict object is shared by all the results;
            # rework with a copying constructor
            res.ids_cache = shared_ids

        record = copy(res.info())
        record['result'] = res
        if record['class_weight'] is None:
            record['class_weight'] = 'uniform'

        # Ad-hoc keys describing the model performance
        record['num_present_folds'] = res.num_present_folds()
        record['auc_mean'] = res.auc_mean()
        record['enrichement5_mean'] = res.enrichement5_mean()

        # Ad-hoc keys describing the fingerprint folder; -1 / 0 stand for "no folder"
        fpt_folder = res.fingerprint_folder()
        if fpt_folder is None:
            seed, size = -1, 0
        else:
            seed, size = int(fpt_folder.seed), int(fpt_folder.fold_size)
        record['folder_seed'] = seed
        record['folder_size'] = size

        records[res.root_key()] = record

    # A dict of dicts gives one column per result; transpose to get one row per result
    return DataFrame(records).T
Ejemplo n.º 2
0
def trees_results_to_pandas(common_molids_cache=False):
    """Collects all the tree results in disk and places them in record-format in a pandas dataframe.
    Allows convenient reporting, grouping and filtering of results.

    Every result collected under MALARIA_TREES_EXPERIMENT_ROOT becomes one row,
    indexed by ``result.root_key()``. The columns are the entries of
    ``result.info()`` plus 'result', 'model_num_trees', 'model_seed' and
    'model_type'.

    :param common_molids_cache: when true, the molecule ids for the 'lab',
        'amb', 'unl' and 'scr' datasets are read once from the first result
        and that single dictionary is assigned to the ``ids_cache`` attribute
        of every result. If no results are found there is nothing to cache and
        an empty dataframe is returned.
    :return: a DataFrame with one row per result.
    """
    results = ResultInDisk.collect_results_under_dir(MALARIA_TREES_EXPERIMENT_ROOT,
                                                     factory=malaria_result_factory)

    # --- molids cache
    # Only build it if there is at least one result to read the ids from;
    # indexing an empty collection would raise IndexError.
    molids_cache = None
    if common_molids_cache and len(results) > 0:
        a_result = results[0]
        # 'lab' / 'amb': labelled molids ('amb' to prioritize confirmatory tests on labelled data)
        # 'unl' / 'scr': unlabelled molids
        # Let's avoid the need to reread them...
        molids_cache = {dset: a_result.ids(dset) for dset in ('lab', 'amb', 'unl', 'scr')}

    results_dict_of_dicts = {}
    for result in results:
        if molids_cache is not None:
            result.ids_cache = molids_cache    # dodgy, rework with a copying constructor
        rdict = copy(result.info())
        rdict['result'] = result
        # Some more ad-hoc keys for the model
        model_params = split_by(result.model_setup_id())[1]
        rdict['model_num_trees'] = int(model_params['num_trees'])
        rdict['model_seed'] = int(model_params['seed'])
        # NOTE(review): this relies on the truthiness of model_params['etc']; if split_by
        # yields string values, a 'False' string would still select 'ExtraTrees' - confirm.
        rdict['model_type'] = 'ExtraTrees' if model_params['etc'] else 'RandomForest'
        # Add this result to the data frame
        results_dict_of_dicts[result.root_key()] = rdict

    # A dict of dicts gives one column per result; transpose to get one row per result
    return DataFrame(results_dict_of_dicts).T
Ejemplo n.º 3
0
def clean_results_pre_infojson_bug_fix():
    """Report the logreg results whose evaluation directory has no 'info.json' file.

    All the results under MALARIA_LOGREGS_EXPERIMENT_ROOT are collected and, for
    each one lacking an 'info.json' in its ``eval_dir``, a 'Bye <eval_dir>'
    message is logged.

    NOTE(review): as written, this only logs the offending directories; nothing
    is removed from disk here.
    """
    def lacks_info_json(res):
        # True when the evaluation directory does not contain an info.json file
        return not op.isfile(op.join(res.eval_dir, 'info.json'))

    all_results = ResultInDisk.collect_results_under_dir(MALARIA_LOGREGS_EXPERIMENT_ROOT,
                                                         factory=malaria_result_factory)
    # Find all the offenders first, then report them
    orphans = list(filter(lacks_info_json, all_results))
    for orphan in orphans:
        info('Bye %s' % orphan.eval_dir)
Ejemplo n.º 4
0
def trees_molids(dset='lab'):
    """Return the molecule ids of dataset `dset` as given by the first trees result in disk.

    :param dset: name of the dataset whose ids are requested; it is forwarded
        as the ``dset`` keyword to the result's ``ids`` method.
    :return: whatever ``ids(dset=dset)`` returns for the first result collected
        under MALARIA_TREES_EXPERIMENT_ROOT.
    """
    # Any result will do: at the moment the ids are warranted to be the same
    # across all evaluations, so there is no need to go on a per-result basis.
    all_results = ResultInDisk.collect_results_under_dir(MALARIA_TREES_EXPERIMENT_ROOT)
    first_result = all_results[0]
    return first_result.ids(dset=dset)