Beispiel #1
0
def __run_rf_single(df_x, s_y, ntree, mtry, node_size, mode, seed, set_label,
                    plot):
    """Fit one R ``randomForest`` model and return one importance per feature.

    Args:
        df_x: Feature table passed to R as ``x``. Its column labels are used
            as the ``Feature.Name`` values of the result.
        s_y: Response values passed to R as ``y``. Wrapped in an R factor
            when ``mode`` is ``'classification'``.
        ntree: Forwarded to ``randomForest`` as ``ntree``.
        mtry: Forwarded to ``randomForest`` as ``mtry``.
        node_size: Forwarded to ``randomForest`` as ``nodesize``.
        mode: ``'classification'`` selects the ``MeanDecreaseAccuracy``
            importance column; any other value selects ``IncNodePurity``.
        seed: Value handed to R's ``set.seed`` before the fit.
        set_label: Currently unused; only the disabled plotting call below
            refers to it. Kept so existing callers keep working.
        plot: Currently unused, for the same reason as ``set_label``.

    Returns:
        A data frame with exactly two columns: ``Feature.Name`` (the column
        labels of ``df_x``) and ``vim`` (the selected importance measure).
    """

    my_y = s_y
    if mode == 'classification':
        my_y = vectors.FactorVector(s_y)
    r('set.seed(' + str(seed) + ')')
    random_forest = importr('randomForest')
    data_rf = random_forest.randomForest(x=df_x,
                                         y=my_y,
                                         ntree=ntree,
                                         mtry=mtry,
                                         nodesize=node_size,
                                         importance=rinterface.TRUE,
                                         keep_forest=rinterface.TRUE,
                                         proximity=rinterface.TRUE)

    # TODO do we use these plots?
    # previous implementation would produce only one set of plots for
    # all iterations (the rest were overwritten due to filename
    # collisions), which makes me suspect we don't use them
    # if plot:
    #    plot.standard(data_rf, s_y, set_label)

    # data_rf is a list; we need the member named importance
    # if we convert it directly to a pd.DataFrame, we lose the col and row
    # names for some reason, so keep the rpy2 Matrix around and copy the
    # col names from it after the conversion
    m_vim = data_rf.rx2('importance')
    # reuse the matrix fetched above instead of extracting it a second time
    df_vim = pandas2ri.ri2py_dataframe(m_vim)
    df_vim.columns = m_vim.colnames
    # restore original names, which may have been mangled by R
    df_vim['row.names'] = df_x.columns.values
    # df_vim has one column "row.names" then...
    if mode == 'classification':
        # one column for each class, then "MeanDecreaseAccuracy"
        # and "MeanDecreaseGini"
        df_vim = df_vim[['row.names', 'MeanDecreaseAccuracy']]
    else:
        # "%IncMSE" and "IncNodePurity"
        df_vim = df_vim[['row.names', 'IncNodePurity']]
    df_vim.columns = ['Feature.Name', 'vim']
    return df_vim
Beispiel #2
0
import pytest
from rpy2.robjects import vectors
from rpy2.robjects.packages import importr
from rpy2.ipython import html

# Handle to R's 'base' package, imported once at module load; base.c is one
# of the parametrized inputs of the test below.
base = importr('base')


@pytest.mark.parametrize(
    'o,func', [(vectors.IntVector([1, 2, 3]), html.html_vector_horizontal),
               (vectors.FloatVector([1, 2, 3]), html.html_vector_horizontal),
               # Explicit commas between all three items: adjacent string
               # literals ('b' 'c') would silently concatenate into 'bc'.
               (vectors.StrVector(['a', 'b', 'c']),
                html.html_vector_horizontal),
               (vectors.FactorVector(['a', 'b', 'c']),
                html.html_vector_horizontal),
               (vectors.ListVector({
                   'a': 1,
                   'b': 2
               }), html.html_rlist),
               (vectors.DataFrame({
                   'a': 1,
                   'b': 'z'
               }), html.html_rdataframe),
               ('x <- c(1, 2, 3)', html.html_sourcecode),
               (base.c, html.html_ridentifiedobject)])
def test_html_func(o, func):
    """Check that each HTML helper returns a ``str`` for its paired input."""
    res = func(o)
    assert isinstance(res, str)
Beispiel #3
0
def process_refuse_output(refuse_dir, feature_metadata, group_metadata, df_x,
                          s_y, mode, data_label, ntree, mtry, iss, outer_reps,
                          inner_reps):
    """Load the REFUSE result CSVs from ``refuse_dir`` and plot them via R.

    The function sources ``vis.r`` from the directory containing this module,
    sets R's working directory to the current Python working directory, and
    then calls the R plotting functions for every result table that was
    found. A warning is printed for each missing table and for each file in
    ``refuse_dir`` that is not one of the expected outputs.

    Args:
        refuse_dir: Directory that is listed for the result CSV files.
        feature_metadata: Forwarded to ``fst_utils.add_feature_metadata``.
        group_metadata: Table with a ``color`` column; its index supplies the
            names of the colour palette handed to R.
        df_x: Feature table. Columns are selected from it by the index
            values of the retained rows of ``REFUSE_Table.csv``; its row
            count scales the reported sample count when ``iss`` is truthy.
        s_y: Response values, wrapped in an R factor for the summary plots.
        mode: Summary plots are only drawn when this is
            ``'classification'``.
        data_label: Label used in the run description and box-plot title.
        ntree: Shown in the run description.
        mtry: Shown in the run description.
        iss: Shown in the run description; when truthy the sample count is
            multiplied by the number of rows in ``df_x``.
        outer_reps: Shown in the run description; factor of the sample count.
        inner_reps: Factor of the sample count.

    Returns:
        None. All output is produced by the R functions and by ``print``.
    """

    run_info = data_label + "\nNtree: " + str(ntree) + "\nMtry: " + \
        str(mtry) + "\nISS: " + str(iss) + "\nOuter: " + str(outer_reps)

    # Result files that are loaded and plotted below.
    plotted_files = ('Data_Results.csv', 'Parameter_search_stats.csv',
                     'Parameter_search_storage.csv', 'REFUSE_Table.csv',
                     'vim_reps.csv')
    tables = {}
    for filename in os.listdir(refuse_dir):
        if filename in plotted_files:
            # Same settings the removed DataFrame.from_csv applied: the first
            # column is the index and date parsing of the index is attempted.
            tables[filename] = pd.read_csv(os.path.join(refuse_dir, filename),
                                           index_col=0,
                                           parse_dates=True)
        elif filename != 'REFUSE_structured.csv':
            # REFUSE_structured.csv is an expected output, but nothing in
            # this function consumes it, so it is neither read nor reported.
            print('WARNING: unexpected file ' + filename)

    data_results = tables.get('Data_Results.csv')
    parameter_search_stats = tables.get('Parameter_search_stats.csv')
    parameter_search_storage = tables.get('Parameter_search_storage.csv')
    refuse_table = tables.get('REFUSE_Table.csv')
    vim_reps = tables.get('vim_reps.csv')

    # make R load the vis script
    script_dir = os.path.dirname(os.path.realpath(__file__))
    ro.r.source(os.path.join(script_dir, 'vis.r'))

    # set the working directory for R
    ro.r.setwd(os.getcwd())

    # draw plots
    if parameter_search_stats is not None:
        ro.r['plot.parameter.search.stats'](parameter_search_stats)
    else:
        print("WARNING: Missing 'Parameter_search_stats.csv'")

    if parameter_search_storage is not None:
        ro.r['plot.parameter.search.growth'](parameter_search_storage)
    else:
        print("WARNING: Missing 'Parameter_search_storage.csv'")

    if data_results is not None:
        data_results = fst_utils.add_feature_metadata(data_results,
                                                      feature_metadata, 'name')
        ro.r['plot.compare.methods'](data_results)

        plot_importance = ro.r['plot.importance']
        plot_importance(vim=ro.FloatVector(data_results['vim']),
                        labels=ro.StrVector(data_results['name']),
                        group=ro.StrVector(data_results['segment3']),
                        main='Auto Segment 3')
        plot_importance(vim=ro.FloatVector(data_results['vim']),
                        labels=ro.StrVector(data_results['name']),
                        group=ro.StrVector(data_results['segment5']),
                        main='Auto Segment 5')
    else:
        print("WARNING: Missing 'Data_Results.csv'")

    if refuse_table is not None:
        refuse_table = fst_utils.add_feature_metadata(refuse_table,
                                                      feature_metadata)
        if mode == 'classification':
            threshold = 0.6
            keepers = refuse_table[refuse_table['p_val'] > threshold]
            if len(keepers) > 0:
                df_x_best = df_x[keepers.index.values]
                my_y = vectors.FactorVector(s_y)
                # make summary plots
                ro.r['plot.summary.vis'](df_x_best, my_y)
                # plot small multiples
                ro.r['small.multiples'](df_x_best, my_y)
            else:
                print("WARNING: 0 features with p_val > " + str(threshold) +
                      "; skipping summary plots")

        palette = ro.StrVector(group_metadata['color'].tolist())
        palette.names = ro.StrVector(group_metadata.index.tolist())
        # make stability plots
        plot_stability = ro.r['plot.stability']
        plot_stability(importance=ro.FloatVector(refuse_table['VIM_mean']),
                       stability=ro.FloatVector(refuse_table['VIM_var']),
                       labels=ro.StrVector(refuse_table.index.values),
                       factor_colour=ro.StrVector(refuse_table['group']),
                       palette=palette,
                       main="Feature Importance vs Sensitivity",
                       info=run_info)
        plot_stability(importance=ro.FloatVector(refuse_table['p_val']),
                       stability=ro.FloatVector(refuse_table['VIM_var']),
                       labels=ro.StrVector(refuse_table.index.values),
                       factor_colour=ro.StrVector(refuse_table['group']),
                       palette=palette,
                       main="Feature p-value vs Sensitivity",
                       xlabel="Likelihood that a Feature is Relevant",
                       info=run_info)

        # the following portion requires refuse_table and vim_reps
        if vim_reps is not None:
            # make box plots
            plot_importance_box = ro.r['plot.importance.box']
            # in addition to vim_reps DF, need to pass version of refuse_table
            # restricted to the features in vim_reps, highest p_val first;
            # sort_values replaces the removed DataFrame.sort(columns=...)
            vim_reps_supplement = refuse_table.loc[
                vim_reps.columns.values,
                ['group', 'p_val']].sort_values(by='p_val', ascending=False)
            samples = outer_reps * inner_reps
            if iss:
                samples *= len(df_x.index)
            plot_importance_box(
                df_vim_reps=vim_reps,
                df_supplement=vim_reps_supplement,
                palette=palette,
                title=data_label + ' Feature Importance Box Plot',
                type="box",
                info=run_info + "\n# of samples: " + str(samples))
        else:
            print("WARNING: Missing 'vim_reps.csv'")
    else:
        print("WARNING: Missing 'REFUSE_Table.csv'")