def test_pipeline_feature_selection():
    '''Append a variance_selection (VarianceThreshold) feature_selection step
    to each train/predict action's pipeline and check that columns are dropped.'''
    tag = selection_name = 'variance_selection'
    config = copy.deepcopy(BASE)
    with tmp_dirs_context(tag) as (train_path, predict_path, cwd):
        for idx, action in enumerate(config['run']):
            if 'train' in action or 'predict' in action:
                train_name = action.get('train', action.get('predict'))
                if 'pipeline' in action:
                    if not isinstance(action['pipeline'], (list, tuple)):
                        action['pipeline'] = config['pipelines'][
                            action['pipeline']]
                    action['pipeline'] += [{
                        'feature_selection': selection_name
                    }]
                else:
                    action['pipeline'] = [{
                        'feature_selection': selection_name
                    }]

                config2 = ConfigParser(config=BASE)
                config2.feature_selection[selection_name] = {
                    'method': 'VarianceThreshold',
                    'score_func': None,
                    'threshold': 0.08,
                }
                X = sampler()
                steps = pipeline.make_pipeline_steps(config2,
                                                     action['pipeline'])
                pipe = Pipeline(steps)
                for repeats in range(5):
                    XX, _, _ = pipe.fit_transform(X)
                    # the appended variance_selection step should drop
                    # low-variance columns, leaving fewer than 40
                    assert XX.flat.shape[1] < 40
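For reference, the scikit-learn transformer behind 'variance_selection' behaves like this minimal standalone sketch (a plain numpy array stands in for the elm sample; the shape and threshold are illustrative):

import numpy as np
from sklearn.feature_selection import VarianceThreshold

rng = np.random.RandomState(0)
X = rng.uniform(0.0, 2.0, size=(100, 40))        # variance ~0.33 per column
X[:, :5] = rng.uniform(0.0, 0.1, size=(100, 5))  # variance ~0.001, below threshold
selector = VarianceThreshold(threshold=0.08)
X_reduced = selector.fit_transform(X)
assert X_reduced.shape[1] < 40   # the low-variance columns are gone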
Example #2
def run_one_config(args=None,
                   sys_argv=None,
                   return_0_if_ok=True,
                   config_dict=None,
                   client=None,
                   started=None):
    '''Parse and run one elm config, opening a dask client when one is
    not supplied; returns 0 by default, or the run's return values.'''
    started = started or datetime.datetime.now()
    args = args or cli(args=None, sys_argv=sys_argv)
    config_dict = config_dict or args.config
    with try_finally_log_etime(started) as _:
        config = ConfigParser(config_dict, cmd_args=args)
        if client is None:
            if args.echo_config:
                logger.info(str(config))
            dask_client = getattr(config, 'DASK_CLIENT', 'SERIAL')
            dask_scheduler = getattr(config, 'DASK_SCHEDULER', None)
            with warnings.catch_warnings():
                # scikit-learn has a number
                # of deprecation warnings for kmeans
                warnings.simplefilter("ignore")
                with client_context(dask_client, dask_scheduler) as client:
                    return_values = parse_run_config(config, client)
        else:
            return_values = parse_run_config(config, client)

    if return_0_if_ok:
        return 0
    return return_values
Example #3
def tst_one_pipeline(pipeline,
                     add_na_per_band=0,
                     na_fields_as_str=True,
                     delim='_'):
    '''Build a random elm store, optionally inject missing / invalid
    sentinel values per band, then fit_transform the given pipeline.'''
    from elm.sample_util.sample_pipeline import make_pipeline_steps
    sample = random_elm_store()
    if add_na_per_band:
        for idx, band in enumerate(sample.data_vars):
            band_arr = getattr(sample, band)
            val = band_arr.values
            inds = np.arange(val.size)
            np.random.shuffle(inds)
            # map shuffled flat indices to row (y) and column (x) positions
            x = inds // val.shape[0]
            y = inds % val.shape[0]
            # first half of the chosen cells get the band's missing_value sentinel
            slc = slice(None, add_na_per_band // 2)
            val[y[slc], x[slc]] = 99 * idx
            band_arr.attrs['missing{}value'.format(delim)] = 99 * idx
            # second half get a value inside the band's invalid_range
            slc = slice(add_na_per_band // 2, add_na_per_band)
            val[y[slc], x[slc]] = 199 * idx
            band_arr.attrs['invalid{}range'.format(delim)] = [198 * idx, 200 * idx]
            band_arr.attrs['valid{}range'.format(delim)] = [-1e12, 1e12]
            if na_fields_as_str:
                for field in ('missing{}value', 'invalid{}range', 'valid{}range'):
                    field = field.format(delim)
                    v = band_arr.attrs[field]
                    if isinstance(v, list):
                        band_arr.attrs[field] = ', '.join(map(str,v))
                    else:
                        band_arr.attrs[field] = str(v)
            # the injected sentinels are finite; no NaNs should be present yet
            assert not np.isnan(val).any()
    config = ConfigParser(config=make_config(pipeline, data_source))
    pipe = Pipeline(make_pipeline_steps(config, pipeline))
    new_es = pipe.fit_transform(sample)
    return sample, new_es[0]
Example #4
def tst_bad_config(bad_config):
    '''Dump bad_config to a temp file and confirm that ConfigParser
    rejects it, then return a clean copy of DEFAULTS for the next test.'''
    tmp = None
    try:
        tmp, config_file = dump_config(bad_config)
        with pytest.raises(ElmConfigError):
            ConfigParser(config_file_name=config_file)
    finally:
        if tmp and os.path.exists(tmp):
            shutil.rmtree(tmp)
    # return the ok version for next test
    ok_config = copy.deepcopy(DEFAULTS)
    tmp, config_file = dump_config(ok_config)
    try:
        ConfigParser(config_file_name=config_file)  # confirm it is okay for the next test
    finally:
        shutil.rmtree(tmp)
    return ok_config
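A hypothetical driver for tst_bad_config, assuming that replacing the 'run' list in DEFAULTS with a scalar violates the elm config schema:

bad = copy.deepcopy(DEFAULTS)
bad['run'] = 'not-a-list'        # hypothetical schema violation
ok_config = tst_bad_config(bad)  # ConfigParser should raise ElmConfigError on it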
Example #5
def set_key_tst_bad_config_once(key, bad):
    '''Walk the nested key path into the config, set it to a bad value,
    and confirm that ConfigParser raises ElmConfigError.'''
    config2 = yaml.safe_load(CONFIG_STR)
    d = config2
    for k in key[:-1]:
        d = d[k]
    d[key[-1]] = bad
    with pytest.raises(ElmConfigError):
        ConfigParser(config=config2)
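A usage sketch for set_key_tst_bad_config_once; the key path below is hypothetical and only needs to name some nested entry of CONFIG_STR:

# replace a nested config value with something invalid and expect ElmConfigError
set_key_tst_bad_config_once(('train', 'kmeans', 'model_init_class'), 9999)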
Example #6
def _setup(config=None):
    '''Return the parsed config, a Pipeline built from its first run action,
    and the evolutionary param_grid mapping from ea_setup'''
    from elm.sample_util.sample_pipeline import make_pipeline_steps
    from elm.pipeline import Pipeline
    if not config:
        config = ConfigParser(config=yaml.safe_load(CONFIG_STR))
    sample_steps = make_pipeline_steps(config, config.run[0]['pipeline'])
    estimator = [('kmeans', MiniBatchKMeans(**config.train['kmeans']['model_init_kwargs']))]
    pipe = Pipeline(sample_steps + estimator)
    idx_to_param_grid = ea_setup(config)
    return config, pipe, idx_to_param_grid
Example #7
def test_ea_general(fitnesses, score_weights):
    '''This test ensures that ea_general, a general evolutionary
    algorithm, can minimize, maximize, or handle multiple objectives.
    In each fitnesses sequence passed in, the 2nd individual is known
    to be the most fit, given the corresponding score_weights, which
    determine whether each objective is minimized or maximized (a
    sketch of such inputs follows this example).
    '''
    config = yaml.safe_load(CONFIG_STR)
    config['model_scoring']['testing_model_scoring'][
        'score_weights'] = score_weights
    config['param_grids']['example_param_grid']['control']['early_stop'] = {
        'abs_change': [
            100,
        ] * len(score_weights)
    }
    config, evo_params = tst_evo_setup_evo_init_func(
        config=ConfigParser(config=config))
    control = evo_params.deap_params['control']
    param_history = []
    ea_gen = ea_general(evo_params, control['cxpb'], control['mutpb'],
                        control['ngen'], control['k'])

    pop, _, _ = next(ea_gen)
    for ind in pop:
        assert isinstance(ind, list)
    invalid_ind = pop
    assert len(pop) == control['mu']
    original_pop = copy.deepcopy(pop)
    best = fitnesses[1]  # in this synthetic data the 2nd param set is always best
    assign_check_fitness(pop, fitnesses, param_history,
                         evo_params.deap_params['choices'],
                         evo_params.score_weights)
    while invalid_ind:
        (pop, invalid_ind, param_history) = ea_gen.send(fitnesses)
    matches_best = tuple(ind for ind in pop if ind.fitness.values == best)
    assert matches_best
    assert original_pop != pop
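A sketch of fitnesses / score_weights pairs consistent with the docstring's assumption that the 2nd individual is best (the values are illustrative and the number of individuals must match the population size the config produces):

# one objective, minimized: a weight of -1 means a lower raw score is better
score_weights = [-1]
fitnesses = [(5.0,), (0.5,), (3.0,), (4.0,)]   # 2nd tuple is the smallest

# two objectives: maximize the first, minimize the second
score_weights = [1, -1]
fitnesses = [(0.1, 9.0), (9.0, 0.1), (1.0, 5.0), (2.0, 4.0)]  # 2nd tuple wins on both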