def test_pipeline_feature_selection():
    tag = selection_name = 'variance_selection'
    config = copy.deepcopy(BASE)
    with tmp_dirs_context(tag) as (train_path, predict_path, cwd):
        for idx, action in enumerate(config['run']):
            if 'train' in action or 'predict' in action:
                train_name = action.get('train', action.get('predict'))
                # Append a variance-threshold feature selection step to the
                # action's pipeline (resolving a named pipeline to its step list first)
                if 'pipeline' in action:
                    if not isinstance(action['pipeline'], (list, tuple)):
                        action['pipeline'] = config['pipelines'][action['pipeline']]
                    action['pipeline'] += [{'feature_selection': selection_name}]
                else:
                    action['pipeline'] = [{'feature_selection': selection_name}]
                config2 = ConfigParser(config=BASE)
                config2.feature_selection[selection_name] = {
                    'method': 'VarianceThreshold',
                    'score_func': None,
                    'threshold': 0.08,
                }
                X = sampler()
                steps = pipeline.make_pipeline_steps(config2, action['pipeline'])
                pipe = Pipeline(steps)
                transform_models = None
                for repeats in range(5):
                    XX, _, _ = pipe.fit_transform(X)
                    # the variance threshold should drop at least some columns
                    assert XX.flat.shape[1] < 40
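# Illustrative sketch (not part of the original suite): the 'VarianceThreshold'
# method named in the feature_selection config above presumably resolves to
# scikit-learn's sklearn.feature_selection.VarianceThreshold, which drops
# columns whose variance falls below the threshold.  The synthetic array below
# is an assumption chosen so that ten near-constant columns fall under 0.08.
def _variance_threshold_sketch():
    import numpy as np
    from sklearn.feature_selection import VarianceThreshold
    X = np.hstack([np.random.uniform(0., 2., (100, 30)),     # high-variance columns
                   np.random.uniform(0., 0.01, (100, 10))])  # near-constant columns
    XX = VarianceThreshold(threshold=0.08).fit_transform(X)
    assert XX.shape[1] < 40  # the near-constant columns are removed
    return XX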
def run_one_config(args=None, sys_argv=None, return_0_if_ok=True,
                   config_dict=None, client=None, started=None):
    '''Parse an elm config and run its actions, creating a dask client
    context when one is not supplied.'''
    started = started or datetime.datetime.now()
    args = args or cli(args=None, sys_argv=sys_argv)
    config_dict = config_dict or args.config
    with try_finally_log_etime(started) as _:
        config = ConfigParser(config_dict, cmd_args=args)
        if client is None:
            if args.echo_config:
                logger.info(str(config))
            dask_client = getattr(config, 'DASK_CLIENT', 'SERIAL')
            dask_scheduler = getattr(config, 'DASK_SCHEDULER', None)
            with warnings.catch_warnings():
                # scikit-learn has a number of deprecation warnings for kmeans
                warnings.simplefilter("ignore")
                with client_context(dask_client, dask_scheduler) as client:
                    return_values = parse_run_config(config, client)
        else:
            return_values = parse_run_config(config, client)
    if return_0_if_ok:
        return 0
    return return_values
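# Usage sketch (hedged): run_one_config can also be driven from Python rather
# than the command line by passing a pre-built config mapping.  The dict and
# argv below are placeholders, not a complete elm config or CLI invocation:
#
#   exit_code = run_one_config(config_dict=my_config_dict, sys_argv=[])
#   assert exit_code == 0   # return_0_if_ok defaults to True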
def tst_one_pipeline(pipeline,
                     add_na_per_band=0,
                     na_fields_as_str=True,
                     delim='_'):
    from elm.sample_util.sample_pipeline import make_pipeline_steps
    sample = random_elm_store()
    if add_na_per_band:
        # Inject sentinel "missing" / "invalid range" values into each band
        # and record the matching attrs so the pipeline can mask them later
        for idx, band in enumerate(sample.data_vars):
            band_arr = getattr(sample, band)
            val = band_arr.values
            inds = np.arange(val.size)
            np.random.shuffle(inds)
            x = inds // val.shape[0]
            y = inds % val.shape[0]
            slc = slice(None, add_na_per_band // 2)
            val[y[slc], x[slc]] = 99 * idx
            band_arr.attrs['missing{}value'.format(delim)] = 99 * idx
            slc = slice(add_na_per_band // 2, add_na_per_band)
            val[y[slc], x[slc]] = 199 * idx
            band_arr.attrs['invalid{}range'.format(delim)] = [198 * idx, 200 * idx]
            band_arr.attrs['valid{}range'.format(delim)] = [-1e12, 1e12]
            if na_fields_as_str:
                for field in ('missing{}value', 'invalid{}range', 'valid{}range'):
                    field = field.format(delim)
                    v = band_arr.attrs[field]
                    if isinstance(v, list):
                        band_arr.attrs[field] = ', '.join(map(str, v))
                    else:
                        band_arr.attrs[field] = str(v)
            assert val[np.isnan(val)].size == 0
    config = ConfigParser(config=make_config(pipeline, data_source))
    pipe = Pipeline(make_pipeline_steps(config, pipeline))
    new_es = pipe.fit_transform(sample)
    return sample, new_es[0]
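# Usage sketch (the pipeline spec is illustrative and assumes a matching entry
# in the config built by make_config): callers compare the raw sample to the
# transformed one, e.g.
#
#   sample, transformed = tst_one_pipeline([{'feature_selection': 'variance_selection'}],
#                                          add_na_per_band=10)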
def tst_bad_config(bad_config):
    tmp = None
    try:
        tmp, config_file = dump_config(bad_config)
        with pytest.raises(ElmConfigError):
            ConfigParser(config_file_name=config_file)
    finally:
        if tmp and os.path.exists(tmp):
            shutil.rmtree(tmp)
    # return the ok version for next test
    ok_config = copy.deepcopy(DEFAULTS)
    tmp, config_file = dump_config(ok_config)
    try:
        ConfigParser(config_file)  # confirm it is okay for the next test
    finally:
        shutil.rmtree(tmp)
    return ok_config
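# Minimal usage sketch (the removed key is illustrative, not a statement about
# which sections DEFAULTS actually requires):
#
#   bad = copy.deepcopy(DEFAULTS)
#   bad.pop('pipelines', None)        # drop a section to make the config invalid
#   ok_config = tst_bad_config(bad)   # expects ElmConfigError, returns a clean copy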
def set_key_tst_bad_config_once(key, bad):
    config2 = yaml.load(CONFIG_STR)
    d = config2
    for k in key[:-1]:
        d = d[k]
    d[key[-1]] = bad
    with pytest.raises(ElmConfigError):
        ConfigParser(config=config2)
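# Usage sketch (the key path and bad value are illustrative): replace a nested
# config entry with a value of the wrong type and expect ElmConfigError, e.g.
#
#   set_key_tst_bad_config_once(('train', 'kmeans'), 9999)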
def _setup(config=None):
    '''Return the config above, a Pipeline built from it, and the
    param_grid mapping from ea_setup'''
    from elm.sample_util.sample_pipeline import make_pipeline_steps
    from elm.pipeline import Pipeline
    if not config:
        config = ConfigParser(config=yaml.load(CONFIG_STR))
    sample_steps = make_pipeline_steps(config, config.run[0]['pipeline'])
    estimator = [('kmeans',
                  MiniBatchKMeans(**config.train['kmeans']['model_init_kwargs']))]
    pipe = Pipeline(sample_steps + estimator)
    idx_to_param_grid = ea_setup(config)
    return config, pipe, idx_to_param_grid
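# Usage sketch (hedged): a caller unpacks this helper as
#
#   config, pipe, idx_to_param_grid = _setup()
#
# where idx_to_param_grid is the mapping returned by ea_setup(config).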
def test_ea_general(fitnesses, score_weights):
    '''This test ensures that ea_general, a general EA, can minimize,
    maximize, or handle multiple objectives.  In each of the fitnesses
    sequences passed in, the 2nd individual is known to be the most fit
    (given the corresponding score_weights, which determine whether the
    objective(s) are minimized or maximized).
    '''
    config = yaml.load(CONFIG_STR)
    config['model_scoring']['testing_model_scoring']['score_weights'] = score_weights
    config['param_grids']['example_param_grid']['control']['early_stop'] = {
        'abs_change': [100] * len(score_weights)
    }
    config, evo_params = tst_evo_setup_evo_init_func(config=ConfigParser(config=config))
    control = evo_params.deap_params['control']
    param_history = []
    ea_gen = ea_general(evo_params,
                        control['cxpb'],
                        control['mutpb'],
                        control['ngen'],
                        control['k'])
    pop, _, _ = next(ea_gen)
    for ind in pop:
        assert isinstance(ind, list)
    invalid_ind = pop
    assert len(pop) == control['mu']
    original_pop = copy.deepcopy(pop)
    best = fitnesses[1]  # in this synthetic data, the 2nd param set is always best
    assign_check_fitness(pop, fitnesses, param_history,
                         evo_params.deap_params['choices'],
                         evo_params.score_weights)
    while invalid_ind:
        (pop, invalid_ind, param_history) = ea_gen.send(fitnesses)
    matches_best = tuple(ind for ind in pop if ind.fitness.values == best)
    assert matches_best
    assert original_pop != pop
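# Illustrative sketch (not part of the original suite): the score_weights used
# above determine whether each objective is minimized or maximized.  Assuming
# they follow the DEAP fitness-weight convention (elm's EA helpers expose
# deap_params), a negative weight minimizes and a positive weight maximizes:
def _score_weights_sketch():
    from deap import base, creator
    # Two objectives: minimize the first value, maximize the second.
    creator.create("FitnessSketch", base.Fitness, weights=(-1.0, 1.0))
    fit = creator.FitnessSketch()
    fit.values = (0.5, 2.0)
    return fit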