def test_pipeline_feature_selection():
    '''Check that a VarianceThreshold selection step limits the feature count.

    For every "train" / "predict" action in a deep copy of BASE, a
    feature_selection step is appended to the action's pipeline, the
    pipeline steps are built from a ConfigParser that defines that
    selection, and the resulting Pipeline is fit five times on one
    sample.  Each transformed sample must have fewer than 40 columns.
    '''
    tag = selection_name = 'variance_selection'
    config = copy.deepcopy(BASE)
    with tmp_dirs_context(tag) as (train_path, predict_path, cwd):
        for action in config['run']:
            if 'train' not in action and 'predict' not in action:
                continue
            base_steps = action.get('pipeline', [])
            if not isinstance(base_steps, (list, tuple)):
                # A non-sequence value is the name of a shared pipeline
                base_steps = config['pipelines'][base_steps]
            # Build a NEW list instead of using "+=": the named pipeline
            # in config['pipelines'] is shared between actions, and an
            # in-place extend would append the selection step to it
            # again for every action that references it.
            action['pipeline'] = list(base_steps) + [{
                'feature_selection': selection_name
            }]

            config2 = ConfigParser(config=BASE)
            config2.feature_selection[selection_name] = {
                'method': 'VarianceThreshold',
                'score_func': None,
                'threshold': 0.08,
            }
            X = sampler()
            pipe_steps = pipeline.make_pipeline_steps(config2,
                                                      action['pipeline'])
            pipe = Pipeline(pipe_steps)
            for _ in range(5):
                XX, _, _ = pipe.fit_transform(X)
                assert XX.flat.shape[1] < 40
def tst_one_pipeline(pipeline, add_na_per_band=0, na_fields_as_str=True, delim='_'):
    '''Run one pipeline spec on the sample returned by random_elm_store().

    Parameters:
        :pipeline: pipeline spec handed to make_config and
                   make_pipeline_steps
        :add_na_per_band: number of cells per band to overwrite with
                   marker values (first half "missing", second half
                   "invalid"); 0 leaves the sample untouched
        :na_fields_as_str: if True, the missing / invalid / valid attrs
                   are stored as strings (lists joined with ", ")
        :delim: delimiter inside the attr names, e.g. "missing_value"
    Returns:
        :tuple: (sample, first item of the Pipeline's fit_transform output)
    '''
    from elm.sample_util.sample_pipeline import make_pipeline_steps

    sample = random_elm_store()
    if add_na_per_band:
        half = add_na_per_band // 2
        missing_key = 'missing{}value'.format(delim)
        invalid_key = 'invalid{}range'.format(delim)
        valid_key = 'valid{}range'.format(delim)
        for band_idx, band_name in enumerate(sample.data_vars):
            band_arr = getattr(sample, band_name)
            values = band_arr.values
            # Shuffled flat indices give random (row, col) positions
            flat_inds = np.arange(values.size)
            np.random.shuffle(flat_inds)
            n_rows = values.shape[0]
            cols = flat_inds // n_rows
            rows = flat_inds % n_rows

            # First half of the chosen cells: the "missing" marker
            values[rows[:half], cols[:half]] = 99 * band_idx
            band_arr.attrs[missing_key] = 99 * band_idx

            # Second half: a value inside the "invalid" range
            values[rows[half:add_na_per_band],
                   cols[half:add_na_per_band]] = 199 * band_idx
            band_arr.attrs[invalid_key] = [198 * band_idx, 200 * band_idx]
            band_arr.attrs[valid_key] = [-1e12, 1e12]

            if na_fields_as_str:
                for key in (missing_key, invalid_key, valid_key):
                    current = band_arr.attrs[key]
                    if isinstance(current, list):
                        as_text = ', '.join(map(str, current))
                    else:
                        as_text = str(current)
                    band_arr.attrs[key] = as_text
            # Writing the markers must not have introduced NaN
            assert values[np.isnan(values)].size == 0

    config = ConfigParser(config=make_config(pipeline, data_source))
    pipe = Pipeline(make_pipeline_steps(config, pipeline))
    transformed = pipe.fit_transform(sample)
    return sample, transformed[0]
def test_kmeans_simple_X(client=None):
    '''Fit a Flatten + MiniBatchKMeans ensemble from X and predict on X.'''
    kmeans_pipe = Pipeline([steps.Flatten(), MiniBatchKMeans(n_clusters=6)])
    trained = kmeans_pipe.fit_ensemble(X=X, **ENSEMBLE_KWARGS)
    _train_asserts(trained, ENSEMBLE_KWARGS['saved_ensemble_size'])
    predictions = trained.predict_many(X=X)
    # One prediction per ensemble member
    assert len(predictions) == len(trained.ensemble)
def test_kmeans_simple_sampler(client=None):
    '''Fit a Flatten + MiniBatchKMeans ensemble from a sampler data source.'''
    kmeans_pipe = Pipeline([steps.Flatten(), MiniBatchKMeans(n_clusters=6)])
    fit_kwargs = SAMPLER_DATA_SOURCE.copy()
    fit_kwargs.update(ENSEMBLE_KWARGS)
    trained = kmeans_pipe.fit_ensemble(**fit_kwargs)
    members = trained.ensemble
    _train_asserts(trained, ENSEMBLE_KWARGS['saved_ensemble_size'])
    predictions = trained.predict_many(**SAMPLER_DATA_SOURCE)
    # One prediction for each (sampler argument, ensemble member) pair
    n_samples = len(SAMPLER_DATA_SOURCE['args_list'])
    assert len(predictions) == n_samples * len(members)
def test_predict():
    '''Pipeline.predict on a fitted Pipeline should return a numpy array.'''
    pipe = Pipeline(flat_poly_var_kmeans)
    # create_sample gives the (X, y, sample_weight) tuple
    xy_weight = pipe.create_sample(**data_source)
    # fit returns the Pipeline itself once fitted
    trained = pipe.fit(*xy_weight)
    prediction = trained.predict(*xy_weight)
    assert isinstance(prediction, np.ndarray)
def test_simple():
    '''A Flatten-only Pipeline returns (X, y, sample_weight) from fit_transform.'''
    flatten_only = Pipeline([('a', steps.Flatten())])
    # fit_transform always gives back the 3-tuple (X, y, sample_weight)
    result = flatten_only.fit_transform(**data_source)
    X, y, sample_weight = result
    assert isinstance(X, ElmStore)
    assert hasattr(X, 'flat')
    # Nothing in this pipeline creates y or sample_weight
    assert y is None
    assert sample_weight is None
def test_feature_selection(feat_cls):
    '''Placeholder: build a Pipeline ending in the steps class named feat_cls.

    The test is marked as an expected failure up front; the code after
    the xfail call only sketches the intended test.
    '''
    pytest.xfail('This test doesnt test anything yet')
    selector_cls = getattr(steps, feat_cls)
    # TODO come up with some initialization kwargs
    init_kwargs = {}
    selector = selector_cls(**init_kwargs)
    p = Pipeline([
        steps.Flatten(),
        steps.ModifySample(get_y),
        selector,
    ])
    # X, y, sample_weight = p.fit_transform(**data_source)
def test_supervised_feat_select_X_y(client=None):
    '''Ensemble fit with supervised feature selection before SGDClassifier.

    The X / y data come from X_Y_DATA_SOURCE; SelectPercentile uses
    f_classif to keep 50 percent of the features.'''
    selector = steps.SelectPercentile(score_func=f_classif, percentile=50)
    pipe = Pipeline([steps.Flatten(), selector, SGDClassifier()])
    fit_kwargs = dict(method_kwargs=dict(classes=[0, 1, 2]),
                      **ENSEMBLE_KWARGS)
    fit_kwargs.update(X_Y_DATA_SOURCE)
    trained = pipe.fit_ensemble(**fit_kwargs)
    _train_asserts(trained, fit_kwargs['saved_ensemble_size'])
    predictions = trained.predict_many(**X_Y_DATA_SOURCE)
    # One prediction per ensemble member
    assert len(predictions) == len(trained.ensemble)
def test_simple():
    '''Fit a two-member KMeans ensemble and predict with each member.'''
    kmeans_pipe = Pipeline([
        steps.Flatten(),
        MiniBatchKMeans(n_clusters=5),
    ])
    # Ten identical sampler calls: (height, width, bands)
    sampler_args = [(100, 200, 5)] * 10
    data_source = dict(sampler=example_sampler, args_list=sampler_args)
    fit_kwargs = dict(ngen=2, init_ensemble_size=2)
    fit_kwargs.update(data_source)
    trained = kmeans_pipe.fit_ensemble(**fit_kwargs)
    # The ensemble holds (tag, model) pairs - exactly two of them here
    (_, first_model), (_, second_model) = trained.ensemble
    X = example_sampler(100, 400, 5)
    first_pred = first_model.predict(X)
    second_pred = second_model.predict(X)
    # One label per pixel of the 100 x 400 sample
    assert first_pred.shape == second_pred.shape == (400 * 100,)
def test_modify_sample():
    '''steps.ModifySample wraps an arbitrary function as a Pipeline step.

    The wrapped function is expected to have the signature:

        func(X, y=None, sample_weight=None, **kwargs)

    and to return the tuple:

        (X, y, sample_weight)
    '''
    with_y = Pipeline([steps.Flatten(), steps.ModifySample(get_y)])
    result = with_y.fit_transform(**data_source)
    X, y, sample_weight = result
    assert X is not None
    # get_y is the step that supplies y
    assert isinstance(y, np.ndarray)
def _setup(config=None):
    '''Build the config, the Pipeline and the param grids used by the tests.

    Parameters:
        :config: optional ConfigParser instance; when falsy, one is
                 created from the module-level CONFIG_STR YAML text
    Returns:
        :tuple: (config, pipe, idx_to_param_grid) where pipe is the
                config's first "run" pipeline followed by a "kmeans"
                MiniBatchKMeans step and idx_to_param_grid is the
                output of ea_setup(config)
    '''
    from elm.sample_util.sample_pipeline import make_pipeline_steps
    from elm.pipeline import Pipeline
    if not config:
        # yaml.load without an explicit Loader is deprecated since
        # PyYAML 5.1 and raises TypeError in PyYAML 6; safe_load parses
        # plain YAML config text on every PyYAML version.
        config = ConfigParser(config=yaml.safe_load(CONFIG_STR))
    sample_steps = make_pipeline_steps(config, config.run[0]['pipeline'])
    kmeans_kwargs = config.train['kmeans']['model_init_kwargs']
    estimator = [('kmeans', MiniBatchKMeans(**kmeans_kwargs))]
    pipe = Pipeline(sample_steps + estimator)
    idx_to_param_grid = ea_setup(config)
    return config, pipe, idx_to_param_grid
def test_kmeans_model_selection(client=None):
    '''Evolve a PCA + KMeans ensemble with kmeans_model_averaging selection.

    The initial ensemble is 100 copies of the Pipeline with randomly
    chosen n_clusters / n_components; after fitting, the saved ensemble
    size and the number of predictions are checked.
    '''
    pipe = Pipeline(
        [
            steps.Flatten(),
            ('pca', steps.Transform(IncrementalPCA())),
            ('kmeans', MiniBatchKMeans(n_clusters=5)),
        ],
        scoring=kmeans_aic,
        scoring_kwargs={'score_weights': [-1]},
    )
    n_clusters_choices = list(range(3, 10))

    def samp(*args, **kwargs):
        # Sampler that ignores its arguments
        return random_elm_store(bands=12, mn=0, mx=1, height=20, width=40)

    def init(pipe, **kwargs):
        # Initial ensemble: deep copies of pipe with random parameters
        members = []
        for _ in range(100):
            n_components = np.random.choice(np.arange(2, 6))
            n_clusters = np.random.choice(n_clusters_choices)
            member = copy.deepcopy(pipe)
            member.set_params(kmeans__n_clusters=n_clusters,
                              pca__n_components=n_components)
            members.append(member)
        return members

    en = ENSEMBLE_KWARGS.copy()
    en['ngen'] = 20
    en['model_scoring'] = kmeans_aic
    en['ensemble_init_func'] = init
    en['model_selection_kwargs'] = dict(drop_n=30,
                                        evolve_n=30,
                                        choices=n_clusters_choices)
    en['model_selection'] = kmeans_model_averaging

    sampler_source = SAMPLER_DATA_SOURCE.copy()
    sampler_source['sampler'] = samp
    en.update(sampler_source)

    trained = pipe.fit_ensemble(**en)
    assert len(trained.ensemble) == en['saved_ensemble_size']
    predictions = trained.predict_many(**sampler_source)
    # One prediction for each (ensemble member, sampler argument) pair
    n_samples = len(SAMPLER_DATA_SOURCE['args_list'])
    assert len(predictions) == len(trained.ensemble) * n_samples
def test_set_params_get_params():
    '''Double underscore parameter names round-trip through
    set_params / get_params; a single underscore name is rejected.'''
    pipe = Pipeline(flat_poly_var_kmeans)
    new_values = dict(kmeans__n_clusters=9,
                      poly__interaction_only=False,
                      var__threshold=1e-8)
    pipe.set_params(**new_values)
    current = pipe.get_params()
    for name, expected in new_values.items():
        assert name in current and current[name] == expected
    with pytest.raises(ValueError):
        # Single underscore: not a valid step__param name
        pipe.set_params(kmeans_n_clusters=9)
def load_pipe_from_tag(elm_train_path, tag):
    '''Load the saved Pipeline stored under a tag in elm_train_path

    Parameters:
        :elm_train_path: root dir for serializing training outputs
        :tag: tag that was given to elm.pipeline.serialize.serialize_pipe

    Returns:
        :elm.pipeline.Pipeline: result of Pipeline.load on the tag's path

    Raises:
        :IOError: if the path for the tag does not exist
    '''
    from elm.pipeline import Pipeline
    logger.debug('Load {} from {}'.format(tag, elm_train_path))
    path = _get_path_for_tag(elm_train_path, tag)
    if os.path.exists(path):
        return Pipeline.load(path)
    raise IOError('Cannot load from {} (does not exist)'.format(path))
def test_pipeline_new_with_params():
    '''new_with_params gives an unfitted Pipeline carrying the new params.'''
    original = Pipeline([
        steps.SelectCanvas('band_1'),
        steps.Flatten(),
        ('pca', steps.Transform(IncrementalPCA(n_components=3))),
        ('kmeans', KMeans(n_clusters=4)),
    ])
    original.fit(random_elm_store())
    original.predict(random_elm_store())
    fitted_kmeans = original.steps[-1][-1]
    assert fitted_kmeans.cluster_centers_.shape[0] == 4

    changed = original.new_with_params(kmeans__n_clusters=7,
                                       pca__n_components=2)
    # The new Pipeline must not be usable before its own fit
    with pytest.raises(NotFittedError):
        changed.predict(random_elm_store())
    changed.fit(random_elm_store())
    refit_kmeans = changed.steps[-1][-1]
    assert refit_kmeans.cluster_centers_.shape[0] == 7
def new_pipeline(*args, flatten_first=True):
    '''Build a Pipeline from wrapped estimator classes plus matching X, y.

    Parameters:
        :args: estimator wrappers; each has a "_cls" attribute and is
               called with the params from get_params_for_est
        :flatten_first: if True, prepend a Generic step that calls
               X.to_features() when X has that method
    Returns:
        :tuple: (pipe, X, y) - X and y come from get_params_for_est for
                the first estimator; y may be synthesized or binarized
                as described in the comments below
    '''
    trans = []
    for idx, model in enumerate(args):
        cls_name = model._cls.__name__
        parts = cls_name.split('.')
        if any(part in SKIP for part in parts):
            pytest.skip('{} - not implemented'.format(model._cls.__name__))
        out = get_params_for_est(model, parts[-1])
        # Only the first estimator decides the X, y data
        if idx == 0:
            X, y, params, data_kw = out
        else:
            params, data_kw = out[2], out[3]
        if 'score_func' in params and y is None:
            # Estimators taking "score_func" need y, even where the
            # other estimators in the Pipeline do not, so make y as a
            # random linear combination of the feature columns
            val = X.to_features().features.values
            y = val.dot(np.random.uniform(0, 1, val.shape[1]))
        step_name = 'step_{}'.format(idx + 1)
        trans.append((step_name, model(**params)))
        if data_kw['is_classifier']:
            # Classifiers get a binary y: above / not above the mean
            y = (y > y.mean()).astype(np.int32)

    if flatten_first:
        # First step converts an MLDataset with >=1 DataArrays into
        # one with a single "features" DataArray - see "to_features"
        # in xarray_filters
        def to_feat(X, y=None):
            return X.to_features() if hasattr(X, 'to_features') else X

        trans = [('step_0', Generic(func=to_feat))] + trans
    return Pipeline(trans), X, y
def test_sklearn_preproc(scale_encode_cls):
    '''Placeholder: build a Pipeline with the steps class named scale_encode_cls.

    The test is marked as an expected failure up front; the code after
    the xfail call only sketches the intended test.
    '''
    pytest.xfail('This test doesnt test anything yet')
    preproc_cls = getattr(steps, scale_encode_cls)
    # TODO come up with some initialization kwargs
    init_kwargs = {}
    preproc = preproc_cls(**init_kwargs)
    p = Pipeline([steps.Flatten(), preproc])
    # X, y, sample_weight = p.fit_transform(**data_source)
def ensemble_layer_2(pipe, **kw):
    '''Return the second-layer ensemble (the model on models).

    It is a single RidgeCV Pipeline: RidgeCV is a good choice here
    since colinearity is expected among the first-layer predictions
    that form the input matrix of the second layer.  Both arguments
    are ignored; the Pipeline is configured from the module-level
    pipeline_kw.'''
    second_layer = Pipeline([RidgeCV()], **pipeline_kw)
    return [second_layer]
def ensemble_init_func(pipe, **kw):
    '''Create the initial ensemble: one Pipeline for every combination
    of scaler, number of PCA components and estimator.

    Parameters:
        pipe:         Ignored
        **kw:         Keyword arguments (all required):
            scalers:      List of (label, scaler factory) tuples.  The
                          factory is called to make the scaler step; a
                          falsy factory means no scaling.  A label
                          containing "MinMax" gets one scaler per entry
                          of minmax_bounds.
            minmax_bounds: Argument tuples, each unpacked into the
                          "MinMax" scaler factory
            n_components: List of PCA # of components to try.  May
                          include None to skip the PCA step
            pca:          Callable returning the PCA step; the step is
                          only added when both a scaler and a truthy
                          n_components are in use
            estimators:   List of (label, estimator factory) tuples,
                          each factory called without arguments
            preamble:     Callable returning the list of leading steps
            log:          Log transform step appended after a "MinMax"
                          scaler
            summary:      String summary of preamble steps to prepend
                          to the parameter summary
    Returns:
        ensemble: List of Pipeline instances, each with a "summary"
                  attribute (also printed)
    '''
    scalers = kw['scalers']
    n_components = kw['n_components']
    pca = kw['pca']
    estimators = kw['estimators']
    preamble = kw['preamble']
    summary_template = kw['summary']
    minmax_bounds = kw['minmax_bounds']
    log = kw['log']

    ensemble = []
    for base_label, scaler_factory in scalers:
        if 'MinMax' in base_label:
            # One MinMaxScaler object per set of bounds
            bound_labels = [base_label + repr(mb) for mb in minmax_bounds]
            bound_scalers = [scaler_factory(*mb) for mb in minmax_bounds]
            labeled_scalers = zip(bound_labels, bound_scalers)
        elif scaler_factory:
            # Any other scaler is made with its defaults
            labeled_scalers = [(base_label, scaler_factory())]
        else:
            # No scaling
            labeled_scalers = [(base_label, None)]

        for s_label, scale in labeled_scalers:
            for n_c in n_components:
                for e_label, estimator in estimators:
                    scale_step = []
                    if scale:
                        scale_step = [scale]
                    if 'MinMax' in s_label:
                        # Log transform only works with MinMaxScaler
                        # and positive min bound
                        scale_step += [log]
                    if n_c and scale:
                        pca_step = [pca()]
                    else:
                        pca_step = []
                    all_steps = (preamble() + scale_step + pca_step +
                                 [estimator()])
                    new = Pipeline(all_steps, **pipeline_kw)
                    if pca_step:
                        new.set_params(pca__n_components=n_c)
                        msg = '{} components'.format(n_c)
                    else:
                        msg = ' (None)'
                    summary = ': Scaler: {} PCA: {} Estimator: {}'.format(
                        s_label, msg, e_label)
                    new.summary = summary_template + summary
                    print(new.summary)
                    ensemble.append(new)
    return ensemble
def test_poly():
    '''Walk through growing prefixes of flat_poly_var_kmeans, checking
    the output of fit_transform / fit / predict at each stage.'''
    all_steps = flat_poly_var_kmeans

    # First step only: the output has a "flat" attribute
    flat, y, sample_weight = Pipeline(all_steps[:1]).fit_transform(
        **data_source)
    assert hasattr(flat, 'flat')

    # First two steps: more columns than the first step alone
    more_cols, _, _ = Pipeline(all_steps[:2]).fit_transform(**data_source)
    assert more_cols.flat.shape[1] > flat.flat.shape[1]

    # First three steps: still a tuple from fit_transform
    feat_sel = Pipeline(all_steps[:3]).fit_transform(**data_source)
    assert isinstance(feat_sel, tuple)

    # Every step, thru KMeans
    full = Pipeline(all_steps)
    # fit always returns a Pipeline instance (self after fitting)
    fitted = full.fit(**data_source)
    assert isinstance(fitted, Pipeline)
    assert isinstance(fitted.steps[-1][-1], KMeans)
    n_clusters = fitted.get_params()['kmeans__n_clusters']
    assert fitted._estimator.cluster_centers_.shape[0] == n_clusters

    # predict returns KMeans's predict output
    pred = full.predict(**data_source)
    # fit_transform here returns the transform of the KMeans,
    # the distances in each dimension to the cluster centers.
    out = full.fit_transform(**data_source)
    assert isinstance(out, tuple) and len(out) == 3
    X, _, _ = out
    assert X.shape[0] == pred.size
ELM_EXAMPLE_DATA_PATH = os.environ['ELM_EXAMPLE_DATA_PATH'] def make_example_y_data(X, y=None, sample_weight=None, **kwargs): fitted = MiniBatchKMeans(n_clusters=5).fit(X.flat.values) y = fitted.predict(X.flat.values) return (X, y, sample_weight) pipeline_steps = [ steps.Flatten(), steps.ModifySample(make_example_y_data), ('top_n', steps.SelectPercentile(percentile=80, score_func=f_classif)), ('kmeans', MiniBatchKMeans(n_clusters=4)) ] pipeline = Pipeline(pipeline_steps, scoring=kmeans_aic) param_grid = { 'kmeans__n_clusters': list(range(5, 10)), 'control': { 'select_method': 'selNSGA2', 'crossover_method': 'cxTwoPoint', 'mutate_method': 'mutUniformInt', 'init_pop': 'random', 'indpb': 0.5, 'mutpb': 0.9, 'cxpb': 0.3, 'eta': 20, 'ngen': 2, 'mu': 4, 'k': 4, 'early_stop': {
if meta_is_day(load_hdf4_meta(f))] def sampler(fname, **kw): return (load_array(fname, band_specs=band_specs), None, None) data_source = { 'sampler': sampler, 'args_list': HDF4_FILES, } pipeline_steps = [steps.Flatten(), ('scaler', steps.StandardScaler()), ('pca', steps.Transform(IncrementalPCA(n_components=4), partial_fit_batches=2)), ('kmeans', MiniBatchKMeans(n_clusters=4, compute_labels=True)),] pipeline = Pipeline(pipeline_steps, scoring=kmeans_aic, scoring_kwargs=dict(score_weights=[-1])) def ensemble_init_func(pipe, **kw): return [pipe.new_with_params(kmeans__n_clusters=np.random.choice(range(6, 10))) for _ in range(4)] ensemble_kwargs = { 'model_selection': kmeans_model_averaging, 'model_selection_kwargs': { 'drop_n': 2, 'evolve_n': 2, }, 'ensemble_init_func': ensemble_init_func, 'ngen': 3, 'partial_fit_batches': 2,