def test_poly():
    """Run the flatten -> polynomial-features -> variance-select -> KMeans
    pipeline at increasing step depths and check each stage's output contract.
    """
    s = flat_poly_var_kmeans
    # Step 1 only: flattening must yield an object exposing a 'flat' attribute.
    p = Pipeline(s[:1])
    flat, y, sample_weight = p.fit_transform(**data_source)
    assert hasattr(flat, 'flat')
    # Steps 1-2: polynomial feature expansion must add columns.
    p = Pipeline(s[:2])
    more_cols, _, _ = p.fit_transform(**data_source)
    assert more_cols.flat.shape[1] > flat.flat.shape[1]
    # Steps 1-3: feature selection still returns the (X, y, sample_weight) tuple.
    p = Pipeline(s[:3])
    feat_sel = p.fit_transform(**data_source)
    assert isinstance(feat_sel, tuple)
    p = Pipeline(s)  # thru KMeans
    # fit should always return a Pipeline instance (self after fitting)
    fitted = p.fit(**data_source)
    assert isinstance(fitted, Pipeline)
    assert isinstance(fitted.steps[-1][-1], KMeans)
    # The fitted estimator's cluster count must match the configured
    # 'kmeans__n_clusters' pipeline parameter.
    assert fitted._estimator.cluster_centers_.shape[0] == fitted.get_params(
        )['kmeans__n_clusters']
    # predict should return KMeans's predict output
    pred = p.predict(**data_source)
    # fit_transform here should return the transform of the KMeans,
    # the distances in each dimension to the cluster centers.
    out = p.fit_transform(**data_source)
    assert isinstance(out, tuple) and len(out) == 3
    X, _, _ = out
    # One transformed row per predicted label.
    assert X.shape[0] == pred.size
def test_pipeline_feature_selection():
    """Append a VarianceThreshold feature-selection step to every train/predict
    action's pipeline and check that it reduces the column count below 40.
    """
    tag = selection_name = 'variance_selection'
    config = copy.deepcopy(BASE)
    with tmp_dirs_context(tag) as (train_path, predict_path, cwd):
        for idx, action in enumerate(config['run']):
            # Only actions that train or predict carry a pipeline to extend.
            if 'train' in action or 'predict' in action:
                train_name = action.get('train', action.get('predict'))
                if 'pipeline' in action:
                    # A string pipeline is a named reference; resolve it to the
                    # concrete step list before appending.
                    if not isinstance(action['pipeline'], (list, tuple)):
                        action['pipeline'] = config['pipelines'][
                            action['pipeline']]
                    action['pipeline'] += [{
                        'feature_selection': selection_name
                    }]
                else:
                    action['pipeline'] = [{
                        'feature_selection': selection_name
                    }]
                # NOTE(review): config2 is built from BASE, not from the
                # mutated deepcopy `config` above — verify this is intended;
                # the pipeline list used below does come from `config`.
                config2 = ConfigParser(config=BASE)
                config2.feature_selection[selection_name] = {
                    'method': 'VarianceThreshold',
                    'score_func': None,
                    'threshold': 0.08,
                }
                X = sampler()
                steps = pipeline.make_pipeline_steps(config2, action['pipeline'])
                pipe = Pipeline(steps)
                transform_models = None
                # Refitting repeatedly must keep the selected column count
                # under 40 (i.e. selection is deterministic/stable).
                for repeats in range(5):
                    XX, _, _ = pipe.fit_transform(X)
                    assert XX.flat.shape[1] < 40
def tst_one_pipeline(pipeline, add_na_per_band=0, na_fields_as_str=True, delim='_'):
    """Helper: run `pipeline` on a random ElmStore, optionally seeding each
    band with sentinel missing/invalid values and matching attrs metadata.

    Parameters
    ----------
    pipeline : pipeline spec passed to make_pipeline_steps / make_config
    add_na_per_band : int - number of cells per band to overwrite with
        sentinel values (half "missing", half "invalid range")
    na_fields_as_str : bool - if True, stringify the missing/invalid/valid
        attrs the way file metadata often arrives
    delim : str - delimiter used in the attr key names, e.g. 'missing_value'

    Returns
    -------
    (original sample, transformed X from the fitted pipeline)
    """
    from elm.sample_util.sample_pipeline import make_pipeline_steps
    sample = random_elm_store()
    if add_na_per_band:
        for idx, band in enumerate(sample.data_vars):
            band_arr = getattr(sample, band)
            val = band_arr.values
            # Pick random flat positions, then derive 2-D coordinates.
            # NOTE(review): x/y are both derived from val.shape[0]; assumes
            # val is square (or at least that these stay in bounds) — confirm.
            inds = np.arange(val.size)
            np.random.shuffle(inds)
            x = inds // val.shape[0]
            y = inds % val.shape[0]
            # First half of the chosen cells: "missing" sentinel (99 * idx),
            # recorded in the band's attrs so the pipeline can mask them.
            slc = slice(None, add_na_per_band // 2)
            val[y[slc],x[slc]] = 99 * idx
            band_arr.attrs['missing{}value'.format(delim)] = 99 * idx
            # Second half: value inside a declared invalid range.
            slc = slice(add_na_per_band // 2, add_na_per_band)
            val[y[slc], x[slc]] = 199 * idx
            band_arr.attrs['invalid{}range'.format(delim)] = [198 * idx, 200 * idx]
            band_arr.attrs['valid{}range'.format(delim)] = [-1e12, 1e12]
            if na_fields_as_str:
                # Simulate metadata read from file, where numeric fields
                # commonly arrive as strings / comma-joined lists.
                for field in ('missing{}value', 'invalid{}range', 'valid{}range'):
                    field = field.format(delim)
                    v = band_arr.attrs[field]
                    if isinstance(v, list):
                        band_arr.attrs[field] = ', '.join(map(str,v))
                    else:
                        band_arr.attrs[field] = str(v)
            # Sanity check: sentinels were injected, not NaNs.
            assert val[np.isnan(val)].size == 0
    config = ConfigParser(config=make_config(pipeline, data_source))
    pipe = Pipeline(make_pipeline_steps(config, pipeline))
    new_es = pipe.fit_transform(sample)
    # fit_transform returns (X, y, sample_weight); return only X.
    return sample, new_es[0]
def test_simple():
    """A one-step Flatten pipeline must return the (X, y, sample_weight)
    triple, with X an ElmStore exposing 'flat' and no labels/weights."""
    pipe = Pipeline([('a', steps.Flatten())])
    flattened, labels, weights = pipe.fit_transform(**data_source)
    assert isinstance(flattened, ElmStore)
    assert hasattr(flattened, 'flat')
    assert labels is None
    assert weights is None
def test_feature_selection(feat_cls):
    """Placeholder: constructs a Flatten -> ModifySample -> feature-selection
    pipeline but asserts nothing yet (marked xfail)."""
    pytest.xfail('This test doesnt test anything yet')
    selector_cls = getattr(steps, feat_cls)
    kwargs = {}  # come up with some initialization kwargs
    p = Pipeline([
        steps.Flatten(),
        steps.ModifySample(get_y),
        selector_cls(**kwargs),
    ])
    # X, y, sample_weight = p.fit_transform(**data_source)
def test_modify_sample():
    """steps.ModifySample wraps an arbitrary callable as a Pipeline step.

    The wrapped function must have the signature
    func(X, y=None, sample_weight=None, **kwargs) and return the tuple
    (X, y, sample_weight).
    """
    pipe = Pipeline([steps.Flatten(), steps.ModifySample(get_y)])
    sample, labels, _ = pipe.fit_transform(**data_source)
    assert sample is not None
    assert isinstance(labels, np.ndarray)
def test_sklearn_preproc(scale_encode_cls):
    """Placeholder: constructs a Flatten -> scaler/encoder pipeline but
    asserts nothing yet (marked xfail)."""
    pytest.xfail('This test doesnt test anything yet')
    preproc_cls = getattr(steps, scale_encode_cls)
    kwargs = {}  # come up with some initialization kwargs
    p = Pipeline([steps.Flatten(), preproc_cls(**kwargs)])
    # X, y, sample_weight = p.fit_transform(**data_source)