Esempio n. 1
0
def test_poly():
    """Grow a pipeline one step at a time from ``flat_poly_var_kmeans``.

    Prefixes of the step list (length 1, 2 and 3) are exercised through
    ``fit_transform``; the complete list, whose last step is a KMeans
    estimator, is exercised through ``fit``, ``predict`` and
    ``fit_transform``.
    """
    all_steps = flat_poly_var_kmeans

    # First step only: the transformed sample must expose ``flat``.
    first_only = Pipeline(all_steps[:1])
    flat, y, sample_weight = first_only.fit_transform(**data_source)
    assert hasattr(flat, 'flat')

    # First two steps: the flat array must gain columns.
    first_two = Pipeline(all_steps[:2])
    more_cols, _, _ = first_two.fit_transform(**data_source)
    n_cols_after = more_cols.flat.shape[1]
    n_cols_before = flat.flat.shape[1]
    assert n_cols_after > n_cols_before

    # First three steps: only the container type of the output is checked.
    first_three = Pipeline(all_steps[:3])
    feat_sel = first_three.fit_transform(**data_source)
    assert isinstance(feat_sel, tuple)

    # Every step, through KMeans.
    full = Pipeline(all_steps)

    # fit should always return a Pipeline instance (self after fitting).
    fitted = full.fit(**data_source)
    assert isinstance(fitted, Pipeline)
    final_estimator = fitted.steps[-1][-1]
    assert isinstance(final_estimator, KMeans)
    n_centers = fitted._estimator.cluster_centers_.shape[0]
    n_clusters = fitted.get_params()['kmeans__n_clusters']
    assert n_centers == n_clusters

    # predict should return KMeans's predict output.
    pred = full.predict(**data_source)

    # fit_transform here should return the transform of the KMeans,
    # the distances in each dimension to the cluster centers.
    out = full.fit_transform(**data_source)
    assert isinstance(out, tuple)
    assert len(out) == 3
    X = out[0]
    assert X.shape[0] == pred.size
Esempio n. 2
0
def test_pipeline_feature_selection():
    """Append a VarianceThreshold selection step to each pipeline and run it.

    For every action in ``config['run']`` that has a ``'train'`` or
    ``'predict'`` key, the action's pipeline (inline, or looked up by name in
    ``config['pipelines']``) gets a trailing ``feature_selection`` step. The
    resulting steps are built with ``pipeline.make_pipeline_steps`` and
    ``fit_transform`` is called five times on a fresh sample; each time the
    flattened output must have fewer than 40 columns.
    """
    tag = selection_name = 'variance_selection'
    config = copy.deepcopy(BASE)
    with tmp_dirs_context(tag) as (train_path, predict_path, cwd):
        for action in config['run']:
            if 'train' not in action and 'predict' not in action:
                continue

            base_steps = action.get('pipeline', [])
            if not isinstance(base_steps, (list, tuple)):
                # A non-sequence value is treated as the name of a pipeline
                # declared under config['pipelines'].
                base_steps = config['pipelines'][base_steps]
            # Build a new list instead of using ``+=``: in-place extension
            # raises TypeError for tuples and would also mutate the shared
            # config['pipelines'] entry, so a second action referencing the
            # same named pipeline would get the selection step twice.
            action['pipeline'] = list(base_steps) + [{
                'feature_selection': selection_name
            }]

            # NOTE(review): the parser is built from BASE rather than from
            # the modified copy ``config`` - kept as in the original; confirm
            # this is intended.
            config2 = ConfigParser(config=BASE)
            config2.feature_selection[selection_name] = {
                'method': 'VarianceThreshold',
                'score_func': None,
                'threshold': 0.08,
            }
            X = sampler()
            steps = pipeline.make_pipeline_steps(config2,
                                                 action['pipeline'])
            pipe = Pipeline(steps)
            # Repeated fit_transform calls on the same input must each keep
            # the column count under 40.
            for _ in range(5):
                XX, _, _ = pipe.fit_transform(X)
                assert XX.flat.shape[1] < 40
Esempio n. 3
0
def tst_one_pipeline(pipeline,
                     add_na_per_band=0,
                     na_fields_as_str=True,
                     delim='_'):
    """Run ``pipeline`` on a sample from ``random_elm_store()``.

    Parameters:
        pipeline: pipeline spec handed to ``make_config`` and
            ``make_pipeline_steps``.
        add_na_per_band: when truthy, this many randomly chosen cells per
            band are overwritten: the first half with a "missing" marker
            value, the second half with an "invalid" marker value.
        na_fields_as_str: when true, the missing/invalid/valid attributes
            written on each band are converted to strings (lists become
            ``', '``-joined text).
        delim: separator placed inside the attribute names, e.g.
            ``missing_value`` for ``'_'``.

    Returns:
        ``(sample, transformed)`` where ``transformed`` is element 0 of the
        pipeline's ``fit_transform`` output.
    """
    from elm.sample_util.sample_pipeline import make_pipeline_steps

    sample = random_elm_store()
    if add_na_per_band:
        half = add_na_per_band // 2
        missing_key = 'missing{}value'.format(delim)
        invalid_key = 'invalid{}range'.format(delim)
        valid_key = 'valid{}range'.format(delim)
        for band_idx, band_name in enumerate(sample.data_vars):
            band = getattr(sample, band_name)
            val = band.values

            # Random ordering of the flat cell indices, split into a
            # (first-axis, second-axis) coordinate pair; indexing below
            # assumes each band is 2-D.
            order = np.arange(val.size)
            np.random.shuffle(order)
            x, y = divmod(order, val.shape[0])

            # First half of the chosen cells carries the missing marker.
            missing_marker = 99 * band_idx
            val[y[:half], x[:half]] = missing_marker
            band.attrs[missing_key] = missing_marker

            # Second half carries a value inside the invalid range.
            val[y[half:add_na_per_band], x[half:add_na_per_band]] = 199 * band_idx
            band.attrs[invalid_key] = [198 * band_idx, 200 * band_idx]
            band.attrs[valid_key] = [-1e12, 1e12]

            if na_fields_as_str:
                for key in (missing_key, invalid_key, valid_key):
                    raw = band.attrs[key]
                    if isinstance(raw, list):
                        text = ', '.join(str(item) for item in raw)
                    else:
                        text = str(raw)
                    band.attrs[key] = text

            # The markers are plain numbers; no NaN may be present.
            assert not np.isnan(val).any()

    config = ConfigParser(config=make_config(pipeline, data_source))
    pipe = Pipeline(make_pipeline_steps(config, pipeline))
    transformed = pipe.fit_transform(sample)
    return sample, transformed[0]
Esempio n. 4
0
def test_simple():
    """A one-step Flatten pipeline returns ``(X, y, sample_weight)``.

    ``X`` must be an ElmStore exposing ``flat``; ``y`` and ``sample_weight``
    must both be ``None``.
    """
    pipe = Pipeline([('a', steps.Flatten())])
    # fit_transform should always return (X, y, sample_weight)
    result = pipe.fit_transform(**data_source)
    X, y, sample_weight = result
    assert isinstance(X, ElmStore)
    assert hasattr(X, 'flat')
    assert y is None
    assert sample_weight is None
Esempio n. 5
0
def test_feature_selection(feat_cls):
    """Placeholder for per-class feature-selection tests (currently xfail).

    ``feat_cls`` is the name of an attribute of ``steps`` to instantiate as
    the last pipeline step.
    """
    pytest.xfail('This test doesnt test anything yet')
    # pytest.xfail raises, so nothing below runs until the call above is
    # removed; it sketches the intended test.
    selector_cls = getattr(steps, feat_cls)
    init_kwargs = {}  # come up with some initialization kwargs
    step_list = [
        steps.Flatten(),
        steps.ModifySample(get_y),
        selector_cls(**init_kwargs),
    ]
    pipe = Pipeline(step_list)
    X, y, sample_weight = pipe.fit_transform(**data_source)
Esempio n. 6
0
def test_modify_sample():
    """steps.ModifySample should wrap any function for use in a Pipeline.

    The wrapped function is expected to have the signature::

        func(X, y=None, sample_weight=None, **kwargs)

    and to return the tuple::

        (X, y, sample_weight)

    Here ``get_y`` is wrapped and run after Flatten; the resulting ``X``
    must not be None and ``y`` must be a numpy array.
    """
    step_list = [steps.Flatten(), steps.ModifySample(get_y)]
    pipe = Pipeline(step_list)
    result = pipe.fit_transform(**data_source)
    X, y, sample_weight = result
    assert X is not None
    assert isinstance(y, np.ndarray)
Esempio n. 7
0
def test_sklearn_preproc(scale_encode_cls):
    """Placeholder for per-class scaling/encoding step tests (currently xfail).

    ``scale_encode_cls`` is the name of an attribute of ``steps`` to
    instantiate as the step following Flatten.
    """
    pytest.xfail('This test doesnt test anything yet')
    # pytest.xfail raises, so nothing below runs until the call above is
    # removed; it sketches the intended test.
    preproc_cls = getattr(steps, scale_encode_cls)
    init_kwargs = {}  # come up with some initialization kwargs
    step_list = [steps.Flatten(), preproc_cls(**init_kwargs)]
    pipe = Pipeline(step_list)
    X, y, sample_weight = pipe.fit_transform(**data_source)