Example #1
def test_pipeline_feature_selection():
    tag = selection_name = 'variance_selection'
    config = copy.deepcopy(BASE)
    with tmp_dirs_context(tag) as (train_path, predict_path, cwd):
        for idx, action in enumerate(config['run']):
            if 'train' in action or 'predict' in action:
                train_name = action.get('train', action.get('predict'))
                if 'pipeline' in action:
                    if not isinstance(action['pipeline'], (list, tuple)):
                        action['pipeline'] = config['pipelines'][
                            action['pipeline']]
                    action['pipeline'] += [{
                        'feature_selection': selection_name
                    }]
                else:
                    action['pipeline'] = [{
                        'feature_selection': selection_name
                    }]

                config2 = ConfigParser(config=BASE)
                config2.feature_selection[selection_name] = {
                    'method': 'VarianceThreshold',
                    'score_func': None,
                    'threshold': 0.08,
                }
                X = sampler()
                steps = pipeline.make_pipeline_steps(config2,
                                                     action['pipeline'])
                pipe = Pipeline(steps)
                transform_models = None
                for repeats in range(5):
                    XX, _, _ = pipe.fit_transform(X)
                    assert XX.flat.shape[1] < 40
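BASE above is a module-level config dict not shown in this snippet; a hypothetical sketch of the structure the test above walks (keys inferred from the loop, all values illustrative):

BASE = {
    'run': [{'train': 'kmeans', 'pipeline': 'flatten_steps'}],  # hypothetical
    'pipelines': {'flatten_steps': [{'flatten': 'C'}]},
    'feature_selection': {},
}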
Example #2
def tst_one_pipeline(pipeline,
                     add_na_per_band=0,
                     na_fields_as_str=True,
                     delim='_'):
    from elm.sample_util.sample_pipeline import make_pipeline_steps
    sample = random_elm_store()
    if add_na_per_band:
        for idx, band in enumerate(sample.data_vars):
            band_arr = getattr(sample, band)
            val = band_arr.values
            inds = np.arange(val.size)
            np.random.shuffle(inds)
            x = inds // val.shape[0]
            y = inds % val.shape[0]
            slc = slice(None, add_na_per_band // 2)
            val[y[slc], x[slc]] = 99 * idx
            band_arr.attrs['missing{}value'.format(delim)] = 99 * idx
            slc = slice(add_na_per_band // 2, add_na_per_band)
            val[y[slc], x[slc]] = 199 * idx
            band_arr.attrs['invalid{}range'.format(delim)] = [198 * idx, 200 * idx]
            band_arr.attrs['valid{}range'.format(delim)] = [-1e12, 1e12]
            if na_fields_as_str:
                for field in ('missing{}value', 'invalid{}range', 'valid{}range'):
                    field = field.format(delim)
                    v = band_arr.attrs[field]
                    if isinstance(v, list):
                        band_arr.attrs[field] = ', '.join(map(str, v))
                    else:
                        band_arr.attrs[field] = str(v)
            assert val[np.isnan(val)].size == 0
    config = ConfigParser(config=make_config(pipeline, data_source))
    pipe = Pipeline(make_pipeline_steps(config, pipeline))
    new_es = pipe.fit_transform(sample)
    return sample, new_es[0]
Example #3
def test_kmeans_simple_X(client=None):
    pipe = Pipeline([steps.Flatten(),
                     MiniBatchKMeans(n_clusters=6)])
    fitted = pipe.fit_ensemble(X=X, **ENSEMBLE_KWARGS)
    _train_asserts(fitted, ENSEMBLE_KWARGS['saved_ensemble_size'])
    pred = fitted.predict_many(X=X)
    assert len(pred) == len(fitted.ensemble)
Example #4
def test_kmeans_simple_sampler(client=None):
    pipe = Pipeline([steps.Flatten(), MiniBatchKMeans(n_clusters=6)])
    kw = SAMPLER_DATA_SOURCE.copy()
    kw.update(ENSEMBLE_KWARGS)
    fitted = pipe.fit_ensemble(**kw)
    ens = fitted.ensemble
    _train_asserts(fitted, ENSEMBLE_KWARGS['saved_ensemble_size'])
    pred = fitted.predict_many(**SAMPLER_DATA_SOURCE)
    assert len(pred) == len(SAMPLER_DATA_SOURCE['args_list']) * len(ens)
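SAMPLER_DATA_SOURCE and ENSEMBLE_KWARGS are module-level fixtures not shown in this snippet; a minimal sketch consistent with how they are used above (all values hypothetical):

def sampler(height, width, bands, **kw):
    # Hypothetical sampler returning a random ElmStore of the given shape
    return random_elm_store(height=height, width=width, bands=bands)

SAMPLER_DATA_SOURCE = dict(sampler=sampler, args_list=[(100, 80, 4)] * 3)
ENSEMBLE_KWARGS = dict(ngen=2, init_ensemble_size=4, saved_ensemble_size=4)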
Example #5
def test_predict():
    p = Pipeline(flat_poly_var_kmeans)
    # sample below is X, y, sample_weight
    sample = p.create_sample(**data_source)
    # fitted is a Pipeline instance (it returns self after fitting)
    fitted = p.fit(*sample)
    # this should be a numpy array
    pred = fitted.predict(*sample)
    assert isinstance(pred, np.ndarray)
Example #6
def test_simple():

    p = Pipeline([('a', steps.Flatten())])
    # fit_transform should always return (X, y, sample_weight)
    X, y, sample_weight = p.fit_transform(**data_source)
    assert isinstance(X, ElmStore)
    assert hasattr(X, 'flat')
    assert y is None
    assert sample_weight is None
Example #7
def test_feature_selection(feat_cls):
    pytest.xfail("This test doesn't test anything yet")
    step_cls = getattr(steps, feat_cls)
    init_kwargs = {}  # come up with some initialization kwargs
    p = Pipeline(
        [steps.Flatten(),
         steps.ModifySample(get_y),
         step_cls(**init_kwargs)])
    X, y, sample_weight = p.fit_transform(**data_source)
Example #8
def test_supervised_feat_select_X_y(client=None):
    '''Has a ModifySample step to get necessary y data'''
    pipe = Pipeline([steps.Flatten(),
            steps.SelectPercentile(score_func=f_classif, percentile=50),
            SGDClassifier()])
    en = dict(method_kwargs=dict(classes=[0, 1, 2]), **ENSEMBLE_KWARGS)
    en.update(X_Y_DATA_SOURCE)
    fitted = pipe.fit_ensemble(**en)
    _train_asserts(fitted, en['saved_ensemble_size'])
    pred = fitted.predict_many(**X_Y_DATA_SOURCE)
    assert len(pred) == len(fitted.ensemble)
Example #9
def test_simple():
    p = Pipeline([steps.Flatten(), MiniBatchKMeans(n_clusters=5),])
    args_list = [(100, 200, 5)] * 10 # (height, width, bands)
    data_source = dict(sampler=example_sampler, args_list=args_list)
    ensemble_kw = dict(ngen=2, init_ensemble_size=2)
    ensemble_kw.update(data_source)
    fitted = p.fit_ensemble(**ensemble_kw)
    tagged_fitted_models = fitted.ensemble
    (tag1, model1), (tag2, model2) = tagged_fitted_models # ensemble size of 2 here
    X = example_sampler(100, 400, 5)
    pred1 = model1.predict(X)
    pred2 = model2.predict(X)
    assert pred1.shape == pred2.shape == (400 * 100,)
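example_sampler is defined elsewhere in the test module; a minimal sketch consistent with the (height, width, bands) signature used above (hypothetical implementation):

def example_sampler(height, width, bands, **kwargs):
    # Hypothetical: build a random ElmStore with the requested shape
    return random_elm_store(height=height, width=width, bands=bands)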
Example #10
def test_modify_sample():
    '''steps.ModifySample should take any function and call it in Pipeline.

    The signature of the function should be:

    func(X, y=None, sample_weight=None, **kwargs)

    and it should return a tuple of:

    (X, y, sample_weight)

    '''
    p = Pipeline([steps.Flatten(), steps.ModifySample(get_y)])
    X, y, sample_weight = p.fit_transform(**data_source)
    assert X is not None
    assert isinstance(y, np.ndarray)
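The get_y function passed to steps.ModifySample above is not shown; a minimal sketch that satisfies the documented contract (the implementation is hypothetical, only the signature and return tuple are fixed):

def get_y(X, y=None, sample_weight=None, **kwargs):
    # Hypothetical: label each row of the flattened data by whether its
    # mean exceeds the global mean, then return the required 3-tuple
    val = X.flat.values
    y = (val.mean(axis=1) > val.mean()).astype(np.int32)
    return X, y, sample_weight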
Example #11
def _setup(config=None):
    '''Return the config above and the param_grid'''
    from elm.sample_util.sample_pipeline import make_pipeline_steps
    from elm.pipeline import Pipeline
    if not config:
        config = ConfigParser(config=yaml.safe_load(CONFIG_STR))
    sample_steps = make_pipeline_steps(config, config.run[0]['pipeline'])
    estimator = [('kmeans', MiniBatchKMeans(**config.train['kmeans']['model_init_kwargs']))]
    pipe = Pipeline(sample_steps + estimator)
    idx_to_param_grid = ea_setup(config)
    return config, pipe, idx_to_param_grid
Example #12
def test_kmeans_model_selection(client=None):

    pipe = Pipeline([steps.Flatten(),
                     ('pca', steps.Transform(IncrementalPCA())),
                     ('kmeans', MiniBatchKMeans(n_clusters=5))],
                    scoring=kmeans_aic,
                    scoring_kwargs={'score_weights': [-1]})

    def samp(*args, **kwargs):
        return random_elm_store(bands=12, mn=0, mx=1, height=20, width=40)

    en = ENSEMBLE_KWARGS.copy()
    n_clusters_choices = list(range(3, 10))

    def init(pipe, **kwargs):
        estimators = []
        for _ in range(100):
            n_components = np.random.choice(np.arange(2, 6))
            n_clusters = np.random.choice(n_clusters_choices)
            estimator = copy.deepcopy(pipe)
            estimator.set_params(kmeans__n_clusters=n_clusters,
                                 pca__n_components=n_components)
            estimators.append(estimator)
        return estimators

    en['ngen'] = 20
    en['model_scoring'] = kmeans_aic
    en['ensemble_init_func'] = init
    en['model_selection_kwargs'] = dict(drop_n=30,
                                        evolve_n=30,
                                        choices=n_clusters_choices)
    en['model_selection'] = kmeans_model_averaging
    sa = SAMPLER_DATA_SOURCE.copy()
    sa['sampler'] = samp
    en.update(sa)
    fitted = pipe.fit_ensemble(**en)
    assert len(fitted.ensemble) == en['saved_ensemble_size']
    preds = fitted.predict_many(**sa)
    assert len(preds) == len(fitted.ensemble) * len(
        SAMPLER_DATA_SOURCE['args_list'])
Example #13
def test_set_params_get_params():
    '''Assert setting with double underscore
    parameter names will work ok'''
    p = Pipeline(flat_poly_var_kmeans)
    kw = dict(kmeans__n_clusters=9,
              poly__interaction_only=False,
              var__threshold=1e-8)
    p.set_params(**kw)
    params = p.get_params()
    for k, v in kw.items():
        assert k in params and params[k] == v
    with pytest.raises(ValueError):
        p.set_params(kmeans_n_clusters=9)  # no double underscore
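flat_poly_var_kmeans is a module-level list of steps shared by several tests here but not shown; a hypothetical sketch consistent with the 'poly', 'var', and 'kmeans' parameter names used above:

flat_poly_var_kmeans = [steps.Flatten(),
                        ('poly', steps.PolynomialFeatures(interaction_only=True)),
                        ('var', steps.VarianceThreshold(threshold=1e-8)),
                        ('kmeans', KMeans(n_clusters=5))]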
Example #14
def load_pipe_from_tag(elm_train_path, tag):
    '''Calls Pipeline.load for a tagged saved Pipeline in elm_train_path

    Parameters:
        :elm_train_path:  root dir for serializing training outputs
        :tag:             tag that was given to elm.pipeline.serialize.serialize_pipe

    Returns:
        :elm.pipeline.Pipeline: instance (fitted if it was fitted before saving)
    '''
    from elm.pipeline import Pipeline
    logger.debug('Load {} from {}'.format(tag, elm_train_path))
    path = _get_path_for_tag(elm_train_path, tag)
    if not os.path.exists(path):
        raise IOError('Cannot load from {} (does not exist)'.format(path))
    return Pipeline.load(path)
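A short usage sketch (the directory and tag below are made-up placeholders):

pipe = load_pipe_from_tag('/tmp/elm_train_outputs', 'kmeans-run-1')
# If the Pipeline was fitted before elm.pipeline.serialize.serialize_pipe
# saved it, it can predict directly, e.g. pipe.predict(X)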
Example #15
def test_pipeline_new_with_params():
    p = Pipeline([
        steps.SelectCanvas('band_1'),
        steps.Flatten(),
        ('pca', steps.Transform(IncrementalPCA(n_components=3))),
        ('kmeans', KMeans(n_clusters=4))
    ])
    p.fit(random_elm_store())
    p.predict(random_elm_store())
    assert p.steps[-1][-1].cluster_centers_.shape[0] == 4
    p2 = p.new_with_params(kmeans__n_clusters=7, pca__n_components=2)
    with pytest.raises(NotFittedError):
        p2.predict(random_elm_store())
    p2.fit(random_elm_store())
    assert p2.steps[-1][-1].cluster_centers_.shape[0] == 7
Example #16
def new_pipeline(*args, flatten_first=True):
    trans = []
    for idx, model in enumerate(args):
        parts = model._cls.__name__.split('.')
        name = parts[-1]
        if any(part in SKIP for part in parts):
            pytest.skip('{} - not implemented'.format(model._cls.__name__))
        out = get_params_for_est(model, name)
        if idx == 0:
            X, y, params, data_kw = out
        else:
            _, _, params, data_kw = out
        # Some estimators require "score_func" as an argument (and hence
        # y, even in cases where y may not be required by other
        # estimators in the Pipeline instance)
        if 'score_func' in params:
            if y is None:
                val = X.to_features().features.values
                y = val.dot(np.random.uniform(0, 1, val.shape[1]))
        trans.append(('step_{}'.format(idx + 1), model(**params)))
        if data_kw['is_classifier']:
            y = (y > y.mean()).astype(np.int32)

    if flatten_first:
        # Add a step to convert first from MLDataset with
        # >=1 DataArrays to a single one with a
        # "features" DataArray - see "to_features" in
        # xarray_filters
        def to_feat(X, y=None):
            if hasattr(X, 'to_features'):
                return X.to_features()
            return X
        flatten = Generic(func=to_feat)
        trans = [('step_0', flatten)] + trans
    pipe = Pipeline(trans)
    return pipe, X, y
Example #17
def test_sklearn_preproc(scale_encode_cls):
    pytest.xfail("This test doesn't test anything yet")
    step_cls = getattr(steps, scale_encode_cls)
    init_kwargs = {}  # come up with some initialization kwargs
    p = Pipeline([steps.Flatten(), step_cls(**init_kwargs)])
    X, y, sample_weight = p.fit_transform(**data_source)
Example #18
def ensemble_layer_2(pipe, **kw):
    '''A simple model for the second layer (model on models).
    RidgeCV is a good choice in the second layer since
    colinearity is expected among the predictions from the
    first layer that form an input matrix to the second layer'''
    return [Pipeline([RidgeCV()], **pipeline_kw)]
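To make the two-layer ("model on models") idea concrete, here is a rough sketch, not elm API, of how first-layer predictions could form the input matrix for the second layer:

import numpy as np

def second_layer_X(first_layer_models, X):
    # One column per first-layer model's prediction; these columns tend
    # to be collinear, which is why a regularized estimator like RidgeCV
    # is a sensible choice for the second layer
    preds = [model.predict(X) for model in first_layer_models]
    return np.column_stack(preds)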
Example #19
def ensemble_init_func(pipe, **kw):
    '''Create an ensemble of regression models to predict soil moisture
    where PCA, scaling, and/or log transformation may follow preamble
    steps of flattening a Dataset and extracting the Y data, among other
    preprocessors.

    Parameters:
        pipe: Ignored
        **kw: Keyword arguments:
            scalers: List of (name, scaler) tuples such as
                     [('StandardScaler', steps.StandardScaler(with_mean=True)),
                      ('RobustScaler', steps.RobustScaler(with_centering=True))]
            n_components: List of PCA # of components to try. May include None
                          if skipping PCA step
            estimators: List of (name, estimator) tuples where estimator
                        may be any scikit-learn-like regressor, e.g.
                        [('estimator', LinearRegression())]
            log:        Log transform step, e.g.:
                        ('log', steps.ModifySample(log_scaler))
        summary:    String summary of preamble steps to prepend to
                        parameter summary

    Returns:
        ensemble: List of Pipeline instances
    '''
    ensemble = []
    scalers = kw['scalers']
    n_components = kw['n_components']
    pca = kw['pca']
    estimators = kw['estimators']
    preamble = kw['preamble']
    summary_template = kw['summary']
    minmax_bounds = kw['minmax_bounds']
    log = kw['log']

    for s_label_0, scale_0 in scalers:
        if 'MinMax' in s_label_0:
            # Make MinMaxScaler objects
            labels = [s_label_0 + repr(mb) for mb in minmax_bounds]
            scalers_with_params = [scale_0(*mb) for mb in minmax_bounds]
            scalers_with_params = zip(labels, scalers_with_params)
        elif scale_0:
            # Just keep the StandardScaler as is
            scalers_with_params = [(s_label_0, scale_0())]
        else:
            # No scaling
            scalers_with_params = [(s_label_0, None)]
        for s_label, scale in scalers_with_params:
            for n_c in n_components:
                for e_label, estimator in estimators:
                    scale_step = [scale] if scale else []
                    if 'MinMax' in s_label:
                        # Log transform only works with MinMaxScaler
                        # and positive min bound
                        scale_step += [log]
                    pca_step = [pca()] if n_c and scale else []
                    new = Pipeline(
                        preamble() + scale_step + pca_step + [estimator()],
                        **pipeline_kw)
                    if pca_step:
                        new.set_params(pca__n_components=n_c)
                        msg = '{} components'.format(n_c)
                    else:
                        msg = ' (None)'
                    args = (s_label, msg, e_label)
                    summary = ': Scaler: {} PCA: {} Estimator: {}'.format(
                        *args)
                    new.summary = summary_template + summary
                    print(new.summary)
                    ensemble.append(new)
    return ensemble
Example #20
def test_poly():
    s = flat_poly_var_kmeans
    p = Pipeline(s[:1])
    flat, y, sample_weight = p.fit_transform(**data_source)
    assert hasattr(flat, 'flat')
    p = Pipeline(s[:2])
    more_cols, _, _ = p.fit_transform(**data_source)
    assert more_cols.flat.shape[1] > flat.flat.shape[1]
    p = Pipeline(s[:3])
    feat_sel = p.fit_transform(**data_source)
    assert isinstance(feat_sel, tuple)
    p = Pipeline(s)  # thru KMeans
    # fit should always return a Pipeline instance (self after fitting)
    fitted = p.fit(**data_source)
    assert isinstance(fitted, Pipeline)
    assert isinstance(fitted.steps[-1][-1], KMeans)
    assert (fitted._estimator.cluster_centers_.shape[0] ==
            fitted.get_params()['kmeans__n_clusters'])
    # predict should return KMeans's predict output
    pred = p.predict(**data_source)
    # fit_transform here should return the transform of the KMeans,
    # the distances in each dimension to the cluster centers.
    out = p.fit_transform(**data_source)
    assert isinstance(out, tuple) and len(out) == 3
    X, _, _ = out
    assert X.shape[0] == pred.size
Example #21
ELM_EXAMPLE_DATA_PATH = os.environ['ELM_EXAMPLE_DATA_PATH']


def make_example_y_data(X, y=None, sample_weight=None, **kwargs):
    fitted = MiniBatchKMeans(n_clusters=5).fit(X.flat.values)
    y = fitted.predict(X.flat.values)
    return (X, y, sample_weight)


pipeline_steps = [
    steps.Flatten(),
    steps.ModifySample(make_example_y_data),
    ('top_n', steps.SelectPercentile(percentile=80, score_func=f_classif)),
    ('kmeans', MiniBatchKMeans(n_clusters=4))
]
pipeline = Pipeline(pipeline_steps, scoring=kmeans_aic)
param_grid = {
    'kmeans__n_clusters': list(range(5, 10)),
    'control': {
        'select_method': 'selNSGA2',
        'crossover_method': 'cxTwoPoint',
        'mutate_method': 'mutUniformInt',
        'init_pop': 'random',
        'indpb': 0.5,
        'mutpb': 0.9,
        'cxpb': 0.3,
        'eta': 20,
        'ngen': 2,
        'mu': 4,
        'k': 4,
        'early_stop': {
Example #22
              if meta_is_day(load_hdf4_meta(f))]

def sampler(fname, **kw):
    return (load_array(fname, band_specs=band_specs), None, None)

data_source = {
    'sampler': sampler,
    'args_list': HDF4_FILES,
}

pipeline_steps = [steps.Flatten(),
                  ('scaler', steps.StandardScaler()),
                  ('pca', steps.Transform(IncrementalPCA(n_components=4), partial_fit_batches=2)),
                  ('kmeans', MiniBatchKMeans(n_clusters=4, compute_labels=True)),]
pipeline = Pipeline(pipeline_steps,
                    scoring=kmeans_aic,
                    scoring_kwargs=dict(score_weights=[-1]))

def ensemble_init_func(pipe, **kw):
    return [pipe.new_with_params(kmeans__n_clusters=np.random.choice(range(6, 10)))
            for _ in range(4)]

ensemble_kwargs = {
    'model_selection': kmeans_model_averaging,
    'model_selection_kwargs': {
        'drop_n': 2,
        'evolve_n': 2,
    },
    'ensemble_init_func': ensemble_init_func,
    'ngen': 3,
    'partial_fit_batches': 2,