Example #1
def _run_model_selection(models, model_selection, model_selection_kwargs,
                         ngen, generation,
                         scoring_kwargs):
    '''Run model selection after adding ngen and generation to the kwargs
    and resolving the right fitness-sorting function

    Returns:
        list of (tag, model) tuples'''
    model_selection_kwargs['ngen'] = ngen
    model_selection_kwargs['generation'] = generation
    scoring_kwargs = scoring_kwargs or {}
    score_weights = scoring_kwargs.get('score_weights',
                                       model_selection_kwargs.get('score_weights'))
    sort_fitness = scoring_kwargs.get('sort_fitness',
                                      model_selection_kwargs.get('sort_fitness'))
    if not sort_fitness:
        sort_fitness = pareto_front
    else:
        sort_fitness = import_callable(sort_fitness)
    kw = {k: v for k, v in model_selection_kwargs.items()
          if k not in ('score_weights',)}
    models = base_selection(models,
                            model_selection=model_selection,
                            sort_fitness=sort_fitness,
                            score_weights=score_weights,
                            **kw)
    models = _validate_ensemble_members(models)
    return models
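
For context, here is a minimal sketch of what an import_callable-style resolver can look like, assuming a 'package.module:function' string convention (the real elm.config helper may accept other forms):

from importlib import import_module

def import_callable_sketch(obj):
    # Hypothetical stand-in for elm's import_callable: pass callables
    # through unchanged, resolve 'pkg.mod:func' strings via importlib
    if callable(obj):
        return obj
    module_name, _, func_name = obj.partition(':')
    func = getattr(import_module(module_name), func_name)
    if not callable(func):
        raise TypeError('{} is not callable'.format(obj))
    return func

sorter = import_callable_sketch('builtins:sorted')  # e.g. a sort_fitness string
assert sorter([3, 1, 2]) == [1, 2, 3]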
Example #2
def _import_score_func(self, **params):
    # If "score_func" is a string, resolve it to a callable: first as an
    # attribute of skfeat (sklearn.feature_selection), else via import_callable
    if 'score_func' in params:
        if isinstance(params['score_func'], str):
            sf = getattr(skfeat, params['score_func'], None)
            if not sf:
                sf = import_callable(params['score_func'])
            params['score_func'] = sf
    return params
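
A short usage sketch: assuming skfeat is sklearn.feature_selection (as the getattr lookup suggests), a string score_func such as 'f_classif' resolves to the sklearn callable:

import sklearn.feature_selection as skfeat

params = {'score_func': 'f_classif', 'k': 10}
sf = getattr(skfeat, params['score_func'], None)  # same lookup as above
assert sf is skfeat.f_classif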
Example #3
def fit_transform(self, X, y=None, sample_weight=None, **kwargs):
    from elm.sample_util.sample_pipeline import _split_pipeline_output
    # Merge call-time kwargs with those fixed at construction, resolve
    # self.func to a callable, then split its output into (X, y, sample_weight)
    kw = dict(y=y, sample_weight=sample_weight, **kwargs)
    kw.update(self.kwargs)
    func = import_callable(self.func)
    output = func(X, **kw)
    return _split_pipeline_output(output, X, y, sample_weight,
                                  'ModifySample')
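
A sketch of a user function this ModifySample step could wrap; the function name and the offset keyword are made up for illustration:

import numpy as np

def add_offset(X, y=None, sample_weight=None, offset=0.0, **kwargs):
    # Hypothetical sample-modifying callable: receives X plus the merged
    # kwargs built above and returns the modified sample
    return X + offset

X = np.arange(6.0).reshape(2, 3)
print(add_offset(X, offset=1.0))  # every element shifted by 1.0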
Example #4
def take_geo_transform_from_meta(band_spec=None, required=True, **meta):
    if band_spec and getattr(band_spec, 'meta_to_geotransform', False):
        func = import_callable(band_spec.meta_to_geotransform)
        geo_transform = func(**meta)
        if not isinstance(geo_transform, Sequence) or len(geo_transform) != 6:
            raise ValueError(
                'band_spec.meta_to_geotransform {} did not return a sequence of len 6'
                .format(band_spec.meta_to_geotransform))
        return geo_transform
    elif required:
        geo_transform = grid_header_to_geo_transform(**meta)
        return geo_transform
    return None
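
A sketch of a band_spec.meta_to_geotransform callable: it must return a GDAL-order 6-sequence (x origin, x pixel size, row rotation, y origin, column rotation, negative y pixel size); the metadata key names here are assumptions:

def meta_to_geo_transform_sketch(**meta):
    # Hypothetical meta_to_geotransform: build a GDAL-order 6-tuple
    # from made-up metadata keys (north-up, so y pixel size is negative)
    return (meta['x_min'], meta['pixel_width'], 0.0,
            meta['y_max'], 0.0, -meta['pixel_height'])

gt = meta_to_geo_transform_sketch(x_min=-180.0, y_max=90.0,
                                  pixel_width=0.25, pixel_height=0.25)
assert len(gt) == 6  # satisfies the length check above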
Example #5
def create_sample_from_data_source(config=None, **data_source):
    '''Create a sample by calling the "sampler" named in a data_source dict

    Params:
        :config:      full elm.config.ConfigParser instance, or None
        :data_source: keywords including "sampler" (required) and optionally
                      "sampler_args", "reader", and "*_filter" callables

    Returns the output of sampler_func(*sampler_args, **data_source)
    '''
    # TODO: the "sampler" requirement needs to be added to ConfigParser validation
    sampler_func = data_source['sampler']
    sampler_func = import_callable(sampler_func)
    sampler_args = data_source.get('sampler_args') or ()
    if not isinstance(sampler_args, (tuple, list)):
        sampler_args = (sampler_args, )
    reader_name = data_source.get('reader') or None
    if isinstance(reader_name, str) and reader_name:
        if config and reader_name in config.readers:
            reader = config.readers[reader_name]
        _load_meta = partial(load_meta, reader=reader_name)
        _load_array = partial(load_array, reader=reader_name)
    elif isinstance(reader_name, dict):
        reader = reader_name
        _load_meta = import_callable(reader['load_meta'], True,
                                     reader['load_meta'])
        _load_array = import_callable(reader['load_array'], True,
                                      reader['load_array'])
    else:
        _load_array = load_array
        _load_meta = load_meta
    data_source['load_meta'] = _load_meta
    data_source['load_array'] = _load_array
    for k in data_source:
        if '_filter' in k and data_source[k] and k != 'geo_filters':
            data_source[k] = import_callable(data_source[k])
    return sampler_func(*sampler_args, **data_source)
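
A sketch of the sampler contract this function enforces; the sampler name and arguments are hypothetical:

import numpy as np

def random_sample(n_rows, n_cols, **data_source):
    # Hypothetical sampler: invoked as sampler_func(*sampler_args, **data_source),
    # so it must tolerate the remaining data_source keywords
    return np.random.uniform(0, 1, (n_rows, n_cols))

data_source = {'sampler': random_sample,  # or a 'pkg.mod:func' string
               'sampler_args': (100, 8)}
X = random_sample(*data_source['sampler_args'])
print(X.shape)  # (100, 8)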
Example #6
def config_to_pipeline(config, client=None):
    '''
    Run the train and predict actions in the elm config's "run" list
    with the given dask client, applying the config updates implied by
    args passed to elm-main, such as --train-only or --predict-only,
    or edits to ensemble settings, such as --ngen 4

    Parameters:
        :config: elm.config.ConfigParser instance
        :client: dask client or None
    '''
    from elm.sample_util.sample_pipeline import make_pipeline_steps

    _makedirs(config)
    idx_to_evo_params = ea_setup(config)
    for idx, step in enumerate(config.run):
        pipeline = step['pipeline']
        if 'train' in step:
            train = config.train[step['train']]
            pipe_steps = make_pipeline_steps(config, pipeline)
            cls = import_callable(train['model_init_class'])
            estimator = cls(**(train.get('model_init_kwargs') or {}))
            pipe_steps.append((step['train'], estimator))
            ensemble_kwargs = train.get('ensemble')
            if isinstance(ensemble_kwargs, str):
                ensemble_kwargs = config.ensembles[ensemble_kwargs]
            # copy so the mutation below cannot edit the shared config dict
            ensemble_kwargs = dict(ensemble_kwargs or {})
            ensemble_kwargs['client'] = client
        data_source = step['data_source']
        if not isinstance(data_source, dict):
            data_source = config.data_sources[data_source]
        data_source['sampler'] = import_callable(data_source['sampler'])
        data_source['load_meta'] = load_meta
        data_source['load_array'] = load_array
        if callable(data_source.get('args_list')):
            kw = {k: v for k, v in data_source.items() if k != 'args_list'}
            data_source['args_list'] = tuple(data_source['args_list'](**kw))
        if 'train' in step and not getattr(config, 'PREDICT_ONLY', False):
            s = train.get('model_scoring')
            if s:
                scoring = config.model_scoring[s]
                scoring_kwargs = {
                    k: v
                    for k, v in scoring.items() if k != 'scoring'
                }
                scoring = import_callable(scoring['scoring'])
            else:
                scoring = None
                scoring_kwargs = {}
            if 'method_kwargs' in train:
                method_kwargs = train['method_kwargs']
            else:
                method_kwargs = {}
            if 'classes' in train:
                method_kwargs['classes'] = train['classes']
            ensemble_kwargs['method_kwargs'] = method_kwargs
            pipe = Pipeline(pipe_steps,
                            scoring=scoring,
                            scoring_kwargs=scoring_kwargs)
            evo_params = idx_to_evo_params.get(idx, None)
            if evo_params:
                kw = dict(evo_params=evo_params)
                kw.update(data_source)
                kw.update(ensemble_kwargs)
                pipe.fit_ea(**kw)
            else:
                kw = {}
                kw.update(data_source)
                kw.update(ensemble_kwargs)
                pipe.fit_ensemble(**kw)

            serialize_pipe(pipe, config.ELM_TRAIN_PATH, step['train'])
        elif 'predict' in step and not getattr(config, 'TRAIN_ONLY', False):
            pipe = load_pipe_from_tag(config.ELM_TRAIN_PATH, step['predict'])

        else:
            logger.info(
                'Do nothing for {} (has no "train" or "predict" key)'.format(
                    step))
        if 'predict' in step and not getattr(config, 'TRAIN_ONLY', False):
            # serialize is called with (prediction, sample, tag)
            serialize = partial(serialize_prediction, config)
            pipe.predict_many(serialize=serialize, **data_source)
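
For orientation, a hypothetical minimal shape of the config sections this function walks; the key names mirror the attribute and key accesses above, and all values are illustrative:

config_sketch = {
    'run': [{'pipeline': ['my_steps'],  # consumed by make_pipeline_steps
             'train': 'kmeans',
             'data_source': 'my_source'}],
    'train': {'kmeans': {'model_init_class': 'sklearn.cluster:MiniBatchKMeans',
                         'model_init_kwargs': {'n_clusters': 5},
                         'ensemble': 'small'}},
    'ensembles': {'small': {'ngen': 2, 'init_ensemble_size': 4}},
    'data_sources': {'my_source': {'sampler': 'mypkg.samplers:random_sample'}},
}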
Example #7
def ensemble(pipe,
             ngen,
             X=None,
             y=None,
             sample_weight=None,
             sampler=None,
             args_list=None,
             client=None,
             init_ensemble_size=1,
             saved_ensemble_size=None,
             ensemble_init_func=None,
             models_share_sample=True,
             model_selection=None,
             model_selection_kwargs=None,
             scoring_kwargs=None,
             method='fit',
             partial_fit_batches=1,
             classes=None,
             method_kwargs=None,
             **data_source):
    '''Fit or partial_fit an ensemble of models to a series of samples

    Call this function from an elm.pipeline.Pipeline instance's methods:
        "fit_ensemble"
        "fit_transform_ensemble"
        "transform_ensemble"

    Parameters:
        pipe: instance of elm.pipeline.Pipeline
        ngen: number of ensemble generations
        X:    earthio.ElmStore, if not using "sampler" and "args_list"
        y:    numpy array if not using "sampler" and "args_list", or None
              if not needed by Pipeline
        sample_weight: numpy array if not using "sampler" and "args_list", or None
              if not needed by Pipeline
        sampler: Callable - required if X is not given.  Called at least
                once per element of args_list, with each element unpacked
                as *element
        args_list: List of args - required if X is not given.  See sampler above
        client: dask-distributed or ThreadPool client
        init_ensemble_size: number of ensemble members, ignored if giving
              ensemble_init_func
        saved_ensemble_size: how many members to keep at final generation
        ensemble_init_func: Callable to return list of elm.pipeline.Pipeline
             instances that initialize ensemble
        models_share_sample: If True, ensure that in each generation, every
             member is fit to the same sample.  If False, fit every model
             to every sample
        model_selection: Callable run after each generation that takes a
            list of (tag, Pipeline) tuples and returns a new list of such
            tuples, or None to train every model in each generation
            without replacing model parameters
        model_selection_kwargs: kwargs passed to model_selection
        scoring_kwargs: kwargs that are passed to score_one_model.
                See also elm.model_selection.scoring
        method: This is the method of Pipeline that called this ensemble
            function, typically "fit"
        classes: Unique sequence of class integers passed to supervised
            classifiers that need the known y classes.
        method_kwargs: any other arguments to pass to method
        **data_source: keywords passed to "sampler" if given
    Returns:

        new_models: list of (tag, Pipeline instance) tuples on which
            "predict_many" can be called
    '''
    get_func = _find_get_func_for_client(client)
    fit_score_kwargs = method_kwargs or {}
    if 'classes' not in fit_score_kwargs and classes is not None:
        fit_score_kwargs['classes'] = classes
    model_selection_kwargs = model_selection_kwargs or {}
    ensemble_size = init_ensemble_size or 1
    partial_fit_batches = partial_fit_batches or 1
    if partial_fit_batches > 1:
        method = 'partial_fit'
    if not ensemble_init_func:
        models = tuple(copy.deepcopy(pipe) for _ in range(ensemble_size))
    else:
        ensemble_init_func = import_callable(ensemble_init_func)
        models = ensemble_init_func(pipe, ensemble_size=ensemble_size)
    logger.info("Init ensemble: {} members".format(len(models)))
    if model_selection:
        model_selection = import_callable(model_selection)
    final_names = []
    dsk = make_samples_dask(X, y, sample_weight, pipe, args_list, sampler,
                            data_source)
    models = tuple(
        zip(('tag_{}'.format(idx) for idx in range(len(models))), models))
    sample_keys = list(dsk)
    if models_share_sample:
        random.shuffle(sample_keys)
        gen_to_sample_key = {
            gen: s
            for gen, s in enumerate(sample_keys[:ngen])
        }
    sample_keys = tuple(sample_keys)
    for gen in range(ngen):
        if models_share_sample:
            sample_keys_passed = (gen_to_sample_key[gen % len(sample_keys)], )
        else:
            sample_keys_passed = sample_keys
        logger.info('Ensemble generation {} of {} - ({} estimators) '.format(
            gen + 1, ngen, len(models)))
        msg = (len(models), len(sample_keys_passed), partial_fit_batches,
               method,
               len(models) * len(sample_keys_passed) * partial_fit_batches,
               gen + 1, ngen)
        logger.info(
            'Ensemble Generation {5} of {6}: ({0} members x {1} samples x {2} calls) = {4} {3} calls this gen'
            .format(*msg))
        dsk, model_keys, new_models_name = _one_generation_dask_graph(
            dsk, models, fit_score_kwargs, sample_keys_passed,
            partial_fit_batches, gen, method)
        if get_func is None:
            new_models = tuple(dask.get(dsk, new_models_name))
        else:
            new_models = tuple(get_func(dsk, new_models_name))
        models = tuple(zip(model_keys, new_models))
        logger.info('Trained {} estimators'.format(len(models)))
        if model_selection:
            models = _run_model_selection(models, model_selection,
                                          model_selection_kwargs or {}, ngen,
                                          gen, scoring_kwargs)

        else:
            pass  # Just train all ensemble members without replacing,
            # re-initializing, or editing the model params
    if saved_ensemble_size:
        final_models = models[:saved_ensemble_size]
    else:
        final_models = models

    return final_models
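
A sketch of a model_selection callable matching the docstring's contract above; the selection rule is a placeholder:

def keep_first_half(models, ngen=None, generation=None, **kwargs):
    # Hypothetical selector: receives (tag, Pipeline) tuples plus kwargs
    # such as the 'ngen' / 'generation' keys injected by _run_model_selection,
    # and returns a new list of (tag, Pipeline) tuples
    keep = max(1, len(models) // 2)  # a real selector would sort by fitness first
    return list(models)[:keep]

members = [('tag_0', 'model_a'), ('tag_1', 'model_b'), ('tag_2', 'model_c')]
print(keep_first_half(members, ngen=3, generation=0))  # [('tag_0', 'model_a')]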
Example #8
def make_pipeline_steps(config, pipeline):
    '''Turn the config's "pipeline" into a list of steps
    to pass to elm.pipeline.Pipeline

    Params:
        :config:   validated config from elm.config.ConfigParser
        :pipeline: the "pipeline" list of action dicts from the config

    This is used by :func:`elm.pipeline.parse_run_config`
    '''
    actions = []
    for action_idx, action in enumerate(pipeline):
        is_dic = isinstance(action, dict)
        if not is_dic:
            # action is already an estimator instance; give it a positional name
            step_name, step_cls = 'step_{}'.format(action_idx), action
        elif 'feature_selection' in action:
            _feature_selection = copy.deepcopy(
                config.feature_selection[action['feature_selection']])
            kw = _feature_selection.copy()
            kw.update(action)
            scaler = _feature_selection['method']
            scaler = import_callable(getattr(skfeat, scaler, scaler))
            if 'func_kwargs' in _feature_selection:
                func = import_callable(_feature_selection['func'])
                scaler = partial(func, _feature_selection['func_kwargs'])
                _feature_selection['func'] = func
            kw = {
                k: v
                for k, v in _feature_selection.items()
                if k not in ('func_kwargs', 'method')
            }
            cls = SKLEARN_PREPROCESSING[_feature_selection['method']]
            step_name = action['feature_selection']
            step_cls = cls(**kw)
        elif 'transform' in action:
            trans = config.transform[action['transform']]
            cls = import_callable(trans['model_init_class'])
            kw = trans.get('model_init_kwargs') or {}
            kw_filter = {
                k: v
                for k, v in kw.items() if k != 'partial_fit_batches'
            }
            t = cls(**kw_filter)
            pfb = trans.get('partial_fit_batches',
                            kw.get('partial_fit_batches'))
            step_name = action['transform']
            step_cls = steps.Transform(t, partial_fit_batches=pfb)
        elif 'sklearn_preprocessing' in action:
            _sklearn_preprocessing = config.sklearn_preprocessing[
                action['sklearn_preprocessing']]
            scaler = _sklearn_preprocessing['method']
            scaler = getattr(skpre, scaler, scaler)
            kw = {
                k: v
                for k, v in _sklearn_preprocessing.items()
                if k not in ('method', 'func_kwargs')
            }
            if 'func' in _sklearn_preprocessing:
                kw['func'] = import_callable(_sklearn_preprocessing['func'])
            cls = SKLEARN_PREPROCESSING[_sklearn_preprocessing['method']]
            step_name = action['sklearn_preprocessing']
            step_cls = cls(**kw)
        elif any(k in CHANGE_COORDS_ACTIONS for k in action):
            _sp_step = [k for k in action if k in CHANGE_COORDS_ACTIONS][0]
            step_name = _sp_step
            for att in dir(steps):
                if isinstance(getattr(steps, att), type):
                    if getattr(getattr(steps, att), '_sp_step',
                               None) == _sp_step:
                        step_cls = getattr(steps,
                                           att).from_config_dict(**action)
                        break

        else:
            # add items to actions of the form:
            # (
            #   module_colon_func_name_as_string,        # string
            #   args_to_func,                            # tuple
            #   kwargs_to_func                           # dict
            # )
            # NOTE also add the key name, like 'transform' to the top of
            # elm.config.load_config global variable:
            # "SAMPLE_PIPELINE_ACTIONS"
            raise NotImplementedError(
                'pipeline action {} not recognized.'.format(action))
        actions.append((step_name, step_cls))
    return actions
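
A hypothetical "pipeline" list exercising the branches above; each dict's key selects the branch and its value names an entry in the matching config section:

pipeline_sketch = [
    {'sklearn_preprocessing': 'standardize'},  # a scaler from SKLEARN_PREPROCESSING
    {'feature_selection': 'top_k'},            # a selector resolved via skfeat
    {'transform': 'pca'},                      # wrapped in steps.Transform
]
# make_pipeline_steps(config, pipeline_sketch) would return a list of
# (step_name, step_instance) tuples ready for elm.pipeline.Pipeline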