Example #1
def predict_many(data_source,
                 saved_model_tag=None,
                 ensemble=None,
                 client=None,
                 serialize=None,
                 to_raster=True,
                 elm_predict_path=None):
    '''See the elm.pipeline.Pipeline.predict_many method'''

    env = parse_env_vars()
    elm_predict_path = elm_predict_path or env.get('ELM_PREDICT_PATH')
    if serialize and elm_predict_path and not os.path.exists(elm_predict_path):
        os.mkdir(elm_predict_path)
    pipe_example = ensemble[0][1]
    ds = data_source.copy()
    X = ds.pop('X', None)
    y = ds.pop('y', None)
    args_list = ds.pop('args_list', None)
    sampler = ds.pop('sampler', None)
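    # make_samples_dask returns a dask task graph: each key names one lazily
    # evaluated sample, built either from X/y directly or from sampler(*args)
    # for each args tuple in args_list.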
    dsk = make_samples_dask(X, y, None, pipe_example, args_list, sampler, ds)
    sample_keys = tuple(dsk)
    args_list = tuple(itertools.product(sample_keys, ensemble))
    keys = []
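    # One prediction task per (sample, estimator) pair; predict_tag records
    # which estimator, sample, and (optionally) saved model produced it.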
    for idx, (sample_key, (estimator_tag, estimator)) in enumerate(args_list):
        name = _next_name('predict_many')
        predict_tag = '{}-{}'.format(estimator_tag, sample_key)
        if saved_model_tag:
            predict_tag += '-' + saved_model_tag
        dsk[name] = (
            _predict_one_sample_one_arg,
            estimator,
            serialize,
            to_raster,
            predict_tag,
            elm_predict_path,
            sample_key,
        )

        keys.append(name)
    logger.info('Predict {} estimator(s) and {} sample(s) '
                '({} combination[s])'.format(len(ensemble), len(sample_keys),
                                             len(args_list)))
    preds = []
    if client is None:
        new = dask.get(dsk, keys)
    else:
        new = client.get(dsk, keys)
    return tuple(itertools.chain.from_iterable(new))
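
A minimal calling sketch (hedged assumptions: `trained` is a list of
(tag, Pipeline) tuples from a prior fit_ensemble/evolve_train run, and
`X_store` is an in-memory earthio.ElmStore; pass client=... to run on a
dask-distributed scheduler):

# Predict with every trained ensemble member on one in-memory sample; put
# sampler=... and args_list=... in the dict instead to draw many samples.
preds = predict_many(dict(X=X_store),
                     ensemble=trained,
                     to_raster=True,
                     serialize=None)  # a truthy serialize plus ELM_PREDICT_PATH
                                      # writes predictions to disk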
Example #2
def evolve_train(pipe,
                 evo_params,
                 X=None,
                 y=None,
                 sample_weight=None,
                 sampler=None,
                 args_list=None,
                 client=None,
                 init_ensemble_size=1,
                 saved_ensemble_size=None,
                 ensemble_init_func=None,
                 scoring_kwargs=None,
                 method='fit',
                 partial_fit_batches=1,
                 classes=None,
                 method_kwargs=None,
                 **data_source):
    '''evolve_train runs an evolutionary algorithm to
    find the most fit elm.pipeline.Pipeline instances

    Parameters:
        pipe: elm.pipeline.Pipeline instance
        evo_params: an EvoParams instance, typically created with
            elm.model_selection.ea_setup:

                from elm.model_selection import ea_setup
                evo_params = ea_setup(param_grid=param_grid,
                                      param_grid_name='param_grid_example',
                                      score_weights=[-1])  # minimization

        See also the docstring of elm.pipeline.ensemble, where most of the
        other arguments are interpreted similarly.

    ''' + ensemble.__doc__
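    # NB: the '+' above turns the docstring into an expression evaluated on
    # each call, so the concatenated text is not stored as __doc__.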
    models_share_sample = True
    method_kwargs = method_kwargs or {}
    scoring_kwargs = scoring_kwargs or {}
    get_func = _find_get_func_for_client(client)
    control = evo_params.deap_params['control']
    required_args, _, _ = get_args_kwargs_defaults(ea_general)
    evo_args = [
        evo_params,
    ]
    data_source = dict(X=X,
                       y=y,
                       sample_weight=sample_weight,
                       sampler=sampler,
                       args_list=args_list,
                       **data_source)
    fit_one_generation = partial(_on_each_generation, pipe, data_source,
                                 evo_params.deap_params, get_func,
                                 partial_fit_batches, method, method_kwargs)

    dsk = make_samples_dask(X, y, sample_weight, pipe, args_list, sampler,
                            data_source)
    sample_keys = list(dsk)
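    # models_share_sample is fixed to True above: shuffle the sample keys so
    # each generation fits a single sample rather than every sample.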
    if models_share_sample:
        np.random.shuffle(sample_keys)
        gen_to_sample_key = lambda gen: [sample_keys[gen]]
    else:
        gen_to_sample_key = lambda gen: sample_keys
    sample_keys = tuple(sample_keys)

    try:
        param_history = []
        for a in required_args[1:]:
            if a not in control:
                raise ValueError('Expected {} in {} (control kwargs '
                                 'to evolutionary '
                                 'algorithm)'.format(a, control))
            evo_args.append(control[a])
        ea_gen = ea_general(*evo_args)
        pop, _, _ = next(ea_gen)
        sample_keys_passed = gen_to_sample_key(0)

        def log_once(len_models, sample_keys_passed, gen):
            total_calls = len_models * len(
                sample_keys_passed) * partial_fit_batches
            msg = (len_models, len(sample_keys_passed), partial_fit_batches,
                   method, gen, total_calls)
            fmt = 'Evolve generation {4}: {0} models x {1} samples x {2} {3} calls = {5} calls in total'
            logger.info(fmt.format(*msg))

        log_once(len(pop), sample_keys_passed, 0)
        pop_names = [ind.name for ind in pop]
        models, fitnesses = fit_one_generation(dsk, 0, sample_keys_passed, pop)
        assign_check_fitness(pop, fitnesses, param_history,
                             evo_params.deap_params['choices'],
                             evo_params.score_weights)
        invalid_ind = True
        fitted_models = {n: m for n, (_, m) in zip(pop_names, models)}
        ngen = control.get('ngen') or None
        if not ngen and not evo_params.early_stop:
            raise ValueError('param_grids: pg_name: control: has neither '
                             'ngen nor early_stop keys')
        elif not ngen:
            ngen = 1000000
        for gen in range(ngen):
            # on last generation invalid_ind becomes None
            # and breaks this loop
            if models_share_sample:
                sample_keys_passed = gen_to_sample_key(gen % len(sample_keys))
            else:
                sample_keys_passed = sample_keys

            if gen > 0:
                log_once(len(invalid_ind), sample_keys_passed, gen)
                names = [ind.name for ind in invalid_ind]
                models, fitnesses = fit_one_generation(dsk, gen,
                                                       sample_keys_passed,
                                                       invalid_ind)
                fitted_models.update(
                    {n: m
                     for n, (_, m) in zip(names, models)})
            (pop, invalid_ind, param_history) = ea_gen.send(fitnesses)
            pop_names = [ind.name for ind in pop]
            fitted_models = {
                k: v
                for k, v in fitted_models.items() if k in pop_names
            }
            if not invalid_ind:
                break  # If there are no new solutions to try, break
        pop = evo_params.toolbox.select(pop, saved_ensemble_size)
        pop_names = [ind.name for ind in pop]
        models = [(k, v) for k, v in fitted_models.items() if k in pop_names]

    finally:
        columns = list(evo_params.deap_params['param_order'])
        columns += [
            'objective_{}_{}'.format(idx, 'min' if sw == -1 else 'max')
            for idx, sw in enumerate(evo_params.score_weights)
        ]
        if param_history:
            assert len(columns) == len(param_history[0])
            param_history = pd.DataFrame(np.array(param_history),
                                         columns=columns)
            param_history.to_csv(evo_params.history_file,
                                 index_label='parameter_set')
    return models
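
A minimal driving sketch (the param_grid contents below are hypothetical and
depend on the Pipeline's steps and on the control keys ea_general requires;
the ea_setup call mirrors the docstring above):

from elm.model_selection import ea_setup

param_grid = {'kmeans__n_clusters': list(range(3, 10)),  # hypothetical step param
              'control': {'ngen': 3}}                    # hypothetical control keys
evo_params = ea_setup(param_grid=param_grid,
                      param_grid_name='param_grid_example',
                      score_weights=[-1])  # minimize a single objective
best_models = evolve_train(pipe, evo_params, X=X, y=y,
                           saved_ensemble_size=2)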
Example #3
def ensemble(pipe,
             ngen,
             X=None,
             y=None,
             sample_weight=None,
             sampler=None,
             args_list=None,
             client=None,
             init_ensemble_size=1,
             saved_ensemble_size=None,
             ensemble_init_func=None,
             models_share_sample=True,
             model_selection=None,
             model_selection_kwargs=None,
             scoring_kwargs=None,
             method='fit',
             partial_fit_batches=1,
             classes=None,
             method_kwargs=None,
             **data_source):
    '''Fit or partial_fit an ensemble of models to a series of samples

    Call this function from an elm.pipeline.Pipeline instance's methods:
        "fit_ensemble"
        "fit_transform_ensemble"
        "transform_ensemble"

    Parameters:
        pipe: instance of elm.pipeline.Pipeline
        ngen: number of ensemble generations
        X:    earthio.ElmStore, if not using "sampler" and "args_list"
        y:    numpy array if not using "sampler" and "args_list", or None
              if not needed by Pipeline
        sample_weight: numpy array if not using "sampler" and "args_list", or None
              if not needed by Pipeline
        sampler: Callable - required if not giving X.  Called at least once
                per element of args_list, with each element unpacked as
                positional arguments (sampler(*element))
        args_list: List of argument tuples - required if not giving X.  See
                sampler above
        client: dask-distributed or ThreadPool client
        init_ensemble_size: number of ensemble members, ignored if giving
              ensemble_init_func
        saved_ensemble_size: how many members to keep at final generation
        ensemble_init_func: Callable to return list of elm.pipeline.Pipeline
             instances that initialize ensemble
        models_share_sample: If True, ensure that in each generation, every
             member is fit to the same sample.  If False, fit every model
             to every sample
        model_selection: Callable run after each generation that takes a
            list of (tag, Pipeline) tuples and returns a list of new such
            tuples, or None to repeatedly train each model on each generation
            without replacement of the model parameters
        model_selection_kwargs: kwargs passed to model_selection
        scoring_kwargs: kwargs that are passed to score_one_model.
                See also elm.model_selection.scoring
        method: This is the method of Pipeline that called this ensemble
            function, typically "fit"
        classes: Unique sequence of class integers passed to supervised
            classifiers that need the known y classes.
        method_kwargs: any other arguments to pass to method
        **data_source: keywords passed to "sampler" if given
    Returns:

        new_models: list of (tag, Pipeline instance) tuples on which
            "predict_many" can be called
    '''
    get_func = _find_get_func_for_client(client)
    fit_score_kwargs = method_kwargs or {}
    if 'classes' not in fit_score_kwargs and classes is not None:
        fit_score_kwargs['classes'] = classes
    model_selection_kwargs = model_selection_kwargs or {}
    ensemble_size = init_ensemble_size or 1
    partial_fit_batches = partial_fit_batches or 1
    if partial_fit_batches > 1:
        method = 'partial_fit'
    if not ensemble_init_func:
        models = tuple(copy.deepcopy(pipe) for _ in range(ensemble_size))
    else:
        ensemble_init_func = import_callable(ensemble_init_func)
        models = ensemble_init_func(pipe, ensemble_size=ensemble_size)
    logger.info("Init ensemble: {} members".format(len(models)))
    if model_selection:
        model_selection = import_callable(model_selection)
    final_names = []
    dsk = make_samples_dask(X, y, sample_weight, pipe, args_list, sampler,
                            data_source)
    models = tuple(
        zip(('tag_{}'.format(idx) for idx in range(len(models))), models))
    sample_keys = list(dsk)
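    # When members share a sample, pre-assign one shuffled sample key per
    # generation so every model in generation `gen` is fit to the same data.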
    if models_share_sample:
        random.shuffle(sample_keys)
        gen_to_sample_key = dict(enumerate(sample_keys[:ngen]))
    sample_keys = tuple(sample_keys)
    for gen in range(ngen):
        if models_share_sample:
            sample_keys_passed = (gen_to_sample_key[gen % len(sample_keys)], )
        else:
            sample_keys_passed = sample_keys
        logger.info('Ensemble generation {} of {} - ({} estimators) '.format(
            gen + 1, ngen, len(models)))
        msg = (len(models), len(sample_keys_passed), partial_fit_batches,
               method,
               len(models) * len(sample_keys_passed) * partial_fit_batches,
               gen + 1, ngen)
        logger.info(
            'Ensemble Generation {5} of {6}: ({0} members x {1} samples x {2} calls) = {4} {3} calls this gen'
            .format(*msg))
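        # Extend the shared graph with this generation's fit/score tasks,
        # then execute them with the scheduler matched to `client` above.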
        dsk, model_keys, new_models_name = _one_generation_dask_graph(
            dsk, models, fit_score_kwargs, sample_keys_passed,
            partial_fit_batches, gen, method)
        if get_func is None:
            new_models = tuple(dask.get(dsk, new_models_name))
        else:
            new_models = tuple(get_func(dsk, new_models_name))
        models = tuple(zip(model_keys, new_models))
        logger.info('Trained {} estimators'.format(len(models)))
        if model_selection:
            models = _run_model_selection(models, model_selection,
                                          model_selection_kwargs or {}, ngen,
                                          gen, scoring_kwargs)
        # else: keep training all ensemble members without replacing,
        # re-initializing, or editing the model params
    if saved_ensemble_size:
        final_models = models[:saved_ensemble_size]
    else:
        final_models = models

    return final_models
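
A minimal calling sketch (assumes `pipe` is an elm.pipeline.Pipeline and that
X/y fit in memory; in practice this usually runs via Pipeline.fit_ensemble
rather than being called directly):

trained = ensemble(pipe,
                   ngen=3,                 # three ensemble generations
                   X=X, y=y,               # single in-memory sample
                   init_ensemble_size=4,   # start from four copies of pipe
                   saved_ensemble_size=2)  # keep the first two members
# `trained` is a tuple of (tag, Pipeline) pairs on which predict_many
# can be called.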