Example 1
def _make_stacked_ensemble_pipeline(input_pipelines, problem_type, n_jobs=-1, random_seed=0):
    """
    Creates a pipeline with a stacked ensemble estimator.

    Arguments:
        input_pipelines (list(PipelineBase or subclass obj)): List of pipeline instances to use as the base estimators for the stacked ensemble.
            This must not be None or an empty list or else EnsembleMissingPipelinesError will be raised.
        problem_type (ProblemType): Problem type of the pipeline.
        n_jobs (int or None): Integer describing level of parallelism used for pipelines.
            None and 1 are equivalent. If set to -1, all CPUs are used. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used.
            Defaults to -1.
        random_seed (int): Seed for the random number generator. Defaults to 0.

    Returns:
        Pipeline with appropriate stacked ensemble estimator.
    """
    if is_classification(problem_type):
        parameters = {"Stacked Ensemble Classifier": {"input_pipelines": input_pipelines, "n_jobs": n_jobs}}
        estimator = StackedEnsembleClassifier
    else:
        parameters = {"Stacked Ensemble Regressor": {"input_pipelines": input_pipelines, "n_jobs": n_jobs}}
        estimator = StackedEnsembleRegressor

    pipeline_class, pipeline_name = {
        ProblemTypes.BINARY: (BinaryClassificationPipeline, "Stacked Ensemble Classification Pipeline"),
        ProblemTypes.MULTICLASS: (MulticlassClassificationPipeline, "Stacked Ensemble Classification Pipeline"),
        ProblemTypes.REGRESSION: (RegressionPipeline, "Stacked Ensemble Regression Pipeline")}[problem_type]

    return pipeline_class([estimator], parameters=parameters,
                          custom_name=pipeline_name,
                          random_seed=random_seed)
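A minimal usage sketch (added for illustration, not part of the original source): it assumes evalml's BinaryClassificationPipeline and ProblemTypes are importable in the version these snippets come from, and the two base pipelines and their component names are purely illustrative.

from evalml.pipelines import BinaryClassificationPipeline
from evalml.problem_types import ProblemTypes

# Two simple base pipelines to stack; the component names are assumptions.
base_pipelines = [
    BinaryClassificationPipeline(["Imputer", "Random Forest Classifier"]),
    BinaryClassificationPipeline(["Imputer", "Logistic Regression Classifier"]),
]
ensemble_pipeline = _make_stacked_ensemble_pipeline(base_pipelines, ProblemTypes.BINARY)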
Example 2
def split_data(X, y, problem_type, problem_configuration=None, test_size=.2, random_seed=0):
    """Splits data into train and test sets.

    Arguments:
        X (ww.DataTable, pd.DataFrame or np.ndarray): data of shape [n_samples, n_features]
        y (ww.DataColumn, pd.Series, or np.ndarray): target data of length [n_samples]
        problem_type (str or ProblemTypes): Type of supervised learning problem. See evalml.problem_types.problemtype.all_problem_types for a full list.
        problem_configuration (dict): Additional parameters needed to configure the search. For example,
            in time series problems, values should be passed in for the date_index, gap, and max_delay variables.
        test_size (float): Fraction of data points to include in the test set. Defaults to 0.2 (20%).
        random_seed (int): Seed for the random number generator. Defaults to 0.

    Returns:
        ww.DataTable, ww.DataTable, ww.DataColumn, ww.DataColumn: Feature and target data each split into train and test sets
    """

    X = infer_feature_types(X)
    y = infer_feature_types(y)

    data_splitter = None
    if is_time_series(problem_type):
        data_splitter = TrainingValidationSplit(test_size=test_size, shuffle=False, stratify=None, random_seed=random_seed)
    elif is_regression(problem_type):
        data_splitter = ShuffleSplit(n_splits=1, test_size=test_size, random_state=random_seed)
    elif is_classification(problem_type):
        data_splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_seed)

    train, test = next(data_splitter.split(X.to_dataframe(), y.to_series()))

    X_train = X.iloc[train]
    X_test = X.iloc[test]
    y_train = y.iloc[train]
    y_test = y.iloc[test]

    return X_train, X_test, y_train, y_test
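A short usage sketch (added here, not from the original): it assumes split_data accepts plain pandas inputs, which infer_feature_types converts to woodwork types as shown above, and that the string "binary" resolves to a classification problem type.

import pandas as pd

X = pd.DataFrame({"feature": range(100)})
y = pd.Series([0, 1] * 50)
# "binary" is a classification problem, so a stratified shuffle split is used.
X_train, X_test, y_train, y_test = split_data(X, y, problem_type="binary", test_size=0.2)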
Example 3
def test_type_checks(problem_type):
    assert is_regression(problem_type) == (problem_type in [
        ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION
    ])
    assert is_binary(problem_type) == (problem_type in [
        ProblemTypes.BINARY, ProblemTypes.TIME_SERIES_BINARY
    ])
    assert is_multiclass(problem_type) == (problem_type in [
        ProblemTypes.MULTICLASS, ProblemTypes.TIME_SERIES_MULTICLASS
    ])
    assert is_classification(problem_type) == (problem_type in [
        ProblemTypes.BINARY, ProblemTypes.MULTICLASS,
        ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS
    ])
    assert is_time_series(problem_type) == (problem_type in [
        ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS,
        ProblemTypes.TIME_SERIES_REGRESSION
    ])
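In a test suite this check would typically be parametrized over every problem type; a hedged sketch of how the wiring might look (the decorator is an assumption, it is not shown in the original):

import pytest
from evalml.problem_types import ProblemTypes

@pytest.mark.parametrize("problem_type", list(ProblemTypes))
def test_type_checks(problem_type):
    ...  # body as in the example above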
Example 4
def make_pipeline(X, y, estimator, problem_type, parameters=None, custom_hyperparameters=None, sampler_name=None):
    """Given input data, target data, an estimator class and the problem type,
        generates a pipeline instance with a preprocessing chain recommended based on the inputs.
        The pipeline will be a subclass of the appropriate pipeline base class for the specified problem_type.

    Arguments:
        X (pd.DataFrame, ww.DataTable): The input data of shape [n_samples, n_features]
        y (pd.Series, ww.DataColumn): The target data of length [n_samples]
        estimator (Estimator): Estimator for pipeline
        problem_type (ProblemTypes or str): Problem type for pipeline to generate
        parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values.
            An empty dictionary or None implies using all default values for component parameters.
        custom_hyperparameters (dictionary): Dictionary of custom hyperparameters,
            with component name as key and dictionary of parameters as the value.
        sampler_name (str): The name of the sampler component to add to the pipeline. Only used in classification problems.
            Defaults to None.

    Returns:
        PipelineBase object: PipelineBase instance with dynamically generated preprocessing components and specified estimator

    """
    X = infer_feature_types(X)
    y = infer_feature_types(y)

    problem_type = handle_problem_types(problem_type)
    if estimator not in get_estimators(problem_type):
        raise ValueError(f"{estimator.name} is not a valid estimator for problem type")
    if not is_classification(problem_type) and sampler_name is not None:
        raise ValueError(f"Sampling is unsupported for problem_type {str(problem_type)}")
    preprocessing_components = _get_preprocessing_components(X, y, problem_type, estimator, sampler_name)
    complete_component_graph = preprocessing_components + [estimator]

    if custom_hyperparameters and not isinstance(custom_hyperparameters, dict):
        raise ValueError(f"if custom_hyperparameters provided, must be dictionary. Received {type(custom_hyperparameters)}")

    base_class = _get_pipeline_base_class(problem_type)
    return base_class(complete_component_graph, parameters=parameters, custom_hyperparameters=custom_hyperparameters)
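A hedged usage sketch (not from the original): it assumes RandomForestClassifier is importable from evalml.pipelines and that the internal _get_preprocessing_components helper picks imputation and encoding components appropriate for the toy data below.

import pandas as pd
from evalml.pipelines import RandomForestClassifier

X = pd.DataFrame({"numeric": range(20), "categorical": ["a", "b"] * 10})
y = pd.Series([0, 1] * 10)

# Pass the estimator class (not an instance); "binary" is resolved by handle_problem_types.
pipeline = make_pipeline(X, y, RandomForestClassifier, "binary")
pipeline.fit(X, y)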
Example 5
    def train_and_score_pipeline(pipeline, automl, full_X_train, full_y_train):
        """Given a pipeline, config and data, train and score the pipeline and return the CV or TV scores

        Arguments:
            pipeline (PipelineBase): The pipeline to score
            automl (AutoMLSearch): The AutoML search, used to access config and for the error callback
            full_X_train (ww.DataTable): Training features
            full_y_train (ww.DataColumn): Training target

        Returns:
            dict: A dict containing cv_score_mean, cv_scores, training_time and a cv_data structure with details.
        """
        start = time.time()
        cv_data = []
        logger.info("\tStarting cross validation")
        X_pd = _convert_woodwork_types_wrapper(full_X_train.to_dataframe())
        y_pd = _convert_woodwork_types_wrapper(full_y_train.to_series())
        y_pd_encoded = y_pd
        # Encode target for classification problems so that we can support float targets. This is okay because we only use split to get the indices to split on
        if is_classification(automl.problem_type):
            y_mapping = {original_target: encoded_target
                         for (encoded_target, original_target) in enumerate(y_pd.value_counts().index)}
            y_pd_encoded = y_pd.map(y_mapping)
        for i, (train, valid) in enumerate(automl.data_splitter.split(X_pd, y_pd_encoded)):
            if pipeline.model_family == ModelFamily.ENSEMBLE and i > 0:
                # Stacked ensembles do CV internally, so we do not run CV here for performance reasons.
                logger.debug(
                    f"Skipping fold {i} because CV for stacked ensembles is not supported."
                )
                break
            logger.debug(f"\t\tTraining and scoring on fold {i}")
            X_train, X_valid = full_X_train.iloc[train], full_X_train.iloc[valid]
            y_train, y_valid = full_y_train.iloc[train], full_y_train.iloc[valid]
            if is_binary(automl.problem_type) or is_multiclass(automl.problem_type):
                diff_train = set(np.setdiff1d(full_y_train.to_series(), y_train.to_series()))
                diff_valid = set(np.setdiff1d(full_y_train.to_series(), y_valid.to_series()))
                diff_string = f"Missing target values in the training set after data split: {diff_train}. " if diff_train else ""
                diff_string += f"Missing target values in the validation set after data split: {diff_valid}." if diff_valid else ""
                if diff_string:
                    raise Exception(diff_string)
            objectives_to_score = [automl.objective] + automl.additional_objectives
            cv_pipeline = None
            try:
                logger.debug(f"\t\t\tFold {i}: starting training")
                cv_pipeline = EngineBase.train_pipeline(
                    pipeline, X_train, y_train, automl.optimize_thresholds, automl.objective)
                logger.debug(f"\t\t\tFold {i}: finished training")
                if (automl.optimize_thresholds
                        and pipeline.can_tune_threshold_with_objective(automl.objective)
                        and automl.objective.can_optimize_threshold):
                    logger.debug(f"\t\t\tFold {i}: Optimal threshold found ({cv_pipeline.threshold:.3f})")
                logger.debug(f"\t\t\tFold {i}: Scoring trained pipeline")
                scores = cv_pipeline.score(X_valid, y_valid, objectives=objectives_to_score)
                logger.debug(f"\t\t\tFold {i}: {automl.objective.name} score: {scores[automl.objective.name]:.3f}")
                score = scores[automl.objective.name]
            except Exception as e:
                if automl.error_callback is not None:
                    automl.error_callback(
                        exception=e, traceback=traceback.format_tb(sys.exc_info()[2]),
                        automl=automl, fold_num=i, pipeline=pipeline)
                if isinstance(e, PipelineScoreError):
                    nan_scores = {objective: np.nan for objective in e.exceptions}
                    scores = {**nan_scores, **e.scored_successfully}
                    scores = OrderedDict({
                        o.name: scores[o.name]
                        for o in [automl.objective] + automl.additional_objectives
                    })
                    score = scores[automl.objective.name]
                else:
                    score = np.nan
                    scores = OrderedDict(
                        zip([n.name for n in automl.additional_objectives],
                            [np.nan] * len(automl.additional_objectives)))

            ordered_scores = OrderedDict()
            ordered_scores.update({automl.objective.name: score})
            ordered_scores.update(scores)
            ordered_scores.update({"# Training": y_train.shape[0]})
            ordered_scores.update({"# Validation": y_valid.shape[0]})

            evaluation_entry = {
                "all_objective_scores": ordered_scores,
                "score": score,
                'binary_classification_threshold': None
            }
            if is_binary(automl.problem_type) and cv_pipeline is not None and cv_pipeline.threshold is not None:
                evaluation_entry['binary_classification_threshold'] = cv_pipeline.threshold
            cv_data.append(evaluation_entry)
        training_time = time.time() - start
        cv_scores = pd.Series([fold['score'] for fold in cv_data])
        cv_score_mean = cv_scores.mean()
        logger.info(f"\tFinished cross validation - mean {automl.objective.name}: {cv_score_mean:.3f}")
        return {
            'cv_data': cv_data,
            'training_time': training_time,
            'cv_scores': cv_scores,
            'cv_score_mean': cv_score_mean
        }
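The target-encoding step used before splitting can be illustrated in isolation; this standalone sketch (added, not from the original) shows how the value_counts ordering yields the integer codes that the data splitter sees:

import pandas as pd

y = pd.Series([2.5, 2.5, 1.0, 2.5, 3.7])
# The most frequent value gets code 0, the next gets 1, and so on;
# ties in frequency may be ordered arbitrarily.
y_mapping = {original: code for code, original in enumerate(y.value_counts().index)}
y_encoded = y.map(y_mapping)  # e.g. {2.5: 0, 1.0: 1, 3.7: 2} -> [0, 0, 1, 0, 2]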