Beispiel #1
0
def graph_prediction_vs_actual(y_true, y_pred, outlier_threshold=None):
    """Generate a scatter plot comparing the true and predicted values. Used for regression plotting.

    Arguments:
        y_true (ww.DataColumn, pd.Series): The real target values of the data
        y_pred (ww.DataColumn, pd.Series): The predicted values outputted by the regression model.
        outlier_threshold (int, float): A positive threshold for what is considered an outlier value. This value is compared to the absolute difference
                                 between each value of y_true and y_pred. Values within this threshold will be blue, otherwise they will be yellow.
                                 Defaults to None

    Returns:
        plotly.Figure representing the predicted vs. actual values graph

    Raises:
        ValueError: If outlier_threshold is provided and is not strictly positive (zero included).
    """
    # Validate before any dependency is imported. The check uses `is not None`
    # rather than truthiness so that a threshold of 0 is rejected instead of
    # silently being treated as "no threshold".
    has_threshold = outlier_threshold is not None
    if has_threshold and outlier_threshold <= 0:
        raise ValueError(
            f"Threshold must be positive! Provided threshold is {outlier_threshold}"
        )

    _go = import_or_raise(
        "plotly.graph_objects",
        error_msg="Cannot find dependency plotly.graph_objects")
    if jupyter_check():
        import_or_raise("ipywidgets", warning=True)

    df = get_prediction_vs_actual_data(y_true, y_pred, outlier_threshold)

    # Both axes share one range so the y = x reference line is a true diagonal.
    x_axis = _calculate_axis_range(df['prediction'])
    y_axis = _calculate_axis_range(df['actual'])
    x_y_line = [min(x_axis[0], y_axis[0]), max(x_axis[1], y_axis[1])]
    data = [
        _go.Scatter(x=x_y_line,
                    y=x_y_line,
                    name="y = x line",
                    line_color='grey')
    ]

    title = 'Predicted vs Actual Values Scatter Plot'
    layout = _go.Layout(title={'text': title},
                        xaxis={
                            'title': 'Prediction',
                            'range': x_y_line
                        },
                        yaxis={
                            'title': 'Actual',
                            'range': x_y_line
                        })

    # The 'outlier' column holds the marker color of each point, so grouping
    # on it yields one trace per color.
    for color, outlier_group in df.groupby('outlier'):
        if has_threshold:
            name = "< outlier_threshold" if color == "#0000ff" else ">= outlier_threshold"
        else:
            name = "Values"
        data.append(
            _go.Scatter(x=outlier_group['prediction'],
                        y=outlier_group['actual'],
                        mode='markers',
                        marker=_go.scatter.Marker(color=color),
                        name=name))
    return _go.Figure(layout=layout, data=data)
Beispiel #2
0
    def __init__(self, degree=1, random_seed=0, **kwargs):
        """Set up a polynomial detrender backed by sktime.

        Arguments:
            degree (int): Degree for the polynomial. If 1, linear model is fit to the data.
                If 2, quadratic model is fit, etc. A float holding a whole number
                (e.g. 2.0) is accepted and converted to int. Default of 1.
            random_seed (int): Seed for the random number generator. Defaults to 0.
            **kwargs: Extra entries merged into the component's parameters.

        Raises:
            TypeError: If degree is neither an int nor a whole-number float.
        """
        whole_number_float = isinstance(degree, float) and degree.is_integer()
        if whole_number_float:
            degree = int(degree)
        elif not isinstance(degree, int):
            raise TypeError(
                f"Parameter Degree must be an integer!: Received {type(degree).__name__}"
            )

        params = {"degree": degree, **kwargs}

        error_msg = "sktime is not installed. Please install using 'pip install sktime'"
        trend = import_or_raise("sktime.forecasting.trend",
                                error_msg=error_msg)
        detrend = import_or_raise("sktime.transformations.series.detrend",
                                  error_msg=error_msg)

        # The detrender removes the trend predicted by a polynomial forecaster.
        forecaster = trend.PolynomialTrendForecaster(degree=degree)
        detrender = detrend.Detrender(forecaster)

        super().__init__(parameters=params,
                         component_obj=detrender,
                         random_seed=random_seed)
Beispiel #3
0
def graph_permutation_importance(pipeline,
                                 X,
                                 y,
                                 objective,
                                 importance_threshold=0):
    """Generate a bar graph of the pipeline's permutation importance.

    Arguments:
        pipeline (PipelineBase or subclass): Fitted pipeline
        X (ww.DataTable, pd.DataFrame): The input data used to score and compute permutation importance
        y (ww.DataColumn, pd.Series): The target data
        objective (str, ObjectiveBase): Objective to score on
        importance_threshold (float, optional): If provided, graph features with a permutation importance whose absolute value is greater than or equal to importance_threshold. Defaults to zero.

    Returns:
        plotly.Figure, a bar graph showing features and their respective permutation importance.

    Raises:
        ValueError: If importance_threshold is negative.
    """
    # Reject a bad threshold before importing plotting dependencies or running
    # the permutation importance computation.
    if importance_threshold < 0:
        raise ValueError(
            f'Provided importance threshold of {importance_threshold} must be greater than or equal to 0'
        )

    go = import_or_raise(
        "plotly.graph_objects",
        error_msg="Cannot find dependency plotly.graph_objects")
    if jupyter_check():
        import_or_raise("ipywidgets", warning=True)

    perm_importance = calculate_permutation_importance(pipeline, X, y,
                                                       objective)

    # Keep only features whose absolute importance reaches the threshold.
    # The sign is preserved in the plotted values.
    perm_importance = perm_importance[
        abs(perm_importance['importance']) >= importance_threshold]
    # List is reversed to go from ascending order to descending order
    perm_importance = perm_importance.iloc[::-1]

    title = "Permutation Importance"
    subtitle = "The relative importance of each input feature's "\
               "overall influence on the pipelines' predictions, computed using "\
               "the permutation importance algorithm."
    data = [
        go.Bar(x=perm_importance['importance'],
               y=perm_importance['feature'],
               orientation='h')
    ]

    layout = {
        'title': '{0}<br><sub>{1}</sub>'.format(title, subtitle),
        'height': 800,
        'xaxis_title': 'Permutation Importance',
        'yaxis_title': 'Feature',
        'yaxis': {
            'type': 'category'
        }
    }

    return go.Figure(data=data, layout=layout)
Beispiel #4
0
    def graph_feature_importance(self, importance_threshold=0):
        """Generate a bar graph of the pipeline's feature importance

        Arguments:
            importance_threshold (float, optional): If provided, graph features with a feature importance whose absolute value is greater than or equal to importance_threshold. Defaults to zero.

        Returns:
            plotly.Figure, a bar graph showing features and their corresponding importance

        Raises:
            ValueError: If importance_threshold is negative.
        """
        # Reject a bad threshold before importing plotting dependencies.
        if importance_threshold < 0:
            raise ValueError(
                f'Provided importance threshold of {importance_threshold} must be greater than or equal to 0'
            )

        go = import_or_raise(
            "plotly.graph_objects",
            error_msg="Cannot find dependency plotly.graph_objects")
        if jupyter_check():
            import_or_raise("ipywidgets", warning=True)

        # `assign` returns a new DataFrame, so the object handed back by
        # `self.feature_importance` is never written to (it may be shared or
        # cached by the property - not visible from here).
        feat_imp = self.feature_importance
        feat_imp = feat_imp.assign(importance=feat_imp['importance'].abs())

        # Remove features with importance whose absolute value is less than importance threshold
        feat_imp = feat_imp[feat_imp['importance'] >= importance_threshold]

        # List is reversed to go from ascending order to descending order
        feat_imp = feat_imp.iloc[::-1]

        title = 'Feature Importance'
        subtitle = 'May display fewer features due to feature selection'
        data = [
            go.Bar(x=feat_imp['importance'],
                   y=feat_imp['feature'],
                   orientation='h')
        ]

        layout = {
            'title': '{0}<br><sub>{1}</sub>'.format(title, subtitle),
            'height': 800,
            'xaxis_title': 'Feature Importance',
            'yaxis_title': 'Feature',
            'yaxis': {
                'type': 'category'
            }
        }

        return go.Figure(data=data, layout=layout)
Beispiel #5
0
    def __init__(self,
                 n_estimators=10,
                 eta=0.03,
                 max_depth=6,
                 bootstrap_type=None,
                 silent=False,
                 allow_writing_files=False,
                 random_state=0,
                 **kwargs):
        """Configure the component around a ``catboost.CatBoostRegressor``.

        Arguments:
            n_estimators, eta, max_depth, silent, allow_writing_files: Forwarded
                unchanged to ``CatBoostRegressor`` and recorded as parameters.
            bootstrap_type: Forwarded to ``CatBoostRegressor`` only when it is
                not None. Defaults to None.
            random_state: Converted to an integer seed with ``get_random_seed``
                for catboost; the original value goes to the parent
                initializer. Defaults to 0.
            **kwargs: Extra entries merged into the parameters and forwarded
                to ``CatBoostRegressor``.
        """
        random_seed = get_random_seed(random_state, self.SEED_MIN,
                                      self.SEED_MAX)
        parameters = {
            "n_estimators": n_estimators,
            "eta": eta,
            "max_depth": max_depth,
            'bootstrap_type': bootstrap_type,
            'silent': silent,
            'allow_writing_files': allow_writing_files,
            **kwargs
        }

        catboost = import_or_raise(
            "catboost",
            error_msg="catboost is not installed. Please install using `pip install catboost.`")

        # catboost will choose an intelligent default for bootstrap_type, so
        # the key is left out of the estimator arguments unless a value was given.
        drop_bootstrap = bootstrap_type is None
        cb_parameters = {
            key: value
            for key, value in parameters.items()
            if not (drop_bootstrap and key == 'bootstrap_type')
        }
        cb_regressor = catboost.CatBoostRegressor(**cb_parameters,
                                                  random_seed=random_seed)
        super().__init__(parameters=parameters,
                         component_obj=cb_regressor,
                         random_state=random_state)
Beispiel #6
0
    def fit(self, X, y=None):
        """Fit a statsmodels ARIMA model and store the fitted result.

        Arguments:
            X: Feature data; passed to ARIMA as ``exog`` when it is not None
                after index matching.
            y: Target data; required, passed to ARIMA as ``endog``.

        Returns:
            self

        Raises:
            ValueError: If y is None.
        """
        if y is None:
            raise ValueError('ARIMA Regressor requires y as input.')

        arima = import_or_raise(
            "statsmodels.tsa.arima.model",
            error_msg="ARIMA is not installed. Please install using `pip install statsmodels`.")

        X, y = self._manage_woodwork(X, y)
        dates = self._get_dates_fit(X, y)
        X, y = self._match_indices(X, y, dates)

        # Every stored parameter except the individual p/d/q entries is
        # forwarded to the ARIMA constructor.
        skipped_keys = ('p', 'd', 'q')
        model_kwargs = {
            key: val
            for key, val in self.parameters.items()
            if key not in skipped_keys
        }

        data_kwargs = {'endog': y, 'dates': dates}
        if X is not None:
            data_kwargs['exog'] = X

        self._component_obj = arima.ARIMA(**data_kwargs, **model_kwargs).fit()
        return self
Beispiel #7
0
    def __init__(self, boosting_type="gbdt", learning_rate=0.1, n_estimators=20, max_depth=0, num_leaves=31,
                 min_child_samples=20, n_jobs=-1, random_seed=0,
                 bagging_fraction=0.9, bagging_freq=0, **kwargs):
        """Configure the component around a ``lightgbm.sklearn.LGBMRegressor``.

        The named arguments and any ``**kwargs`` are recorded as the
        component's parameters. The estimator receives a copy of them,
        adjusted for the selected boosting type, with ``random_seed`` passed
        as its ``random_state``.
        """
        parameters = {
            "boosting_type": boosting_type,
            "learning_rate": learning_rate,
            "n_estimators": n_estimators,
            "max_depth": max_depth,
            "num_leaves": num_leaves,
            "min_child_samples": min_child_samples,
            "n_jobs": n_jobs,
            "bagging_freq": bagging_freq,
            "bagging_fraction": bagging_fraction,
            **kwargs
        }

        # The estimator gets its own copy so the recorded parameters keep the
        # values the caller supplied.
        lg_parameters = dict(parameters)
        if boosting_type == "rf":
            # when boosting type is random forest (rf), LightGBM requires bagging_freq == 1 and  0 < bagging_fraction < 1.0
            lg_parameters['bagging_freq'] = 1
        elif boosting_type == "goss":
            # when boosting type is goss, LightGBM requires bagging_fraction == 1
            lg_parameters['bagging_fraction'] = 1

        # avoid lightgbm warnings having to do with parameter aliases
        no_bagging_settings = (lg_parameters['bagging_freq'] is None
                               and lg_parameters['bagging_fraction'] is None)
        if not no_bagging_settings:
            lg_parameters['subsample'] = None
            lg_parameters['subsample_freq'] = None

        lgbm = import_or_raise(
            "lightgbm",
            error_msg="LightGBM is not installed. Please install using `pip install lightgbm`.")
        self._ordinal_encoder = None

        lgbm_regressor = lgbm.sklearn.LGBMRegressor(random_state=random_seed, **lg_parameters)

        super().__init__(parameters=parameters,
                         component_obj=lgbm_regressor,
                         random_seed=random_seed)
    def __init__(self,
                 n_estimators=10,
                 eta=0.03,
                 max_depth=6,
                 bootstrap_type=None,
                 silent=True,
                 allow_writing_files=False,
                 random_state=None,
                 random_seed=0,
                 **kwargs):
        """Configure the component around a ``catboost.CatBoostClassifier``.

        Arguments:
            n_estimators, eta, max_depth, silent, allow_writing_files: Forwarded
                unchanged to ``CatBoostClassifier`` and recorded as parameters.
            bootstrap_type: Forwarded to ``CatBoostClassifier`` only when it is
                not None. Defaults to None.
            random_state: Deprecated spelling of ``random_seed``; resolved
                through ``deprecate_arg``. Defaults to None.
            random_seed (int): Seed handed to catboost and to the parent
                initializer. Defaults to 0.
            **kwargs: Extra entries merged into the parameters and forwarded
                to ``CatBoostClassifier``.
        """
        random_seed = deprecate_arg("random_state", "random_seed",
                                    random_state, random_seed)
        parameters = {
            "n_estimators": n_estimators,
            "eta": eta,
            "max_depth": max_depth,
            'bootstrap_type': bootstrap_type,
            'silent': silent,
            'allow_writing_files': allow_writing_files,
            **kwargs
        }

        catboost = import_or_raise(
            "catboost",
            error_msg="catboost is not installed. Please install using `pip install catboost.`")
        self._label_encoder = None

        # catboost will choose an intelligent default for bootstrap_type, so
        # the key is left out of the estimator arguments unless a value was given.
        drop_bootstrap = bootstrap_type is None
        cb_parameters = {
            key: value
            for key, value in parameters.items()
            if not (drop_bootstrap and key == 'bootstrap_type')
        }
        cb_classifier = catboost.CatBoostClassifier(**cb_parameters,
                                                    random_seed=random_seed)
        super().__init__(parameters=parameters,
                         component_obj=cb_classifier,
                         random_seed=random_seed)
Beispiel #9
0
    def __init__(self,
                 sampler,
                 sampling_ratio=0.25,
                 k_neighbors=5,
                 n_jobs=-1,
                 random_seed=0,
                 **kwargs):
        """Initializes the oversampler component.

        Arguments:
            sampler (str): Name of the imblearn oversampler class to use; one of
                "SMOTE", "SMOTENC" or "SMOTEN". Any other value raises KeyError.
            sampling_ratio (float): This is the goal ratio of the minority to majority class, with range (0, 1]. A value of 0.25 means we want a 1:4 ratio
                of the minority to majority class after oversampling. We will create a sampling dictionary using this ratio, with the keys corresponding to the class
                and the values corresponding to the number of samples. Defaults to 0.25.
            k_neighbors (int): The number of nearest neighbors used to construct synthetic samples. Defaults to 5.
            n_jobs (int): The number of CPU cores to use. Defaults to -1.
            random_seed (int): Passed to the parent initializer. Defaults to 0.
            **kwargs: Extra entries merged into the component's parameters.
        """
        im = import_or_raise(
            "imblearn.over_sampling",
            error_msg="imbalanced-learn is not installed. Please install using 'pip install imbalanced-learn'")

        parameters = {
            "sampling_ratio": sampling_ratio,
            "k_neighbors": k_neighbors,
            "n_jobs": n_jobs,
            **kwargs
        }

        # Only the class is stored here; no sampler instance is created in
        # this initializer.
        available_samplers = {
            "SMOTE": im.SMOTE,
            "SMOTENC": im.SMOTENC,
            "SMOTEN": im.SMOTEN
        }
        self.sampler = available_samplers[sampler]

        super().__init__(parameters=parameters,
                         component_obj=None,
                         random_seed=random_seed)
Beispiel #10
0
    def __init__(self, data):
        """Make plots for the AutoMLSearch class.

        Arguments:
            data (AutoMLSearch): Automated pipeline search object
        """
        # plotly is resolved at construction time and kept on the instance as
        # `_go` next to the search object.
        plotly_go = import_or_raise(
            "plotly.graph_objects",
            error_msg="Cannot find dependency plotly.graph_objects")
        self._go = plotly_go
        self.data = data
Beispiel #11
0
    def __init__(self, data, show_plot=True):
        """Build the figure widget that plots search scores per iteration.

        Arguments:
            data: Search object; its ``objective.name`` is used in the title.
            show_plot (bool): Not read anywhere in this initializer. Defaults to True.
        """
        plotly_go = import_or_raise(
            "plotly.graph_objects",
            error_msg="Cannot find dependency plotly.graph_objects")
        self._go = plotly_go

        if jupyter_check():
            import_or_raise("ipywidgets", warning=True)

        self.data = data
        self.best_score_by_iter_fig = None
        self.curr_iteration_scores = []
        self.best_iteration_scores = []

        objective_name = self.data.objective.name
        title = 'Pipeline Search: Iteration vs. {}<br><sub>Gray marker indicates the score at current iteration</sub>'.format(
            objective_name)

        # Both traces start empty; the layout labels them as the best score
        # and the score of the current iteration.
        best_score_trace = self._go.Scatter(x=[],
                                            y=[],
                                            mode='lines+markers',
                                            name='Best Score')
        iter_score_trace = self._go.Scatter(x=[],
                                            y=[],
                                            mode='markers',
                                            name='Iter score',
                                            marker={'color': 'gray'})
        traces = [best_score_trace, iter_score_trace]

        x_axis = {'title': 'Iteration', 'rangemode': 'tozero'}
        y_axis = {'title': 'Score'}
        layout = {'title': title, 'xaxis': x_axis, 'yaxis': y_axis}

        self.best_score_by_iter_fig = self._go.FigureWidget(traces, layout)
        self.best_score_by_iter_fig.update_layout(showlegend=False)
        self.update()
Beispiel #12
0
    def graph(self, filepath=None):
        """Generate an image representing the pipeline graph

        Arguments:
            filepath (str, optional): Path to where the graph should be saved. If set to None (as by default), the graph will not be saved.

        Returns:
            graphviz.Digraph: Graph object that can be directly displayed in Jupyter notebooks.

        Raises:
            RuntimeError: If the graphviz backend executable cannot be found.
            ValueError: If filepath has an unsupported extension or is not writeable.
        """
        graphviz = import_or_raise(
            'graphviz',
            error_msg='Please install graphviz to visualize pipelines.')

        # Try rendering a dummy graph to see if a working backend is installed
        try:
            graphviz.Digraph().pipe()
        except graphviz.backend.ExecutableNotFound as err:
            raise RuntimeError(
                "To graph entity sets, a graphviz backend is required.\n" +
                "Install the backend using one of the following commands:\n" +
                "  Mac OS: brew install graphviz\n" +
                "  Linux (Ubuntu): sudo apt-get install graphviz\n" +
                "  Windows: conda install python-graphviz\n") from err

        graph_format = None
        path_and_name = None
        if filepath:
            # Explicitly cast to str in case a Path object was passed in
            filepath = str(filepath)

            # Validate the extension before touching the filesystem, so an
            # unsupported format does not leave an empty file behind.
            path_and_name, extension = os.path.splitext(filepath)
            graph_format = extension[1:].lower()  # ignore the dot
            supported_filetypes = graphviz.backend.FORMATS
            if graph_format not in supported_filetypes:
                raise ValueError((
                    "Unknown format '{}'. Make sure your format is one of the "
                    + "following: {}").format(graph_format,
                                              supported_filetypes))

            # Probe writability by opening the target for writing; the context
            # manager guarantees the handle is closed.
            try:
                with open(filepath, 'w'):
                    pass
            except OSError as err:
                raise ValueError(
                    ('Specified filepath is not writeable: {}'.format(filepath)
                     )) from err

        graph = self._component_graph.graph(path_and_name, graph_format)

        if filepath:
            graph.render(path_and_name, cleanup=True)

        return graph
Beispiel #13
0
    def __init__(self,
                 cols=None,
                 smoothing=1.0,
                 handle_unknown='value',
                 handle_missing='value',
                 random_seed=0,
                 **kwargs):
        """Initializes a transformer that encodes categorical features into target encodings.

        Arguments:
            cols (list): Columns to encode. If None, all string columns will be encoded, otherwise only the columns provided will be encoded.
                Defaults to None
            smoothing (float): The smoothing factor to apply. The larger this value is, the more influence the expected target value has
                on the resulting target encodings. Must be strictly larger than 0. Defaults to 1.0
            handle_unknown (string): Determines how to handle unknown categories for a feature encountered. Options are 'value', 'error', and 'return_nan'.
                Defaults to 'value', which replaces with the target mean
            handle_missing (string): Determines how to handle missing values encountered during `fit` or `transform`. Options are 'value', 'error', and 'return_nan'.
                Defaults to 'value', which replaces with the target mean
            random_seed (int): Seed for the random number generator. Defaults to 0.
            **kwargs: Extra entries merged into the parameters and forwarded to the encoder.

        Raises:
            ValueError: If handle_unknown or handle_missing is not a supported
                option, or if smoothing is not strictly positive.
        """
        allowed_options = ('error', 'return_nan', 'value')
        if handle_unknown not in allowed_options:
            raise ValueError(
                "Invalid input '{}' for handle_unknown".format(handle_unknown))
        if handle_missing not in allowed_options:
            raise ValueError(
                "Invalid input '{}' for handle_missing".format(handle_missing))
        if smoothing <= 0:
            raise ValueError(
                "Smoothing value needs to be strictly larger than 0. {} provided"
                .format(smoothing))

        parameters = {
            "cols": cols,
            "smoothing": smoothing,
            "handle_unknown": handle_unknown,
            "handle_missing": handle_missing,
            **kwargs
        }

        category_encode = import_or_raise(
            'category_encoders',
            error_msg='category_encoders not installed. Please install using `pip install category_encoders`')
        encoder = category_encode.target_encoder.TargetEncoder(**parameters)

        super().__init__(parameters=parameters,
                         component_obj=encoder,
                         random_seed=random_seed)
    def __init__(self,
                 date_index=None,
                 trend=None,
                 start_p=2,
                 d=0,
                 start_q=2,
                 max_p=5,
                 max_d=2,
                 max_q=5,
                 seasonal=True,
                 n_jobs=-1,
                 random_seed=0,
                 **kwargs):
        """Configure the component around sktime's ``AutoARIMA``.

        Arguments:
            date_index (str): Specifies the name of the column in X that provides the datetime objects. Defaults to None.
            trend (str): Controls the deterministic trend. Options are ['n', 'c', 't', 'ct'] where 'c' is a constant term,
                't' indicates a linear trend, and 'ct' is both. Can also be an iterable when defining a polynomial, such
                as [1, 1, 0, 1].
            start_p (int): Minimum Autoregressive order.
            d (int): Minimum Differencing degree.
            start_q (int): Minimum Moving Average order.
            max_p (int): Maximum Autoregressive order.
            max_d (int): Maximum Differencing degree.
            max_q (int): Maximum Moving Average order.
            seasonal (bool): Whether to fit a seasonal model to ARIMA.
            n_jobs (int): Forwarded to ``AutoARIMA`` with the other parameters. Defaults to -1.
            random_seed (int): Passed to the parent initializer. Defaults to 0.
            **kwargs: Extra entries merged into the parameters and forwarded to ``AutoARIMA``.
        """
        parameters = dict(trend=trend,
                          start_p=start_p,
                          d=d,
                          start_q=start_q,
                          max_p=max_p,
                          max_d=max_d,
                          max_q=max_q,
                          seasonal=seasonal,
                          n_jobs=n_jobs,
                          date_index=date_index)
        parameters.update(kwargs)

        sktime_arima = import_or_raise(
            "sktime.forecasting.arima",
            error_msg="sktime is not installed. Please install using `pip install sktime.`")

        # NOTE(review): `date_index` is part of the keyword arguments sent to
        # AutoARIMA here - confirm the installed sktime version accepts it.
        arima_model = sktime_arima.AutoARIMA(**parameters)

        super().__init__(parameters=parameters,
                         component_obj=arima_model,
                         random_seed=random_seed)
Beispiel #15
0
def graph_binary_objective_vs_threshold(pipeline, X, y, objective, steps=100):
    """Plot an objective's score against decision thresholds for a fitted binary classification pipeline.

    Arguments:
        pipeline (PipelineBase or subclass): Fitted pipeline
        X (ww.DataTable, pd.DataFrame): The input data used to score and compute scores
        y (ww.DataColumn, pd.Series): The target labels
        objective (ObjectiveBase obj, str): Objective used to score, shown on the y-axis of the graph
        steps (int): Number of intervals to divide and calculate objective score at

    Returns:
        plotly.Figure representing the objective score vs. threshold graph generated

    """
    _go = import_or_raise(
        "plotly.graph_objects",
        error_msg="Cannot find dependency plotly.graph_objects")
    if jupyter_check():
        import_or_raise("ipywidgets", warning=True)

    objective = get_objective(objective, return_instance=True)
    df = binary_objective_vs_threshold(pipeline, X, y, objective, steps)
    thresholds = df['threshold']
    scores = df['score']

    x_axis = {
        'title': 'Threshold',
        'range': _calculate_axis_range(thresholds)
    }
    y_axis = {
        'title':
        f"{objective.name} Scores vs. Binary Classification Decision Threshold",
        'range': _calculate_axis_range(scores)
    }
    layout = _go.Layout(
        title={'text': f'{objective.name} Scores vs. Thresholds'},
        xaxis=x_axis,
        yaxis=y_axis)

    score_trace = _go.Scatter(x=thresholds, y=scores, line=dict(width=3))
    return _go.Figure(layout=layout, data=[score_trace])
Beispiel #16
0
 def __init__(self,
              sampling_strategy='auto',
              test_size=None,
              n_jobs=-1,
              random_seed=0):
     """Build a SMOTETomek sampler and hand it to the parent initializer.

     Arguments:
         sampling_strategy: Passed to ``SMOTETomek``. Defaults to 'auto'.
         test_size: Passed to the parent initializer. Defaults to None.
         n_jobs (int): Passed to ``SMOTETomek``. Defaults to -1.
         random_seed (int): Used as the sampler's ``random_state`` and passed
             to the parent initializer. Defaults to 0.
     """
     combine = import_or_raise(
         "imblearn.combine",
         error_msg="imbalanced-learn is not installed. Please install using 'pip install imbalanced-learn'")
     sampler_kwargs = {
         'sampling_strategy': sampling_strategy,
         'n_jobs': n_jobs,
         'random_state': random_seed
     }
     self.sampler = combine.SMOTETomek(**sampler_kwargs)
     super().__init__(sampler=self.sampler,
                      test_size=test_size,
                      split_type="TV",
                      random_seed=random_seed)
Beispiel #17
0
    def __init__(self, eta=0.1, max_depth=6, min_child_weight=1, n_estimators=100, random_state=0, **kwargs):
        """Configure the component around an ``xgboost.XGBClassifier``.

        The named arguments and any ``**kwargs`` are recorded as the
        component's parameters and forwarded to ``XGBClassifier``.
        ``random_state`` is converted to an integer seed with
        ``get_random_seed`` for xgboost, while the original value goes to the
        parent initializer.
        """
        random_seed = get_random_seed(random_state, self.SEED_MIN, self.SEED_MAX)
        parameters = dict(eta=eta,
                          max_depth=max_depth,
                          min_child_weight=min_child_weight,
                          n_estimators=n_estimators)
        parameters.update(kwargs)

        xgb = import_or_raise(
            "xgboost",
            error_msg="XGBoost is not installed. Please install using `pip install xgboost.`")
        xgb_classifier = xgb.XGBClassifier(**parameters,
                                           random_state=random_seed)

        super().__init__(parameters=parameters,
                         component_obj=xgb_classifier,
                         random_state=random_state)
Beispiel #18
0
def graph_precision_recall_curve(y_true, y_pred_proba, title_addition=None):
    """Generate and display a precision-recall plot.

    Arguments:
        y_true (ww.DataColumn, pd.Series or np.ndarray): True binary labels.
        y_pred_proba (ww.DataColumn, pd.Series or np.ndarray): Predictions from a binary classifier, before thresholding has been applied. Note this should be the predicted probability for the "true" label.
        title_addition (str or None): If not None, append to plot title. Default None.

    Returns:
        plotly.Figure representing the precision-recall plot generated
    """
    _go = import_or_raise(
        "plotly.graph_objects",
        error_msg="Cannot find dependency plotly.graph_objects")
    if jupyter_check():
        import_or_raise("ipywidgets", warning=True)

    curve = precision_recall_curve(y_true, y_pred_proba)

    title_suffix = '' if title_addition is None else ' ' + title_addition
    title = 'Precision-Recall{}'.format(title_suffix)

    # Both axes use the same range, slightly wider than [0, 1].
    unit_range = [-0.05, 1.05]
    layout = _go.Layout(title={'text': title},
                        xaxis={'title': 'Recall', 'range': list(unit_range)},
                        yaxis={'title': 'Precision', 'range': list(unit_range)})

    trace_name = 'Precision-Recall (AUC {:06f})'.format(curve['auc_score'])
    curve_trace = _go.Scatter(x=curve['recall'],
                              y=curve['precision'],
                              name=trace_name,
                              line=dict(width=3))
    return _go.Figure(layout=layout, data=[curve_trace])
Beispiel #19
0
 def __init__(self,
              sampling_strategy='auto',
              k_neighbors=2,
              test_size=None,
              random_seed=0,
              **kwargs):
     """Build a KMeansSMOTE sampler and hand it to the parent initializer.

     Arguments:
         sampling_strategy: Passed to ``KMeansSMOTE``. Defaults to 'auto'.
         k_neighbors (int): Passed to ``KMeansSMOTE``. Defaults to 2.
         test_size: Passed to the parent initializer. Defaults to None.
         random_seed (int): Used as the sampler's ``random_state`` and passed
             to the parent initializer. Defaults to 0.
         **kwargs: Additional keyword arguments forwarded to ``KMeansSMOTE``.
     """
     over_sampling = import_or_raise(
         "imblearn.over_sampling",
         error_msg="imbalanced-learn is not installed. Please install using 'pip install imbalanced-learn'")
     sampler = over_sampling.KMeansSMOTE(sampling_strategy=sampling_strategy,
                                         k_neighbors=k_neighbors,
                                         random_state=random_seed,
                                         **kwargs)
     self.sampler = sampler
     super().__init__(sampler=self.sampler,
                      test_size=test_size,
                      split_type="TV",
                      random_seed=random_seed)
Beispiel #20
0
    def search_iteration_plot(self, interactive_plot=False):
        """Shows a plot of the best score at each iteration using data gathered during training.

        Arguments:
            interactive_plot (bool): When True, display the plot's figure
                through IPython and return the SearchIterationPlot object.
                If that path raises ImportError, the static figure is
                returned instead. Defaults to False.

        Returns:
            plot
        """
        if interactive_plot:
            try:
                ipython_display = import_or_raise(
                    "IPython.display",
                    error_msg="Cannot find dependency IPython.display")
                live_plot = SearchIterationPlot(self.data)
                ipython_display.display(live_plot.best_score_by_iter_fig)
            except ImportError:
                # Fall back to the non-interactive figure.
                return self.search_iteration_plot(interactive_plot=False)
            else:
                return live_plot

        static_plot = SearchIterationPlot(self.data)
        return self._go.Figure(static_plot.best_score_by_iter_fig)
Beispiel #21
0
    def graph(self, name=None, graph_format=None):
        """Generate an image representing the component graph

        Arguments:
            name (str): Name of the graph. Defaults to None.
            graph_format (str): file format to save the graph in. Defaults to None.

        Returns:
            graphviz.Digraph: Graph object that can be directly displayed in Jupyter notebooks.

        Raises:
            RuntimeError: If the graphviz backend executable cannot be found.
        """
        graphviz = import_or_raise(
            'graphviz',
            error_msg='Please install graphviz to visualize pipelines.')

        # Try rendering a dummy graph to see if a working backend is installed
        try:
            graphviz.Digraph().pipe()
        except graphviz.backend.ExecutableNotFound as err:
            raise RuntimeError(
                "To visualize component graphs, a graphviz backend is required.\n"
                +
                "Install the backend using one of the following commands:\n" +
                "  Mac OS: brew install graphviz\n" +
                "  Linux (Ubuntu): sudo apt-get install graphviz\n" +
                "  Windows: conda install python-graphviz\n") from err

        graph = graphviz.Digraph(name=name,
                                 format=graph_format,
                                 graph_attr={'splines': 'ortho'})
        graph.attr(rankdir='LR')

        # Graphviz's left-justified line break is a backslash followed by
        # "l". It is spelled '\\l' because a bare '\l' is an invalid Python
        # escape sequence; the resulting string is identical.
        line_break = '\\l'
        for component_name, component in self.component_instances.items():
            label = '%s\\l' % (component_name)
            if isinstance(component, ComponentBase):
                # One "key : value" line per parameter; floats are shown with
                # two decimals.
                parameters = line_break.join([
                    key + ' : ' + "{:0.2f}".format(val) if
                    (isinstance(val, float)) else key + ' : ' + str(val)
                    for key, val in component.parameters.items()
                ])
                label = '%s |%s\\l' % (component_name, parameters)
            graph.node(component_name, shape='record', label=label)

        edges = self._get_edges(self.component_dict)
        graph.edges(edges)
        return graph
Beispiel #22
0
def graph_prediction_vs_actual_over_time(pipeline, X, y, dates):
    """Draw the target values and the pipeline's predictions as two lines over time.

    Arguments:
        pipeline (TimeSeriesRegressionPipeline): Fitted time series regression pipeline.
        X (ww.DataTable, pd.DataFrame): Features used to generate new predictions.
        y (ww.DataColumn, pd.Series): Target values to compare predictions against.
        dates (ww.DataColumn, pd.Series): Dates corresponding to target values and predictions.

    Returns:
        plotly.Figure showing the prediction vs actual over time.

    Raises:
        ValueError: If the pipeline's problem type is not time series regression.
    """
    _go = import_or_raise(
        "plotly.graph_objects",
        error_msg="Cannot find dependency plotly.graph_objects")

    problem_type = pipeline.problem_type
    if problem_type != ProblemTypes.TIME_SERIES_REGRESSION:
        raise ValueError(
            "graph_prediction_vs_actual_over_time only supports time series regression pipelines! "
            f"Received {str(problem_type)}.")

    plot_df = get_prediction_vs_actual_over_time_data(pipeline, X, y, dates)

    # One entry per trace: (column to plot, legend name, line colour).
    trace_specs = (
        ("target", "Target", '#1f77b4'),
        ("prediction", 'Prediction', '#d62728'),
    )
    traces = [
        _go.Scatter(x=plot_df["dates"],
                    y=plot_df[column],
                    mode='lines+markers',
                    name=legend_name,
                    line={'color': colour})
        for column, legend_name, colour in trace_specs
    ]

    # No explicit tick format on the x-axis: plotly picks the date format.
    layout = _go.Layout(title={'text': "Prediction vs Target over time"},
                        xaxis={'title': 'Time'},
                        yaxis={'title': 'Target Values and Predictions'})

    return _go.Figure(data=traces, layout=layout)
 def _format_dates(self, dates, X, y, predict=False):
     """Turn ``dates`` into a ``PeriodIndex``, align X and y to it, and optionally build a forecasting horizon.

     Arguments:
         dates: Datetime values exposing ``.shape``; either one-dimensional or a
             two-dimensional frame with exactly one column. The object passed
             in by the caller is not modified.
         X: Feature data handed to ``self._match_indices``.
         y: Target data handed to ``self._match_indices``.
         predict (bool): When True, also create an absolute sktime
             ``ForecastingHorizon`` from the converted dates. Defaults to False.

     Returns:
         tuple: ``(X, y, fh)`` where X and y are the values returned by
             ``self._match_indices`` and ``fh`` is the forecasting horizon, or
             None when ``predict`` is False.

     Raises:
         ValueError: If ``dates`` has more than one column.
     """
     if len(dates.shape) == 1:
         dates = pd.DataFrame(dates)
     if dates.shape[1] == 1:
         # Use the non-inplace set_index: with inplace=True a DataFrame passed
         # in by the caller would lose its only column as a side effect.
         dates = pd.DatetimeIndex(
             dates.set_index(dates.columns[0], drop=True).index)
     elif dates.shape[1] > 1:
         raise ValueError(
             "The dates parameter should not consist of any additional data outside of the datetime information located in the index or in a column."
             f" Found {dates.shape[1]} columns.")
     # Infer the frequency once; an inferred 'MS' is swapped for 'M' before it
     # is used as the period frequency.
     inferred_freq = pd.infer_freq(dates)
     freq = 'M' if inferred_freq == 'MS' else inferred_freq
     dates = dates.to_period(freq=freq)
     X, y = self._match_indices(X, y, dates)
     if not predict:
         return X, y, None
     arima_model_msg = "sktime is not installed. Please install using `pip install sktime.`"
     forecasting_ = import_or_raise("sktime.forecasting.base",
                                    error_msg=arima_model_msg)
     fh_ = forecasting_.ForecastingHorizon(dates, is_relative=False)
     return X, y, fh_
Beispiel #24
0
    def __init__(self,
                 date_column=None,
                 trend='n',
                 p=1,
                 d=0,
                 q=0,
                 random_seed=0,
                 **kwargs):
        """Collect the ARIMA parameters and check that statsmodels accepts them.

        Arguments:
            date_column (str): Specifies the name of the column in X that provides the datetime objects. Defaults to None.
            trend (str): Controls the deterministic trend. Options are ['n', 'c', 't', 'ct'] where 'c' is a constant term,
                't' indicates a linear trend, and 'ct' is both. Can also be an iterable when defining a polynomial, such
                as [1, 1, 0, 1].
            p (int or list(int)): Autoregressive order.
            d (int): Differencing degree.
            q (int or list(int)): Moving Average order.
            random_seed (int): Forwarded unchanged to the parent constructor. Defaults to 0.
            **kwargs: Extra keyword arguments merged into the parameters given to ``ARIMA``.

        Raises:
            TypeError: If the trial ``ARIMA`` construction raises a TypeError.
        """
        self.date_column = date_column

        # Later keys win, so kwargs may override 'order' or 'trend'.
        parameters = {'order': (p, d, q), 'trend': trend, **kwargs}

        statsmodels_arima = import_or_raise(
            "statsmodels.tsa.arima.model",
            error_msg="ARIMA is not installed. Please install using `pip install statsmodels`.")

        try:
            # List-valued orders are summed to size the trial series.
            p_total = sum(p) if isinstance(p, list) else p
            q_total = sum(q) if isinstance(q, list) else q
            # Trial construction on an all-zero series: only used to surface
            # a TypeError from the supplied parameters; the model is discarded.
            trial_series = np.zeros(p_total + d + q_total + 1)
            statsmodels_arima.ARIMA(endog=trial_series, **parameters)
        except TypeError:
            raise TypeError(
                "Unable to instantiate ARIMA due to an unexpected argument")

        # The individual orders are also stored alongside the combined 'order'.
        parameters.update(p=p, d=d, q=q)

        super().__init__(parameters=parameters,
                         component_obj=None,
                         random_seed=random_seed)
Beispiel #25
0
 def __init__(self,
              categorical_features=None,
              sampling_strategy='auto',
              test_size=None,
              n_jobs=-1,
              random_seed=0):
     """Build a SMOTENC sampler and hand it to the parent constructor.

     Arguments:
         categorical_features: Checked with ``_allowed_categorical`` and then
             given to ``SMOTENC``. Defaults to None.
         sampling_strategy: Given to ``SMOTENC`` unchanged. Defaults to 'auto'.
         test_size: Given to the parent constructor unchanged. Defaults to None.
         n_jobs (int): Given to ``SMOTENC`` unchanged. Defaults to -1.
         random_seed (int): Used as ``SMOTENC``'s ``random_state`` and also
             given to the parent constructor. Defaults to 0.

     Raises:
         ValueError: If ``_allowed_categorical(categorical_features)`` is falsy.
     """
     imblearn_over_sampling = import_or_raise(
         "imblearn.over_sampling",
         error_msg="imbalanced-learn is not installed. Please install using 'pip install imbalanced-learn'")

     categorical_ok = _allowed_categorical(categorical_features)
     if not categorical_ok:
         raise ValueError(
             f"Categorical feature array must be a list with values and must not all be True, received {categorical_features}"
         )

     self.categorical_features = categorical_features
     smotenc_kwargs = {
         'categorical_features': self.categorical_features,
         'sampling_strategy': sampling_strategy,
         'n_jobs': n_jobs,
         'random_state': random_seed,
     }
     self.sampler = imblearn_over_sampling.SMOTENC(**smotenc_kwargs)

     # "TV" is the split type this splitter registers with its parent class.
     super().__init__(sampler=self.sampler,
                      test_size=test_size,
                      split_type="TV",
                      random_seed=random_seed)
Beispiel #26
0
def graph_partial_dependence(pipeline,
                             X,
                             feature,
                             class_label=None,
                             grid_resolution=100):
    """Create an one-way partial dependence plot.

    Arguments:
        pipeline (PipelineBase or subclass): Fitted pipeline
        X (ww.DataTable, pd.DataFrame, np.ndarray): The input data used to generate a grid of values
            for feature where partial dependence will be calculated at
        feature (int, string): The target feature for which to create the partial dependence plot for.
            If feature is an int, it must be the index of the feature to use.
            If feature is a string, it must be a valid column name in X.
        class_label (string, optional): Name of class to plot for multiclass problems. If None, will plot
            the partial dependence for each class. This argument does not change behavior for regression or binary
            classification pipelines. For binary classification, the partial dependence for the positive label will
            always be displayed. Defaults to None.
        grid_resolution (int): Forwarded unchanged to ``partial_dependence``. Defaults to 100.

    Returns:
        plotly.graph_objects.Figure: The partial dependence plot. For multiclass pipelines the figure
            holds one subplot per plotted class; otherwise it holds a single trace.

    Raises:
        ValueError: If the pipeline is a multiclass pipeline and class_label is not None and not
            one of ``pipeline.classes_``.

    """
    _go = import_or_raise(
        "plotly.graph_objects",
        error_msg="Cannot find dependency plotly.graph_objects")
    if jupyter_check():
        import_or_raise("ipywidgets", warning=True)

    is_multiclass = isinstance(
        pipeline, evalml.pipelines.MulticlassClassificationPipeline)
    if is_multiclass and class_label is not None and class_label not in pipeline.classes_:
        # str() each class so that non-string labels (e.g. ints) still produce
        # the intended ValueError rather than a TypeError from str.join.
        known_classes = ', '.join(str(c) for c in pipeline.classes_)
        msg = f"Class {class_label} is not one of the classes the pipeline was fit on: {known_classes}"
        raise ValueError(msg)

    part_dep = partial_dependence(pipeline,
                                  X,
                                  feature=feature,
                                  grid_resolution=grid_resolution)
    feature_name = str(feature)
    title = f"Partial Dependence of '{feature_name}'"
    layout = _go.Layout(title={'text': title},
                        xaxis={'title': f'{feature_name}'},
                        yaxis={'title': 'Partial Dependence'},
                        showlegend=False)

    if not is_multiclass:
        trace = _go.Scatter(x=part_dep['feature_values'],
                            y=part_dep['partial_dependence'],
                            name='Partial Dependence',
                            line=dict(width=3))
        return _go.Figure(layout=layout, data=[trace])

    class_labels = [class_label
                    ] if class_label is not None else pipeline.classes_
    _subplots = import_or_raise(
        "plotly.subplots",
        error_msg="Cannot find dependency plotly.subplots")

    # If the user passes in a value for class_label, we want to create a 1 x 1 subplot or else there would
    # be an empty column in the plot and it would look awkward
    n_labels = len(class_labels)
    rows, cols = ((n_labels + 1) // 2, 2) if n_labels > 1 else (1, n_labels)

    # Don't specify share_xaxis and share_yaxis so that we get tickmarks in each subplot
    fig = _subplots.make_subplots(rows=rows,
                                  cols=cols,
                                  subplot_titles=class_labels)
    for i, label in enumerate(class_labels):
        # Select this class's rows once and reuse the mask for x and y.
        is_label = part_dep.class_label == label
        # Subplot rows/cols are 1-indexed; traces fill a two-column grid
        # left to right, top to bottom.
        fig.add_trace(_go.Scatter(
            x=part_dep.loc[is_label, 'feature_values'],
            y=part_dep.loc[is_label, 'partial_dependence'],
            line=dict(width=3),
            name=label),
                      row=(i + 2) // 2,
                      col=(i % 2) + 1)
    fig.update_layout(layout)
    # Every subplot uses the same axis ranges, computed over all classes.
    fig.update_xaxes(title=f'{feature_name}',
                     range=_calculate_axis_range(
                         part_dep['feature_values']))
    fig.update_yaxes(
        range=_calculate_axis_range(part_dep['partial_dependence']))

    return fig
Beispiel #27
0
def visualize_decision_tree(estimator,
                            max_depth=None,
                            rotate=False,
                            filled=False,
                            filepath=None):
    """Render a fitted decision tree estimator with graphviz, optionally saving it to disk.

    Arguments:
        estimator (ComponentBase): A fitted DecisionTree-based estimator.
        max_depth (int, optional): The depth to which the tree should be displayed. If set to None (as by default),
        tree is fully generated.
        rotate (bool, optional): Orient tree left to right rather than top-down.
        filled (bool, optional): Paint nodes to indicate majority class for classification, extremity of values for
        regression, or purity of node for multi-output.
        filepath (str, optional): Path to where the graph should be saved. If set to None (as by default), the graph
        will not be saved.

    Returns:
        graphviz.Source: DOT object that can be directly displayed in Jupyter notebooks.

    Raises:
        ValueError: If the estimator is not in the decision tree model family, if max_depth is truthy but
            not a non-negative integer, if filepath cannot be opened for writing, or if the filepath
            extension is not a format graphviz supports.
        NotFittedError: If the estimator has not been fitted.
    """
    if not estimator.model_family == ModelFamily.DECISION_TREE:
        raise ValueError(
            "Tree visualizations are only supported for decision tree estimators"
        )
    # A falsy max_depth (None or 0) is passed through without validation.
    if max_depth and not (isinstance(max_depth, int) and max_depth >= 0):
        raise ValueError(
            "Unknown value: '{}'. The parameter max_depth has to be a non-negative integer"
            .format(max_depth))
    if not estimator._is_fitted:
        raise NotFittedError(
            "This DecisionTree estimator is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
        )

    fitted_tree = estimator._component_obj

    graphviz = import_or_raise(
        'graphviz', error_msg='Please install graphviz to visualize trees.')

    graph_format = None
    if filepath:
        # Cast to str in case a Path object was passed in
        filepath = str(filepath)
        # Opening for write is the writeability check (it creates/truncates the file).
        try:
            with open(filepath, 'w'):
                pass
        except (IOError, FileNotFoundError):
            raise ValueError(
                'Specified filepath is not writeable: {}'.format(filepath))

        path_and_name, extension = os.path.splitext(filepath)
        if not extension:
            # If the filepath has no extension default to pdf
            graph_format = 'pdf'
        else:
            graph_format = extension[1:].lower()  # ignore the dot
            supported_filetypes = graphviz.backend.FORMATS
            if graph_format not in supported_filetypes:
                raise ValueError(
                    "Unknown format '{}'. Make sure your format is one of the following: {}"
                    .format(graph_format, supported_filetypes))

    dot_data = export_graphviz(decision_tree=fitted_tree,
                               max_depth=max_depth,
                               rotate=rotate,
                               filled=filled)
    source_obj = graphviz.Source(source=dot_data, format=graph_format)
    if filepath:
        # render() is given the extension-less path and the format set above.
        source_obj.render(filename=path_and_name, cleanup=True)

    return source_obj
Beispiel #28
0
def graph_confusion_matrix(y_true,
                           y_pred,
                           normalize_method='true',
                           title_addition=None):
    """Build an annotated heatmap of the confusion matrix.

    When `normalize_method` is set the cells show normalized values and the hover text adds the raw count;
    when it is None the cells show raw counts and the hover text adds the count normalized with method 'true'.

    Arguments:
        y_true (ww.DataColumn, pd.Series or np.ndarray): True binary labels.
        y_pred (ww.DataColumn, pd.Series or np.ndarray): Predictions from a binary classifier.
        normalize_method ({'true', 'pred', 'all', None}): Normalization method to use, if not None. Supported options are: 'true' to normalize by row, 'pred' to normalize by column, or 'all' to normalize by all values. Defaults to 'true'.
        title_addition (str or None): if not None, append to plot title. Defaults to None.

    Returns:
        plotly.Figure representing the confusion matrix plot generated
    """
    _go = import_or_raise(
        "plotly.graph_objects",
        error_msg="Cannot find dependency plotly.graph_objects")
    _ff = import_or_raise(
        "plotly.figure_factory",
        error_msg="Cannot find dependency plotly.figure_factory")
    if jupyter_check():
        import_or_raise("ipywidgets", warning=True)

    # Both matrices are always computed: one fills the cells, the other the hover text.
    conf_mat = confusion_matrix(y_true, y_pred, normalize_method=None)
    conf_mat_normalized = confusion_matrix(
        y_true, y_pred, normalize_method=normalize_method or 'true')
    labels = conf_mat.columns.tolist()

    title_suffix = '' if title_addition is None else (' ' + title_addition)
    method_suffix = '' if normalize_method is None else (
        ', normalized using method "' + normalize_method + '"')
    title = 'Confusion matrix{}{}'.format(title_suffix, method_suffix)

    if normalize_method is None:
        z_data, custom_data = conf_mat, conf_mat_normalized
        primary_heading, secondary_heading = 'Raw', 'Normalized'
    else:
        z_data, custom_data = conf_mat_normalized, conf_mat
        primary_heading, secondary_heading = 'Normalized', 'Raw'

    z_data = z_data.to_numpy()
    z_text = [["{:.3f}".format(cell) for cell in row] for row in z_data]

    # the "<extra> tags at the end are necessary to remove unwanted trace info
    hover_template = ''.join([
        '<b>True</b>: %{y}<br><b>Predicted</b>: %{x}',
        '<br><b>', primary_heading, ' Count</b>: %{z}',
        '<br><b>', secondary_heading, ' Count</b>: %{customdata} <br>',
        '<extra></extra>',
    ])

    def _category_axis(axis_title):
        # Both axes are categorical and ticked with the same class labels.
        return {'title': axis_title, 'type': 'category', 'tickvals': labels}

    layout = _go.Layout(title={'text': title},
                        xaxis=_category_axis('Predicted Label'),
                        yaxis=_category_axis('True Label'))
    fig = _ff.create_annotated_heatmap(z_data,
                                       x=labels,
                                       y=labels,
                                       annotation_text=z_text,
                                       customdata=custom_data,
                                       hovertemplate=hover_template,
                                       colorscale='Blues',
                                       showscale=True)
    fig.update_layout(layout)
    # put xaxis text on bottom to not overlap with title
    fig['layout']['xaxis'].update(side='bottom')
    # plotly Heatmap y axis defaults to the reverse of what we want: https://community.plotly.com/t/heatmap-y-axis-is-reversed-by-default-going-against-standard-convention-for-matrices/32180
    fig.update_yaxes(autorange="reversed")
    return fig
Beispiel #29
0
def graph_roc_curve(y_true,
                    y_pred_proba,
                    custom_class_names=None,
                    title_addition=None):
    """Generate and display a Receiver Operating Characteristic (ROC) plot for binary and multiclass classification problems.

    Arguments:
        y_true (ww.DataColumn, pd.Series or np.ndarray): True labels.
        y_pred_proba (ww.DataColumn, pd.Series or np.ndarray): Predictions from a classifier, before thresholding has been applied. Note this should a one dimensional array with the predicted probability for the "true" label in the binary case.
        custom_class_names (list or None): If not None, custom labels for classes; must contain one name per ROC curve. Default None.
        title_addition (str or None): if not None, append to plot title. Default None.

    Returns:
        plotly.Figure representing the ROC plot generated

    Raises:
        ValueError: If custom_class_names is not None and its length differs from the number of curves
            returned by ``roc_curve``.
    """
    _go = import_or_raise(
        "plotly.graph_objects",
        error_msg="Cannot find dependency plotly.graph_objects")
    if jupyter_check():
        import_or_raise("ipywidgets", warning=True)

    title = 'Receiver Operating Characteristic{}'.format(
        '' if title_addition is None else (' ' + title_addition))
    layout = _go.Layout(title={'text': title},
                        xaxis={
                            'title': 'False Positive Rate',
                            'range': [-0.05, 1.05]
                        },
                        yaxis={
                            'title': 'True Positive Rate',
                            'range': [-0.05, 1.05]
                        })

    all_curve_data = roc_curve(y_true, y_pred_proba)
    graph_data = []

    n_classes = len(all_curve_data)

    # Compare against None rather than truthiness: an empty list must fail the
    # length check here instead of causing an IndexError in the loop below.
    if custom_class_names is not None and len(custom_class_names) != n_classes:
        raise ValueError(
            'Number of custom class names does not match number of classes')

    for i in range(n_classes):
        roc_curve_data = all_curve_data[i]
        # Without custom names, curves are labelled 1..n_classes.
        name = i + 1 if custom_class_names is None else custom_class_names[i]
        graph_data.append(
            _go.Scatter(
                x=roc_curve_data['fpr_rates'],
                y=roc_curve_data['tpr_rates'],
                hovertemplate=
                "(False Positive Rate: %{x}, True Positive Rate: %{y})<br>" +
                "Threshold: %{text}",
                name=f"Class {name} (AUC {roc_curve_data['auc_score']:.06f})",
                text=roc_curve_data["thresholds"],
                line=dict(width=3)))
    # Diagonal reference line for comparison with the ROC curves.
    graph_data.append(
        _go.Scatter(x=[0, 1],
                    y=[0, 1],
                    name='Trivial Model (AUC 0.5)',
                    line=dict(dash='dash')))
    return _go.Figure(layout=layout, data=graph_data)
Beispiel #30
0
def _get_preprocessing_components(X, y, problem_type, estimator_class, sampler_name=None):
    """Choose the preprocessing component classes to place ahead of an estimator, based on the input data.

    Arguments:
        X (ww.DataTable): The input data of shape [n_samples, n_features]
        y (ww.DataColumn): The target data of length [n_samples]. Not read by this function.
        problem_type (ProblemTypes or str): Problem type
        estimator_class (class): A class which subclasses Estimator estimator for pipeline,
        sampler_name (str): The name of the sampler component to add to the pipeline. Defaults to None

    Returns:
        list[Transformer]: A list of applicable preprocessing components to use with the estimator
    """
    components = []

    # Drop columns that contain nothing but nulls.
    frame = X.to_dataframe()
    fully_null_columns = frame.columns[frame.isnull().all()]
    if len(fully_null_columns):
        components.append(DropNullColumns)

    # Impute only when at least one column has a logical type listed here.
    imputable_types = {logical_types.Boolean, logical_types.Categorical, logical_types.Double, logical_types.Integer}
    present_types = set(X.logical_types.values())
    if present_types & imputable_types:
        components.append(Imputer)

    if list(X.select('natural_language').columns):
        components.append(TextFeaturizer)

    if list(X.select('index').columns):
        components.append(DropColumns)

    # Datetime featurization and delayed features are skipped for ARIMA estimators.
    has_datetime = len(X.select(["Datetime"]).columns) > 0
    if has_datetime and estimator_class.model_family != ModelFamily.ARIMA:
        components.append(DateTimeFeaturizer)

    if is_time_series(problem_type) and estimator_class.model_family != ModelFamily.ARIMA:
        components.append(DelayedFeatureTransformer)

    # CatBoost estimators are excluded from one-hot encoding.
    has_categorical = len(X.select('category').columns) > 0
    if has_categorical and estimator_class not in {CatBoostClassifier, CatBoostRegressor}:
        components.append(OneHotEncoder)

    sampler_components = {
        "Undersampler": Undersampler,
        "SMOTE Oversampler": SMOTESampler,
        "SMOTENC Oversampler": SMOTENCSampler,
        "SMOTEN Oversampler": SMOTENSampler
    }
    if sampler_name is not None:
        try:
            import_or_raise("imblearn.over_sampling", error_msg="imbalanced-learn is not installed")
            chosen_sampler = sampler_components[sampler_name]
        except ImportError:
            # Without imbalanced-learn, fall back to the Undersampler component.
            logger.debug('Could not import imblearn.over_sampling, so defaulting to use Undersampler')
            chosen_sampler = Undersampler
        components.append(chosen_sampler)

    if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
        components.append(StandardScaler)

    return components