def transform_ct(x_in, model, encoding):
    """
    Apply a preprocessing step (ColumnTransformer or list) to a raw dataset.

    A ColumnTransformer horizontally stacks the outputs of its transformers, so
    the resulting columns are identified by position rather than by their
    original names. Depending on the model family, the output columns are
    therefore either named after their position (``col_0``, ``col_1``, ...) or
    after the features reported by ``extract_features_model`` for the model.

    Parameters
    ----------
    x_in : pandas.DataFrame
        Raw dataset to apply preprocessing
    model: model object
        Model whose type decides how the transformed columns are named
        (types listed in ``sklearn_model`` get positional names, types listed
        in ``other_model`` get the names extracted from the model).
    encoding : ColumnTransformer or list
        Either a fitted ColumnTransformer, or a list which is delegated to
        ``transform_ordinal``.

    Returns
    -------
    pandas.DataFrame
        The data preprocessed with the given encoding. For a list encoding,
        this is whatever ``transform_ordinal`` returns.

    Raises
    ------
    ValueError
        If the model type is neither in ``sklearn_model`` nor in
        ``other_model``, or if ``encoding`` is neither a ColumnTransformer
        nor a list.
    """
    if str(type(encoding)) == columntransformer:
        model_type = str(type(model))

        if model_type in sklearn_model:
            # The transformer output carries no column names: label columns by position.
            rst = pd.DataFrame(encoding.transform(x_in), index=x_in.index)
            rst.columns = [f"col_{feature}" for feature in rst.columns]
            return rst

        if model_type in other_model:
            # Reuse the feature names the model itself reports.
            return pd.DataFrame(encoding.transform(x_in),
                                columns=extract_features_model(
                                    model, dict_model_feature[model_type]),
                                index=x_in.index)

        raise ValueError("Model specified isn't supported by Shapash.")

    # isinstance also accepts list subclasses, unlike a comparison on str(type(...)).
    if isinstance(encoding, list):
        return transform_ordinal(x_in, encoding)

    # ValueError is still caught by callers handling the generic Exception.
    raise ValueError(
        f"{encoding.__class__.__name__} not supported, no preprocessing done."
    )
Beispiel #2
0
def check_consistency_model_features(features_dict,
                                     model,
                                     columns_dict,
                                     features_types,
                                     mask_params=None,
                                     preprocessing=None,
                                     postprocessing=None,
                                     list_preprocessing=None,
                                     features_groups=None):
    """
    Check that the feature names used by the different attributes are consistent.

    Parameters
    ----------
    features_dict: dict (optional)
        Dictionary mapping technical feature names to domain names.
        May be None, in which case it is not checked.
    model: model object
        Model whose expected features are read through ``extract_features_model``.
    columns_dict: dict
        Dictionary mapping integer column number (in the same order of the trained dataset) to technical feature names.
    features_types: dict
        Dictionary mapping features with the right types needed.
    mask_params: dict (optional)
        Dictionary allowing the user to define a filter to summarize the local explainability.
        Only its 'features_to_hide' entry is checked here.
    preprocessing: category_encoders, ColumnTransformer, list or dict (optional)
        The processing applied to the original data
    postprocessing : dict (optional)
        Dictionary of postprocessing that needs to be checked.
    list_preprocessing: list (optional)
        List containing all preprocessing; passed to ``get_list_features_names``.
    features_groups: dict (optional)
        Groups of features, keyed by group name. Group names found in
        features_dict are ignored by the checks.

    Raises
    ------
    ValueError
        If any of the consistency checks fails.
    """
    # features_dict can include additional entries naming groups of features.
    # Those entries may not exist in the other dicts, so they are left out of
    # the check. Filtering the keys (instead of popping from a deep copy) keeps
    # the caller's dict untouched and also works when features_dict is None.
    if features_dict is not None:
        group_names = features_groups if features_groups is not None else {}
        checked_features = [
            feat for feat in features_dict if feat not in group_names
        ]
        if not all(feat in features_types for feat in checked_features):
            raise ValueError(
                "All features of features_dict must be in features_types")

    column_features = set(columns_dict.values())

    if set(features_types) != column_features:
        raise ValueError(
            "features of features_types and columns_dict must be the same")

    if mask_params is not None:
        hidden_features = mask_params['features_to_hide']
        if hidden_features is not None:
            # features_types and columns_dict hold the same names at this point.
            if not all(feature in column_features
                       for feature in hidden_features):
                raise ValueError(
                    "All features of mask_params must be in model")

    preprocessing_type = str(type(preprocessing))

    if preprocessing is not None and preprocessing_type in supported_category_encoder:
        if not set(preprocessing.cols).issubset(column_features):
            raise ValueError(
                "All features of preprocessing must be in columns_dict")

    # extract_features_model yields either a list of feature names or a
    # number of features, depending on the model.
    model_features = extract_features_model(
        model, dict_model_feature[str(type(model))])
    if isinstance(model_features, list):
        feature_expected_model = model_features
        model_expected = len(set(model_features))
    else:
        feature_expected_model = None
        model_expected = model_features

    if preprocessing is None:
        if isinstance(feature_expected_model, list):
            expected_names = set(feature_expected_model)
            if column_features != expected_names:
                # Retry with stringified names before failing, so that
                # non-string column names can match string feature names.
                columns_as_str = {
                    str(feature) for feature in columns_dict.values()
                }
                if columns_as_str != expected_names:
                    raise ValueError(
                        "Features of columns_dict and model must be the same.")
        elif len(column_features) != model_expected:
            raise ValueError(
                "Features of columns_dict and model must have the same length"
            )

    if preprocessing_type in supported_category_encoder and isinstance(
            feature_expected_model, list):
        if set(preprocessing.feature_names) != set(feature_expected_model):
            raise ValueError("""
                                One of features returned by the Category_Encoders preprocessing doesn't
                                match the model's expected features.
                            """)
    elif preprocessing is not None:
        # Without comparable names, only the number of encoded features can be compared.
        feature_encoded = get_list_features_names(list_preprocessing,
                                                  columns_dict)
        if model_expected != len(feature_encoded):
            raise ValueError("""
                Number of features returned by the preprocessing step doesn't
                match the model's expected features.
                        """)

    if postprocessing:
        if not isinstance(postprocessing, dict):
            raise ValueError("Postprocessing parameter must be a dictionnary")
        for feature in postprocessing:
            if feature not in features_types:
                raise ValueError(
                    "Postprocessing and features_types must have the same features names."
                )
            if feature not in column_features:
                raise ValueError(
                    "Postprocessing and columns_dict must have the same features names."
                )
        check_postprocessing(features_types, postprocessing)
Beispiel #3
0
def _type_name_in(type_name, *groups):
    """
    Tell whether a type name belongs to at least one of the given groups.

    Each group is either a single type name (a string, compared for equality)
    or a collection of type names (tested for membership).
    """
    for group in groups:
        if isinstance(group, str):
            if type_name == group:
                return True
        elif type_name in group:
            return True
    return False


def check_consistency_model_features(features_dict,
                                     model,
                                     columns_dict,
                                     features_types,
                                     mask_params=None,
                                     preprocessing=None,
                                     postprocessing=None):
    """
    Check that the feature names used by the different attributes are consistent.

    Parameters
    ----------
    features_dict: dict (optional)
        Dictionary mapping technical feature names to domain names.
        May be None, in which case it is not checked.
    model: model object
        Model whose expected features are read through ``extract_features_model``.
    columns_dict: dict
        Dictionary mapping integer column number (in the same order of the trained dataset) to technical feature names.
    features_types: dict
        Dictionary mapping features with the right types needed.
    mask_params: dict (optional)
        Dictionary allowing the user to define a filter to summarize the local explainability.
        Only its 'features_to_hide' entry is checked here.
    preprocessing: category_encoders, ColumnTransformer, list or dict (optional)
        The processing applied to the original data
    postprocessing : dict (optional)
        Dictionary of postprocessing that needs to be checked.

    Raises
    ------
    ValueError
        If any of the consistency checks fails, or if the model exposes a list
        of feature names and the preprocessing type is not one of the handled ones.
    """
    if features_dict is not None:
        if not all(feat in features_types for feat in features_dict):
            raise ValueError(
                "All features of features_dict must be in features_types")

    column_features = set(columns_dict.values())

    if set(features_types) != column_features:
        raise ValueError(
            "features of features_types and model must be the same")

    if mask_params is not None:
        hidden_features = mask_params['features_to_hide']
        if hidden_features is not None:
            # features_types and columns_dict hold the same names at this point.
            if not all(feature in column_features
                       for feature in hidden_features):
                raise ValueError(
                    "All features of mask_params must be in model")

    preprocessing_type = str(type(preprocessing))

    # The type name must be looked up inside each group of supported types:
    # testing it against a tuple of groups compares it with the groups
    # themselves and never matches a member.
    if preprocessing is not None and _type_name_in(
            preprocessing_type, supported_category_encoder, supported_sklearn):
        # Guarded lookup: not every supported encoder is known to expose `cols`.
        encoded_columns = getattr(preprocessing, "cols", None)
        if encoded_columns is not None and not set(encoded_columns).issubset(
                column_features):
            raise ValueError(
                "All features of preprocessing must be in columns_dict")

    # extract_features_model yields either a list of feature names or a
    # number of features, depending on the model.
    model_features = extract_features_model(
        model, dict_model_feature[str(type(model))])
    if isinstance(model_features, list):
        if _type_name_in(preprocessing_type, no_dummies_category_encoder):
            if column_features != set(model_features):
                raise ValueError(
                    "features of columns_dict and model must be the same")

        elif _type_name_in(preprocessing_type, no_dummies_sklearn,
                           columntransformer):
            if len(column_features) != len(set(model_features)):
                raise ValueError(
                    "length of features of columns_dict and model must be the same"
                )

        elif preprocessing is not None:
            # Any other preprocessing type cannot be checked against feature names.
            raise ValueError(
                "this type of encoder is not supported in SmartPredictor")
    else:
        if len(column_features) != model_features:
            raise ValueError(
                "features of columns_dict and model must have the same length")

    if postprocessing:
        if not isinstance(postprocessing, dict):
            raise ValueError("Postprocessing parameter must be a dictionnary")
        for feature in postprocessing:
            if feature not in features_types:
                raise ValueError(
                    "Postprocessing and features_types must have the same features names."
                )
            if feature not in column_features:
                raise ValueError(
                    "Postprocessing and columns_dict must have the same features names."
                )
        check_postprocessing(features_types, postprocessing)