Example #1
def perform_global_dataframe_analysis(df: Optional[pd.DataFrame]) -> dict:
    """
    Returns a python dict containing global information about a pandas DataFrame :
    Number of features, Number of observations, missing values...

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe that will be used to compute global information.

    Returns
    -------
    global_d : dict
        dictionary that contains an ensemble of global information about the input dataframe.
    """
    if df is None:
        return dict()
    missing_values = df.isna().sum().sum()
    global_d = {
        'number of features': len(df.columns),
        'number of observations': df.shape[0],
        'missing values': missing_values,
        '% missing values': missing_values / (df.shape[0] * df.shape[1]),
    }

    for stat in global_d.keys():
        if stat == 'number of observations':
            global_d[stat] = int(global_d[stat])  # Keeping the exact number
        elif isinstance(global_d[stat], float):
            global_d[stat] = round_to_k(global_d[stat], 3)

    replace_dict_values(global_d, display_value, ',', '.')

    return global_d
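
For illustration, a minimal, self-contained sketch of the raw statistics this helper gathers; the toy DataFrame and the expected values in the comments are assumptions, and the real function additionally applies round_to_k, display_value and replace_dict_values for formatting.

# Illustrative sketch only: reproduces the raw statistics with plain pandas,
# without the shapash-specific rounding/formatting helpers.
import pandas as pd

df = pd.DataFrame({
    "age": [25, 32, None, 41],
    "city": ["Paris", "Lyon", "Paris", None],
})

missing_values = df.isna().sum().sum()
print({
    "number of features": len(df.columns),         # 2
    "number of observations": df.shape[0],         # 4
    "missing values": int(missing_values),         # 2
    "% missing values": missing_values / df.size,  # 0.25
})
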
Example #2
def perform_univariate_dataframe_analysis(df: Optional[pd.DataFrame],
                                          col_types: dict) -> dict:
    """
    Returns a python dict containing information about each column of a pandas DataFrame.
    The computed information depends on the type of the column.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe on which the analysis will be performed
    col_types : dict
        Dict of types for each column

    Returns
    -------
    d : dict
        A dict containing each column as keys and the corresponding dict of information for each column as values.
    """
    if df is None:
        return dict()
    d = df.describe().to_dict()
    for col in df.columns:
        if col_types[col] == VarType.TYPE_CAT:
            d[col] = {
                'distinct values': df[col].nunique(),
                'missing values': df[col].isna().sum()
            }

    for col in d.keys():
        for stat in d[col].keys():
            if stat in ['count', 'distinct values']:
                d[col][stat] = int(d[col][stat])  # Keep the exact number here
            elif isinstance(d[col][stat], float):
                d[col][stat] = round_to_k(d[col][stat], 3)  # Round to 3 significant figures

    replace_dict_values(d, display_value, ',', '.')

    return d
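
A short sketch of the shape of the output: numeric columns keep their pandas describe() statistics, while categorical columns are replaced by the smaller dict from the VarType.TYPE_CAT branch above. The toy DataFrame and the string stand-ins for the VarType values are assumptions.

# Illustrative sketch only; "categorical" stands in for VarType.TYPE_CAT.
import pandas as pd

df = pd.DataFrame({
    "age": [25, 32, None, 41],
    "city": ["Paris", "Lyon", "Paris", None],
})
col_types = {"age": "continuous", "city": "categorical"}

d = df.describe().to_dict()  # only numeric columns: {'age': {'count': 3.0, 'mean': ..., ...}}
for col in df.columns:
    if col_types[col] == "categorical":
        d[col] = {
            "distinct values": df[col].nunique(),         # 2
            "missing values": int(df[col].isna().sum()),  # 1
        }
print(d)
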
Example #3
    def display_model_performance(self):
        """
        Displays the performance of the model. The metrics are computed using the config dict.

        Metrics should be given as a list of dicts. Each dict contains the following keys:
        'path' (path to the metric function, ex: 'sklearn.metrics.mean_absolute_error'),
        'name' (optional, name of the metric as displayed in the report),
        and 'use_proba_values' (optional, possible values are False (default) or True
        if the metric uses proba values instead of predicted values).

        For example:
        config['metrics'] = [
                {
                    'path': 'sklearn.metrics.mean_squared_error',
                    'name': 'Mean squared error',  # Optional: name that will be displayed next to the metric
                    'use_proba_values': False  # Optional
                },
                {
                    'path': 'Scoring_AP.utils.lift10',  # Custom function path
                    'name': 'Lift10',
                    'use_proba_values': True  # Use proba values instead of predicted values
                }
            ]
        """
        if self.y_test is None:
            logging.info(
                "No labels given for test set. Skipping model performance part"
            )
            return

        print_md("### Univariate analysis of target variable")
        df = pd.concat([
            pd.DataFrame({
                self.target_name: self.y_pred
            }).assign(_dataset="pred"),
            pd.DataFrame({
                self.target_name: self.y_test
            }).assign(_dataset="true") if self.y_test is not None else None
        ])
        self._perform_and_display_analysis_univariate(
            df=df,
            col_splitter="_dataset",
            split_values=["pred", "true"],
            names=["Prediction values", "True values"],
            group_id='target-distribution')

        if 'metrics' not in self.config.keys():
            logging.info(
                "No 'metrics' key found in report config dict. Skipping model performance part."
            )
            return
        print_md("### Metrics")

        for metric in self.config['metrics']:
            if 'name' not in metric.keys():
                metric['name'] = metric['path']

            if metric['path'] in ['confusion_matrix', 'sklearn.metrics.confusion_matrix'] or \
                    metric['name'] == 'confusion_matrix':
                print_md(f"**{metric['name']} :**")
                print_html(
                    convert_fig_to_html(
                        generate_confusion_matrix_plot(y_true=self.y_test,
                                                       y_pred=self.y_pred)))
            else:
                try:
                    metric_fn = get_callable(path=metric['path'])
                    # Check whether proba values should be used instead of predicted values
                    if metric.get('use_proba_values') is True:
                        y_pred = self.explainer.proba_values
                    else:
                        y_pred = self.y_pred
                    res = metric_fn(self.y_test, y_pred)
                except Exception as e:
                    logging.info(
                        f"Could not compute the following metric: {metric['path']}.\n{e}"
                    )
                    continue
                if isinstance(res, Number):
                    res = display_value(round_to_k(res, 3))
                    print_md(f"**{metric['name']} :** {res}")
                elif isinstance(res, (list, tuple, np.ndarray)):
                    print_md(f"**{metric['name']} :**")
                    print_html(
                        pd.DataFrame(res).to_html(classes="greyGridTable"))
                elif isinstance(res, str):
                    print_md(f"**{metric['name']} :**")
                    print_html(f"<pre>{res}</pre>")
                else:
                    logging.info(
                        f"Could not display the following metric: {metric['path']}.\n"
                        f"Result of type {type(res)} cannot be displayed")
        print_md('---')
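
Each entry in config['metrics'] is located through its dotted 'path' string and turned into a callable by get_callable. A possible sketch of that resolution using importlib is shown below; it assumes nothing about shapash's actual implementation of get_callable.

# Sketch of dotted-path resolution; get_callable in shapash may be implemented differently.
from importlib import import_module

def resolve_callable(path: str):
    """Split 'pkg.module.func' into module and attribute, import the module and fetch the attribute."""
    module_path, _, attr = path.rpartition(".")
    return getattr(import_module(module_path), attr)

mae = resolve_callable("sklearn.metrics.mean_absolute_error")
print(mae([1, 2, 3], [1, 2, 4]))  # ~0.333 (requires scikit-learn to be installed)
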
Example #4
    def test_round_to_k_1(self):
        x = 123456789
        expected_r_x = 123000000
        assert round_to_k(x, 3) == expected_r_x
Example #5
    def test_round_to_k_6(self):
        x = 0.0000123456789
        expected_r_x = 0.0000123
        assert round_to_k(x, 3) == expected_r_x
Example #6
    def test_round_to_k_4(self):
        x = 123.456789
        expected_r_x = 123
        assert round_to_k(x, 3) == expected_r_x
Example #7
    def test_round_to_k_3(self):
        x = 123456789
        expected_r_x = 100000000
        assert round_to_k(x, 1) == expected_r_x
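
Taken together, these tests pin down round_to_k as rounding to k significant figures and returning an int when nothing remains after the decimal point. A sketch consistent with the tests, though not necessarily shapash's exact implementation:

# Sketch consistent with the tests above; the library's implementation may differ.
from math import floor, log10

def round_to_k(x, k):
    """Round x (non-zero) to k significant figures; return an int when the result is whole."""
    x = float(x)
    rounded = round(x, -int(floor(log10(abs(x)))) + (k - 1))
    return int(rounded) if rounded == int(rounded) else rounded

assert round_to_k(123456789, 3) == 123000000
assert round_to_k(0.0000123456789, 3) == 0.0000123
assert round_to_k(123.456789, 3) == 123
assert round_to_k(123456789, 1) == 100000000
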
Example #8
    def __init__(self, explainer):
        """
        Init on class instantiation, everything to be able to run the app on server.
        Parameters
        ----------
        explainer : SmartExplainer
            SmartExplainer object
        """
        # APP
        self.server = Flask(__name__)
        self.app = dash.Dash(
            server=self.server,
            external_stylesheets=[dbc.themes.BOOTSTRAP],
        )
        self.app.title = 'Shapash Monitor'
        if explainer.title_story:
            self.app.title += ' - ' + explainer.title_story
        self.explainer = explainer

        # SETTINGS
        self.logo = self.app.get_asset_url('shapash-fond-fonce.png')
        self.color = '#f4c000'
        self.bkg_color = "#343736"
        self.settings_ini = {
            'rows': 1000,
            'points': 1000,
            'violin': 10,
            'features': 20,
        }
        self.settings = self.settings_ini.copy()
        self.predict_col = ['_predict_']
        self.explainer.features_imp = self.explainer.state.compute_features_import(
            self.explainer.contributions)
        if self.explainer._case == 'classification':
            self.label = self.explainer.check_label_name(
                len(self.explainer._classes) - 1, 'num')[1]
            self.selected_feature = self.explainer.features_imp[-1].idxmax()
            self.max_threshold = int(
                max([
                    x.applymap(lambda x: round_to_k(x, k=1)).max().max()
                    for x in self.explainer.contributions
                ]))
        else:
            self.label = None
            self.selected_feature = self.explainer.features_imp.idxmax()
            self.max_threshold = int(
                self.explainer.contributions.applymap(
                    lambda x: round_to_k(x, k=1)).max().max())
        self.list_index = []
        self.subset = None

        # DATA
        self.dataframe = pd.DataFrame()
        self.round_dataframe = pd.DataFrame()
        self.init_data()

        # COMPONENTS
        self.components = {
            'menu': {},
            'table': {},
            'graph': {},
            'filter': {},
            'settings': {}
        }
        self.init_components()

        # LAYOUT
        self.skeleton = {'navbar': {}, 'body': {}}
        self.make_skeleton()
        self.app.layout = html.Div(
            [self.skeleton['navbar'], self.skeleton['body']])

        # CALLBACK
        self.callback_fullscreen_buttons()
        self.init_callback_settings()
        self.callback_generator()
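
For context, this constructor is usually not called directly; in shapash the web app is typically started from a compiled SmartExplainer, roughly as below (import path, method names and arguments are assumptions and may vary by version).

# Usage sketch, assuming a trained model and test data already exist;
# run_app() builds and serves the Dash monitor described above.
from shapash.explainer.smart_explainer import SmartExplainer

xpl = SmartExplainer()
xpl.compile(x=X_test, model=model)  # X_test and model are assumed to exist
app = xpl.run_app()
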