Ejemplo n.º 1
0
 def calculate(self, reference_data: pandas.DataFrame,
               production_data: pandas.DataFrame, _: Dict):
     """Store a fixed "Data Drift Detected" counter widget in ``self.wi``.

     None of the three arguments is read: the counter text is a literal,
     so the resulting widget is the same for any input.
     """
     # Single counter entry shown by the widget.
     drift_counter = {
         "value": "7 out of 12 features",
         "label": "Data Drift Detected",
     }
     widget_params = {"counters": [drift_counter]}

     self.wi = BaseWidgetInfo(
         type="counter",
         title=self.title,
         size=2,
         params=widget_params,
         alerts=[],
         insights=[],
         details="",
         alertsPosition="row",
         alertStats=AlertStats(),
         additionalGraphs=[],
     )
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        """Build the per-class metrics heatmap for the reference data.

        Column roles are taken from ``column_mapping`` when it is truthy;
        otherwise the conventional 'datetime', 'target' and 'prediction'
        column names are looked up in ``reference_data``. When both a target
        and a prediction column are known, ``self.wi`` receives a "big_graph"
        widget holding an annotated heatmap of precision, recall and f1-score
        (plus roc-auc when ``prediction_column`` has more than two entries).
        Otherwise ``self.wi`` is set to None.

        Args:
            reference_data: frame the metrics are computed on. When the
                widget is built it is mutated in place: +/-inf are replaced
                by NaN and every row containing NaN is dropped.
            production_data: only compared against None to pick the widget
                size (1 when present, 2 otherwise).
            column_mapping: optional mapping read through ``.get`` for the
                keys 'datetime', 'id', 'target', 'prediction',
                'numerical_features', 'categorical_features' and
                'target_names'.
        """
        if column_mapping:
            date_column = column_mapping.get('datetime')
            id_column = column_mapping.get('id')
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            num_feature_names = column_mapping.get('numerical_features')
            target_names = column_mapping.get('target_names')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [
                    name for name in num_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

            # NOTE(review): categorical features are filtered with
            # is_numeric_dtype exactly like the numerical ones - confirm
            # that only numerically encoded categoricals are intended.
            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                cat_feature_names = [
                    name for name in cat_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

        else:
            date_column = 'datetime' if 'datetime' in reference_data.columns else None
            id_column = None
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

            utility_columns = [
                date_column, id_column, target_column, prediction_column
            ]

            num_feature_names = list(
                set(reference_data.select_dtypes([np.number]).columns) -
                set(utility_columns))
            # The builtin ``object`` replaces the ``np.object`` alias, which
            # was removed from NumPy and raises AttributeError there.
            cat_feature_names = list(
                set(reference_data.select_dtypes([object]).columns) -
                set(utility_columns))

            target_names = None

        if target_column is not None and prediction_column is not None:
            reference_data.replace([np.inf, -np.inf], np.nan, inplace=True)
            reference_data.dropna(axis=0, how='any', inplace=True)

            binarizer = preprocessing.LabelBinarizer()
            binarizer.fit(reference_data[target_column])
            binarized_target = binarizer.transform(
                reference_data[target_column])

            # NOTE(review): prediction_column is indexed by integer below and
            # used as the heatmap x labels, so it is presumably a list of
            # per-class score column names - confirm against callers.
            array_prediction = reference_data[prediction_column].to_numpy()

            prediction_ids = np.argmax(array_prediction, axis=-1)
            prediction_labels = [prediction_column[x] for x in prediction_ids]

            # Classification report as a frame: one column per report entry.
            metrics_matrix = metrics.classification_report(
                reference_data[target_column],
                prediction_labels,
                output_dict=True)
            metrics_frame = pd.DataFrame(metrics_matrix)

            # Drop the last row and the last three columns; presumably the
            # 'support' row and the aggregate columns of the report - confirm.
            z = metrics_frame.iloc[:-1, :-3].values
            x = prediction_column
            y = ['precision', 'recall', 'f1-score']

            if len(prediction_column) > 2:
                roc_aucs = metrics.roc_auc_score(binarized_target,
                                                 array_prediction,
                                                 average=None)
                z = np.append(z, [roc_aucs], axis=0)
                y.append('roc-auc')

            # Heatmap annotations need every cell as text.
            z_text = [[str(round(value, 3)) for value in row] for row in z]

            # set up figure
            fig = ff.create_annotated_heatmap(z,
                                              y=y,
                                              x=x,
                                              annotation_text=z_text,
                                              colorscale='bluered',
                                              showscale=True)
            fig.update_layout(xaxis_title="Class", yaxis_title="Metric")

            metrics_matrix_json = json.loads(fig.to_json())

            self.wi = BaseWidgetInfo(
                title=self.title,
                type="big_graph",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=1 if production_data is not None else 2,
                params={
                    "data": metrics_matrix_json['data'],
                    "layout": metrics_matrix_json['layout']
                },
                additionalGraphs=[],
            )
        else:
            self.wi = None
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        """Build a bar chart of feature/target correlations for both datasets.

        Column roles are taken from ``column_mapping`` when it is truthy;
        otherwise the conventional 'datetime', 'target' and 'prediction'
        column names are looked up in ``reference_data``. When a target
        column is known, ``self.wi`` receives a "big_graph" widget with one
        bar trace per dataset showing the ``DataFrame.corr`` column for the
        target over the numerical features (the target itself included).
        Otherwise ``self.wi`` is set to None.

        Args:
            reference_data: reference frame; also used to resolve columns.
            production_data: production frame. It is indexed without a None
                check once a target column is known.
            column_mapping: optional mapping read through ``.get`` for the
                keys 'datetime', 'id', 'target', 'prediction',
                'numerical_features' and 'categorical_features'.
        """
        if column_mapping:
            date_column = column_mapping.get('datetime')
            id_column = column_mapping.get('id')
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            num_feature_names = column_mapping.get('numerical_features')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [
                    name for name in num_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

            # NOTE(review): categorical features are filtered with
            # is_numeric_dtype exactly like the numerical ones - confirm.
            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                cat_feature_names = [
                    name for name in cat_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

        else:
            date_column = 'datetime' if 'datetime' in reference_data.columns else None
            id_column = None
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

            utility_columns = [
                date_column, id_column, target_column, prediction_column
            ]

            num_feature_names = list(
                set(reference_data.select_dtypes([np.number]).columns) -
                set(utility_columns))
            # The builtin ``object`` replaces the ``np.object`` alias, which
            # was removed from NumPy and raises AttributeError there.
            cat_feature_names = list(
                set(reference_data.select_dtypes([object]).columns) -
                set(utility_columns))

        if target_column is not None:

            # Correlation of every numerical feature (and the target itself)
            # with the target, separately for each dataset.
            ref_target_corr = reference_data[
                num_feature_names + [target_column]].corr()[target_column]
            prod_target_corr = production_data[
                num_feature_names + [target_column]].corr()[target_column]

            # plot output correlations
            target_corr = go.Figure()

            target_corr.add_trace(
                go.Bar(y=ref_target_corr,
                       x=ref_target_corr.index,
                       marker_color=grey,
                       name='Reference'))

            # The production bars reuse the reference index as x labels; both
            # series are built from the same column list.
            target_corr.add_trace(
                go.Bar(y=prod_target_corr,
                       x=ref_target_corr.index,
                       marker_color=red,
                       name='Production'))

            target_corr.update_layout(xaxis_title="Features",
                                      yaxis_title="Correlation",
                                      yaxis=dict(range=(-1, 1),
                                                 showticklabels=True))

            target_corr_json = json.loads(target_corr.to_json())

            self.wi = BaseWidgetInfo(
                title=self.title,
                type="big_graph",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=1,
                params={
                    "data": target_corr_json['data'],
                    "layout": target_corr_json['layout']
                },
                additionalGraphs=[],
            )
        else:
            self.wi = None
    def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
        """Build the error-bias counters for the reference data.

        Column roles are taken from ``column_mapping`` when it is truthy;
        otherwise the conventional 'datetime', 'target' and 'prediction'
        column names are looked up in ``reference_data``. The error is
        ``prediction - target``; its mean and sample standard deviation
        (``ddof=1``) are reported overall, for the lowest 5% ("Underestimation"),
        the highest 5% ("Overestimation") and the values strictly in between
        ("Expected error"). ``self.wi`` is set to None when the target or the
        prediction column is unknown.

        Args:
            reference_data: frame the errors are computed on. Always mutated
                in place: +/-inf are replaced by NaN and every row containing
                NaN is dropped, even when no widget is produced.
            production_data: not read by this method.
            column_mapping: optional mapping read through ``.get`` for the
                keys 'datetime', 'id', 'target', 'prediction',
                'numerical_features' and 'categorical_features'.
        """
        if column_mapping:
            date_column = column_mapping.get('datetime')
            id_column = column_mapping.get('id')
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            num_feature_names = column_mapping.get('numerical_features')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [name for name in num_feature_names if is_numeric_dtype(reference_data[name])]

            # NOTE(review): categorical features are filtered with
            # is_numeric_dtype exactly like the numerical ones - confirm.
            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                cat_feature_names = [name for name in cat_feature_names if is_numeric_dtype(reference_data[name])]

        else:
            date_column = 'datetime' if 'datetime' in reference_data.columns else None
            id_column = None
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

            utility_columns = [date_column, id_column, target_column, prediction_column]

            num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns))
            # The builtin ``object`` replaces the ``np.object`` alias, which
            # was removed from NumPy and raises AttributeError there.
            cat_feature_names = list(set(reference_data.select_dtypes([object]).columns) - set(utility_columns))

        reference_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        reference_data.dropna(axis=0, how='any', inplace=True)

        if target_column is not None and prediction_column is not None:
            error = reference_data[prediction_column] - reference_data[target_column]

            quantile_5 = np.quantile(error, .05)
            quantile_95 = np.quantile(error, .95)

            # Slice each error group once; both statistics reuse the slices.
            error_under = error[error <= quantile_5]
            error_expected = error[(error > quantile_5) & (error < quantile_95)]
            error_over = error[error >= quantile_95]

            # These are signed means of the error (no absolute value taken).
            mean_error = np.mean(error)
            mean_under = np.mean(error_under)
            mean_expected = np.mean(error_expected)
            mean_over = np.mean(error_over)

            sd = np.std(error, ddof=1)
            sd_under = np.std(error_under, ddof=1)
            sd_expected = np.std(error_expected, ddof=1)
            sd_over = np.std(error_over, ddof=1)

            # The title is a fixed literal here; self.title is not used.
            self.wi = BaseWidgetInfo(
                title="Reference Data: Error Bias",
                type="counter",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=2,
                params={
                    "counters": [
                      {
                        "value": str(round(mean_error, 2)) + " (" + str(round(sd, 2)) + ")",
                        "label": "Overall"
                      },
                      {
                        "value": str(round(mean_expected, 2)) + " (" + str(round(sd_expected, 2)) + ")",
                        "label": "Expected error"
                      },
                      {
                        "value": str(round(mean_under, 2)) + " (" + str(round(sd_under, 2)) + ")",
                        "label": "Underestimation"
                      },
                      {
                        "value": str(round(mean_over, 2)) + " (" + str(round(sd_over, 2)) + ")",
                        "label": "Overestimation"
                      }
                    ]
                },
                additionalGraphs=[]
            )
        else:
            self.wi = None
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        """Build a scatter plot of prediction values for both datasets.

        Column roles are taken from ``column_mapping`` when it is truthy;
        otherwise the conventional 'datetime', 'target' and 'prediction'
        column names are looked up in ``reference_data``. When a prediction
        column is known, ``self.wi`` receives a "big_graph" widget plotting
        reference and production predictions against the date column (or the
        frame index when there is none), with a horizontal line at the
        reference mean and a band of one sample standard deviation around it.
        Otherwise ``self.wi`` is set to None.

        Args:
            reference_data: reference frame; also used to resolve columns.
            production_data: production frame. It is indexed without a None
                check once a prediction column is known.
            column_mapping: optional mapping read through ``.get`` for the
                keys 'datetime', 'id', 'target', 'prediction',
                'numerical_features' and 'categorical_features'.
        """
        if column_mapping:
            date_column = column_mapping.get('datetime')
            id_column = column_mapping.get('id')
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            num_feature_names = column_mapping.get('numerical_features')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [
                    name for name in num_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

            # NOTE(review): categorical features are filtered with
            # is_numeric_dtype exactly like the numerical ones - confirm.
            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                cat_feature_names = [
                    name for name in cat_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

        else:
            date_column = 'datetime' if 'datetime' in reference_data.columns else None
            id_column = None
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

            utility_columns = [
                date_column, id_column, target_column, prediction_column
            ]

            num_feature_names = list(
                set(reference_data.select_dtypes([np.number]).columns) -
                set(utility_columns))
            # The builtin ``object`` replaces the ``np.object`` alias, which
            # was removed from NumPy and raises AttributeError there.
            cat_feature_names = list(
                set(reference_data.select_dtypes([object]).columns) -
                set(utility_columns))

        if prediction_column is not None:
            # Reference statistics drive the mean line and the +/-1 std band.
            reference_mean = np.mean(reference_data[prediction_column])
            reference_std = np.std(reference_data[prediction_column], ddof=1)
            x_title = "Timestamp" if date_column else "Index"

            pred_values = go.Figure()

            pred_values.add_trace(
                go.Scatter(x=reference_data[date_column]
                           if date_column else reference_data.index,
                           y=reference_data[prediction_column],
                           mode='markers',
                           name='Reference',
                           marker=dict(size=6, color=grey)))

            pred_values.add_trace(
                go.Scatter(x=production_data[date_column]
                           if date_column else production_data.index,
                           y=production_data[prediction_column],
                           mode='markers',
                           name='Current',
                           marker=dict(size=6, color=red)))

            pred_values.update_layout(
                xaxis_title=x_title,
                yaxis_title='Prediction Value',
                showlegend=True,
                legend=dict(orientation="h",
                            yanchor="bottom",
                            y=1.02,
                            xanchor="right",
                            x=1),
                shapes=[
                    dict(
                        type="rect",
                        # x is given in paper coordinates, so 0..1 spans the
                        # whole plot width regardless of the x values
                        xref="paper",
                        # y is given in data coordinates of the y axis
                        yref="y",
                        x0=0,
                        y0=reference_mean - reference_std,
                        x1=1,
                        y1=reference_mean + reference_std,
                        fillcolor="LightGreen",
                        opacity=0.5,
                        layer="below",
                        line_width=0,
                    ),
                    dict(
                        type="line",
                        name='Reference',
                        xref="paper",
                        yref="y",
                        x0=0,
                        y0=reference_mean,
                        x1=1,
                        y1=reference_mean,
                        line=dict(color="Green", width=3)),
                ])

            pred_values_json = json.loads(pred_values.to_json())

            self.wi = BaseWidgetInfo(
                title=self.title,
                type="big_graph",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=1,
                params={
                    "data": pred_values_json['data'],
                    "layout": pred_values_json['layout']
                },
                additionalGraphs=[],
            )
        else:
            self.wi = None
    def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
        """Build the predicted-vs-actual scatter plot for the production data.

        Column roles are taken from ``column_mapping`` when it is truthy;
        otherwise the conventional 'datetime', 'target' and 'prediction'
        column names are looked up in ``reference_data``. Each production row
        is labelled by its error (``prediction - target``): at or below the
        5% quantile is 'Underestimation', strictly below the 95% quantile is
        'Majority', anything else is 'Overestimation'; one scatter trace is
        drawn per label. ``self.wi`` is set to None when ``production_data``
        is None or when the target or prediction column is unknown.

        Args:
            reference_data: only used to resolve column roles.
            production_data: frame that is plotted. When the widget is built
                it is mutated in place: +/-inf are replaced by NaN, rows
                containing NaN are dropped, and the columns 'dataset' and
                'Error bias' are added.
            column_mapping: optional mapping read through ``.get`` for the
                keys 'datetime', 'id', 'target', 'prediction',
                'numerical_features' and 'categorical_features'.
        """
        if column_mapping:
            date_column = column_mapping.get('datetime')
            id_column = column_mapping.get('id')
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            num_feature_names = column_mapping.get('numerical_features')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [name for name in num_feature_names if is_numeric_dtype(reference_data[name])]

            # NOTE(review): categorical features are filtered with
            # is_numeric_dtype exactly like the numerical ones - confirm.
            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                cat_feature_names = [name for name in cat_feature_names if is_numeric_dtype(reference_data[name])]

        else:
            date_column = 'datetime' if 'datetime' in reference_data.columns else None
            id_column = None
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

            utility_columns = [date_column, id_column, target_column, prediction_column]

            num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns))
            # The builtin ``object`` replaces the ``np.object`` alias, which
            # was removed from NumPy and raises AttributeError there.
            cat_feature_names = list(set(reference_data.select_dtypes([object]).columns) - set(utility_columns))

        if production_data is not None:
            if target_column is not None and prediction_column is not None:
                production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
                production_data.dropna(axis=0, how='any', inplace=True)

                prod_error = production_data[prediction_column] - production_data[target_column]

                prod_quantile_5 = np.quantile(prod_error, .05)
                prod_quantile_95 = np.quantile(prod_error, .95)

                # NOTE(review): production rows are tagged 'Reference' here;
                # this looks like a copy-paste slip - confirm the intended label.
                production_data['dataset'] = 'Reference'
                production_data['Error bias'] = [
                    'Underestimation' if err <= prod_quantile_5
                    else 'Majority' if err < prod_quantile_95
                    else 'Overestimation'
                    for err in prod_error
                ]

                # One scatter trace per error-bias group.
                pred_actual = go.Figure()

                pred_actual.add_trace(go.Scatter(
                    x=production_data[production_data['Error bias'] == 'Underestimation'][target_column],
                    y=production_data[production_data['Error bias'] == 'Underestimation'][prediction_column],
                    mode='markers',
                    name='Underestimation',
                    marker=dict(
                        color='#6574f7',
                        showscale=False
                    )
                ))

                pred_actual.add_trace(go.Scatter(
                    x=production_data[production_data['Error bias'] == 'Overestimation'][target_column],
                    y=production_data[production_data['Error bias'] == 'Overestimation'][prediction_column],
                    mode='markers',
                    name='Overestimation',
                    marker=dict(
                        color='#ee5540',
                        showscale=False
                    )
                ))

                pred_actual.add_trace(go.Scatter(
                    x=production_data[production_data['Error bias'] == 'Majority'][target_column],
                    y=production_data[production_data['Error bias'] == 'Majority'][prediction_column],
                    mode='markers',
                    name='Majority',
                    marker=dict(
                        color='#1acc98',
                        showscale=False
                    )
                ))

                pred_actual.update_layout(
                    xaxis_title="Actual value",
                    yaxis_title="Predicted value",
                    xaxis=dict(
                        showticklabels=True
                    ),
                    yaxis=dict(
                        showticklabels=True
                    ),
                )

                pred_actual_json = json.loads(pred_actual.to_json())

                self.wi = BaseWidgetInfo(
                    title=self.title,
                    type="big_graph",
                    details="",
                    alertStats=AlertStats(),
                    alerts=[],
                    alertsPosition="row",
                    insights=[],
                    size=1,
                    params={
                        "data": pred_actual_json['data'],
                        "layout": pred_actual_json['layout']
                    },
                    additionalGraphs=[],
                )
            else:
                self.wi = None
        else:
            self.wi = None
Ejemplo n.º 7
0
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        """Build the error-distribution histogram for the production data.

        Column roles are taken from ``column_mapping`` when it is truthy;
        otherwise the conventional 'datetime', 'target' and 'prediction'
        column names are looked up in ``reference_data``. The histogram shows
        ``prediction - target`` normalised to percent. ``self.wi`` is set to
        None when ``production_data`` is None or when the target or
        prediction column is unknown.

        Args:
            reference_data: only used to resolve column roles.
            production_data: frame that is plotted. When the widget is built
                it is mutated in place: +/-inf are replaced by NaN and every
                row containing NaN is dropped.
            column_mapping: optional mapping read through ``.get`` for the
                keys 'datetime', 'id', 'target', 'prediction',
                'numerical_features' and 'categorical_features'.
        """
        if column_mapping:
            date_column = column_mapping.get('datetime')
            id_column = column_mapping.get('id')
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            num_feature_names = column_mapping.get('numerical_features')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [
                    name for name in num_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

            # NOTE(review): categorical features are filtered with
            # is_numeric_dtype exactly like the numerical ones - confirm.
            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                cat_feature_names = [
                    name for name in cat_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

        else:
            date_column = 'datetime' if 'datetime' in reference_data.columns else None
            id_column = None
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

            utility_columns = [
                date_column, id_column, target_column, prediction_column
            ]

            num_feature_names = list(
                set(reference_data.select_dtypes([np.number]).columns) -
                set(utility_columns))
            # The builtin ``object`` replaces the ``np.object`` alias, which
            # was removed from NumPy and raises AttributeError there.
            cat_feature_names = list(
                set(reference_data.select_dtypes([object]).columns) -
                set(utility_columns))

        if production_data is not None:
            if target_column is not None and prediction_column is not None:
                production_data.replace([np.inf, -np.inf],
                                        np.nan,
                                        inplace=True)
                production_data.dropna(axis=0, how='any', inplace=True)

                # Histogram of the signed prediction error.
                error_distr = go.Figure()

                error = production_data[prediction_column] - production_data[
                    target_column]

                error_distr.add_trace(
                    go.Histogram(x=error,
                                 marker_color=red,
                                 name='error distribution',
                                 histnorm='percent'))

                error_distr.update_layout(
                    xaxis_title="Error (Predicted - Actual)",
                    yaxis_title="Percentage",
                )

                error_distr_json = json.loads(error_distr.to_json())

                self.wi = BaseWidgetInfo(
                    title=self.title,
                    type="big_graph",
                    details="",
                    alertStats=AlertStats(),
                    alerts=[],
                    alertsPosition="row",
                    insights=[],
                    size=1,
                    params={
                        "data": error_distr_json['data'],
                        "layout": error_distr_json['layout']
                    },
                    additionalGraphs=[],
                )
            else:
                self.wi = None
        else:
            self.wi = None
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        """Build the target-drift distribution plot for both datasets.

        Column roles are taken from ``column_mapping`` when it is truthy;
        otherwise the conventional 'datetime', 'target' and 'prediction'
        column names are looked up in ``reference_data``. When a target
        column is known, the second element of the ``ks_2samp`` result for
        the two target columns is used as a p-value, and drift is reported as
        "detected" when it is below 0.05. ``self.wi`` then receives a
        "big_graph" widget with the two target distributions; its title is
        built from the test outcome and does not use ``self.title``.
        Otherwise ``self.wi`` is set to None.

        Args:
            reference_data: reference frame; also used to resolve columns.
            production_data: production frame. It is indexed without a None
                check once a target column is known.
            column_mapping: optional mapping read through ``.get`` for the
                keys 'datetime', 'id', 'target', 'prediction',
                'numerical_features' and 'categorical_features'.
        """
        if column_mapping:
            date_column = column_mapping.get('datetime')
            id_column = column_mapping.get('id')
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            num_feature_names = column_mapping.get('numerical_features')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [
                    name for name in num_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

            # NOTE(review): categorical features are filtered with
            # is_numeric_dtype exactly like the numerical ones - confirm.
            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                cat_feature_names = [
                    name for name in cat_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

        else:
            date_column = 'datetime' if 'datetime' in reference_data.columns else None
            id_column = None
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

            utility_columns = [
                date_column, id_column, target_column, prediction_column
            ]

            num_feature_names = list(
                set(reference_data.select_dtypes([np.number]).columns) -
                set(utility_columns))
            # The builtin ``object`` replaces the ``np.object`` alias, which
            # was removed from NumPy and raises AttributeError there.
            cat_feature_names = list(
                set(reference_data.select_dtypes([object]).columns) -
                set(utility_columns))

        if target_column is not None:
            # Two-sample test on the target; index 1 of the result is the
            # p-value compared against the fixed 0.05 threshold.
            target_p_value = ks_2samp(reference_data[target_column],
                                      production_data[target_column])[1]
            target_sim_test = "detected" if target_p_value < 0.05 else "not detected"

            # plot output distributions
            target_distr = ff.create_distplot([
                reference_data[target_column], production_data[target_column]
            ], ["Reference", "Current"],
                                              colors=[grey, red],
                                              show_rug=True)

            target_distr.update_layout(xaxis_title="Value",
                                       yaxis_title="Share",
                                       legend=dict(orientation="h",
                                                   yanchor="bottom",
                                                   y=1.02,
                                                   xanchor="right",
                                                   x=1))

            target_drift_json = json.loads(target_distr.to_json())

            self.wi = BaseWidgetInfo(
                title="Target Drift: " + target_sim_test + ", p_value=" +
                str(round(target_p_value, 6)),
                type="big_graph",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=2,
                params={
                    "data": target_drift_json['data'],
                    "layout": target_drift_json['layout']
                },
                additionalGraphs=[],
            )
        else:
            self.wi = None
Ejemplo n.º 9
0
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        if column_mapping:
            date_column = column_mapping.get('datetime')
            id_column = column_mapping.get('id')
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            num_feature_names = column_mapping.get('numerical_features')
            target_names = column_mapping.get('target_names')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [
                    name for name in num_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                cat_feature_names = [
                    name for name in cat_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

        else:
            date_column = 'datetime' if 'datetime' in reference_data.columns else None
            id_column = None
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

            utility_columns = [
                date_column, id_column, target_column, prediction_column
            ]

            target_names = None

            num_feature_names = list(
                set(reference_data.select_dtypes([np.number]).columns) -
                set(utility_columns))
            cat_feature_names = list(
                set(reference_data.select_dtypes([np.object]).columns) -
                set(utility_columns))

        if prediction_column is not None and target_column is not None:
            binaraizer = preprocessing.LabelBinarizer()
            binaraizer.fit(reference_data[target_column])
            binaraized_target = binaraizer.transform(
                reference_data[target_column])
            if production_data is not None:
                ref_array_prediction = reference_data[
                    prediction_column].to_numpy()
                ref_prediction_ids = np.argmax(ref_array_prediction, axis=-1)
                ref_prediction_labels = [
                    prediction_column[x] for x in ref_prediction_ids
                ]
                reference_data['prediction_labels'] = ref_prediction_labels

                prod_array_prediction = production_data[
                    prediction_column].to_numpy()
                prod_prediction_ids = np.argmax(prod_array_prediction, axis=-1)
                prod_prediction_labels = [
                    prediction_column[x] for x in prod_prediction_ids
                ]
                production_data['prediction_labels'] = prod_prediction_labels

                additional_graphs_data = []
                params_data = []

                for feature_name in num_feature_names + cat_feature_names:
                    #add data for table in params
                    labels = prediction_column

                    params_data.append({
                        "details": {
                            "parts": [{
                                "title": "All",
                                "id": "All"
                            }] + [{
                                "title": str(label),
                                "id": feature_name + "_" + str(label)
                            } for label in labels],
                            "insights": []
                        },
                        "f1": feature_name
                    })

                    #create confusion based plots
                    reference_data['dataset'] = 'Reference'
                    production_data['dataset'] = 'Production'
                    merged_data = pd.concat([reference_data, production_data])

                    fig = px.histogram(merged_data,
                                       x=feature_name,
                                       color=target_column,
                                       facet_col="dataset",
                                       histnorm='',
                                       category_orders={
                                           "dataset":
                                           ["Reference", "Production"]
                                       })

                    fig_json = json.loads(fig.to_json())

                    #write plot data in table as additional data
                    additional_graphs_data.append(
                        AdditionalGraphInfo(
                            "All",
                            {
                                "data": fig_json['data'],
                                "layout": fig_json['layout']
                            },
                        ))

                    for label in labels:
                        merged_data['Confusion'] = merged_data.apply(lambda x : 'TP' if (x['target'] == label and x['prediction_labels'] == label)
                                                 else ('FP' if(x['target'] != label and x['prediction_labels'] == label) else \
                                                       ('FN' if (x['target'] == label and x['prediction_labels'] != label) else 'TN')), axis = 1)

                        fig = px.histogram(merged_data,
                                           x=feature_name,
                                           color='Confusion',
                                           facet_col="dataset",
                                           histnorm='',
                                           category_orders={
                                               "dataset":
                                               ["Reference", "Production"],
                                               "Confusion":
                                               ["TP", "TN", "FP", "FN"]
                                           })

                        fig_json = json.loads(fig.to_json())

                        #write plot data in table as additional data
                        additional_graphs_data.append(
                            AdditionalGraphInfo(
                                feature_name + "_" + str(label),
                                {
                                    "data": fig_json['data'],
                                    "layout": fig_json['layout']
                                },
                            ))

                self.wi = BaseWidgetInfo(
                    title=self.title,
                    type="big_table",
                    details="",
                    alertStats=AlertStats(),
                    alerts=[],
                    alertsPosition="row",
                    insights=[],
                    size=2,
                    params={
                        "rowsPerPage":
                        min(
                            len(num_feature_names) + len(cat_feature_names),
                            10),
                        "columns": [{
                            "title": "Feature",
                            "field": "f1"
                        }],
                        "data":
                        params_data
                    },
                    additionalGraphs=additional_graphs_data)

            else:
                ref_array_prediction = reference_data[
                    prediction_column].to_numpy()
                ref_prediction_ids = np.argmax(ref_array_prediction, axis=-1)
                ref_prediction_labels = [
                    prediction_column[x] for x in ref_prediction_ids
                ]
                reference_data['prediction_labels'] = ref_prediction_labels

                additional_graphs_data = []
                params_data = []

                for feature_name in num_feature_names + cat_feature_names:
                    #add data for table in params
                    labels = prediction_column

                    params_data.append({
                        "details": {
                            "parts": [{
                                "title": "All",
                                "id": "All"
                            }] + [{
                                "title": str(label),
                                "id": feature_name + "_" + str(label)
                            } for label in labels],
                            "insights": []
                        },
                        "f1": feature_name
                    })

                    #create confusion based plots
                    fig = px.histogram(reference_data,
                                       x=feature_name,
                                       color=target_column,
                                       histnorm='')

                    fig_json = json.loads(fig.to_json())

                    #write plot data in table as additional data
                    additional_graphs_data.append(
                        AdditionalGraphInfo(
                            "All",
                            {
                                "data": fig_json['data'],
                                "layout": fig_json['layout']
                            },
                        ))

                    for label in labels:
                        reference_data['Confusion'] = reference_data.apply(lambda x : 'TP' if (x['target'] == label and x['prediction_labels'] == label)
                                                 else ('FP' if(x['target'] != label and x['prediction_labels'] == label) else \
                                                       ('FN' if (x['target'] == label and x['prediction_labels'] != label) else 'TN')), axis = 1)

                        fig = px.histogram(reference_data,
                                           x=feature_name,
                                           color='Confusion',
                                           histnorm='',
                                           category_orders={
                                               "Confusion":
                                               ["TP", "TN", "FP", "FN"]
                                           })

                        fig_json = json.loads(fig.to_json())

                        #write plot data in table as additional data
                        additional_graphs_data.append(
                            AdditionalGraphInfo(
                                feature_name + "_" + str(label),
                                {
                                    "data": fig_json['data'],
                                    "layout": fig_json['layout']
                                },
                            ))

                self.wi = BaseWidgetInfo(
                    title=self.title,
                    type="big_table",
                    details="",
                    alertStats=AlertStats(),
                    alerts=[],
                    alertsPosition="row",
                    insights=[],
                    size=2,
                    params={
                        "rowsPerPage":
                        min(
                            len(num_feature_names) + len(cat_feature_names),
                            10),
                        "columns": [{
                            "title": "Feature",
                            "field": "f1"
                        }],
                        "data":
                        params_data
                    },
                    additionalGraphs=additional_graphs_data)
        else:
            self.wi = None
    def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
        """Build a "big_table" widget of per-feature histograms colored by target/prediction.

        For every selected feature one histogram is produced per available
        label column: "Target" (if a target column is known) and "Prediction"
        (if a prediction column is known). When production data is supplied,
        each plot is faceted into "Reference" and "Production" panels; when it
        is ``None`` only the reference data is plotted.

        Args:
            reference_data: Reference dataset. Also used to auto-detect the
                utility columns and features when ``column_mapping`` is falsy.
            production_data: Production dataset, or ``None``.
            column_mapping: Optional mapping read with ``.get`` for the keys
                ``'target'``, ``'prediction'``, ``'numerical_features'`` and
                ``'categorical_features'``. When falsy, columns literally named
                ``'datetime'``, ``'target'`` and ``'prediction'`` are treated
                as utility columns and features are inferred from dtypes.

        Side effects:
            Sets ``self.wi`` to a ``BaseWidgetInfo`` or to ``None`` when
            neither a target nor a prediction column is available. The input
            frames are not modified.
        """
        if column_mapping:
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')

            num_feature_names = column_mapping.get('numerical_features')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [
                    name for name in num_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

            # NOTE(review): categorical features are also filtered with
            # is_numeric_dtype, so non-numeric categoricals are dropped here.
            # Kept as-is because it may be deliberate -- confirm.
            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                cat_feature_names = [
                    name for name in cat_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]
        else:
            columns = reference_data.columns
            date_column = 'datetime' if 'datetime' in columns else None
            target_column = 'target' if 'target' in columns else None
            prediction_column = 'prediction' if 'prediction' in columns else None

            utility_columns = {date_column, target_column, prediction_column}

            # Keep the frame's own column order so the table rows are
            # deterministic (a set difference yields an arbitrary order).
            num_feature_names = [
                name for name in reference_data.select_dtypes([np.number]).columns
                if name not in utility_columns
            ]
            # Builtin `object` replaces the `np.object` alias, which no longer
            # exists in NumPy >= 1.24.
            cat_feature_names = [
                name for name in reference_data.select_dtypes([object]).columns
                if name not in utility_columns
            ]

        # (part title, graph-id suffix, column used to color the histogram),
        # listed in the order the parts appear in the widget.
        parts = []
        if target_column is not None:
            parts.append(("Target", "_target_values", target_column))
        if prediction_column is not None:
            parts.append(("Prediction", "_prediction_values", prediction_column))

        if not parts:
            self.wi = None
            return

        # The plotted frame does not depend on the feature, so build it once.
        # `assign` works on copies and leaves the caller's frames untouched.
        if production_data is None:
            plot_data = reference_data
            facet_kwargs = {}
        else:
            plot_data = pd.concat([
                reference_data.assign(dataset='Reference'),
                production_data.assign(dataset='Production'),
            ])
            facet_kwargs = {
                "facet_col": "dataset",
                "category_orders": {"dataset": ["Reference", "Production"]},
            }

        feature_names = num_feature_names + cat_feature_names
        params_data = []
        additional_graphs_data = []

        for feature_name in feature_names:
            # Table row: one expandable part per available label column.
            params_data.append({
                "details": {
                    "parts": [
                        {"title": title, "id": feature_name + suffix}
                        for title, suffix, _ in parts
                    ],
                    "insights": []
                },
                "f1": feature_name
            })

            # Plot payloads, keyed by the same ids the table row refers to.
            for _, suffix, color_column in parts:
                fig = px.histogram(plot_data, x=feature_name, color=color_column,
                                   **facet_kwargs)
                fig_json = json.loads(fig.to_json())
                additional_graphs_data.append(
                    AdditionalGraphInfo(
                        feature_name + suffix,
                        {
                            "data": fig_json['data'],
                            "layout": fig_json['layout']
                        },
                    )
                )

        self.wi = BaseWidgetInfo(
            title=self.title,
            type="big_table",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=2,
            params={
                "rowsPerPage": min(len(feature_names), 10),
                "columns": [
                    {
                        "title": "Feature",
                        "field": "f1"
                    }
                ],
                "data": params_data
            },
            additionalGraphs=additional_graphs_data
        )
Ejemplo n.º 11
0
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        """Build a "big_graph" widget with a bar chart of per-class support in production.

        The support values are taken from the last row of
        ``metrics.classification_report(..., output_dict=True)`` computed on
        the production target and prediction columns, excluding the report's
        last three (summary) columns.

        Args:
            reference_data: Reference dataset; only inspected to auto-detect
                the ``'target'`` / ``'prediction'`` columns when
                ``column_mapping`` is falsy.
            production_data: Production dataset, or ``None``.
            column_mapping: Optional mapping read with ``.get`` for the keys
                ``'target'``, ``'prediction'`` and ``'target_names'``.

        Side effects:
            Sets ``self.wi`` to a ``BaseWidgetInfo`` or to ``None`` when
            production data, the target column or the prediction column is
            missing. ``production_data`` itself is not modified.
        """
        # Only the target, prediction and optional display names are needed
        # here. The feature lists the original computed were never used, and
        # deriving them went through the `np.object` alias that no longer
        # exists in NumPy >= 1.24.
        if column_mapping:
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            target_names = column_mapping.get('target_names')
        else:
            columns = reference_data.columns
            target_column = 'target' if 'target' in columns else None
            prediction_column = 'prediction' if 'prediction' in columns else None
            target_names = None

        if production_data is None or target_column is None or prediction_column is None:
            self.wi = None
            return

        # Drop rows containing inf/NaN on a copy so the caller's frame keeps
        # all of its rows.
        cleaned_data = production_data.replace(
            [np.inf, -np.inf], np.nan).dropna(axis=0, how='any')

        # plot support bar
        metrics_matrix = metrics.classification_report(
            cleaned_data[target_column],
            cleaned_data[prediction_column],
            output_dict=True)
        metrics_frame = pd.DataFrame(metrics_matrix)
        # Last row is "support"; the trailing three columns are summary
        # entries of the report rather than classes.
        support = metrics_frame.iloc[-1:, :-3].values[0]
        class_labels = metrics_frame.columns.tolist()[:-3]

        fig = go.Figure()

        fig.add_trace(
            go.Bar(x=target_names if target_names else class_labels,
                   y=support,
                   marker_color=red,
                   name='Support'))

        fig.update_layout(
            xaxis_title="Class",
            yaxis_title="Number of Objects",
        )

        support_bar_json = json.loads(fig.to_json())

        self.wi = BaseWidgetInfo(
            title=self.title,
            type="big_graph",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=1,
            params={
                "data": support_bar_json['data'],
                "layout": support_bar_json['layout']
            },
            additionalGraphs=[],
        )
Ejemplo n.º 12
0
 def calculate(self, reference_data: pandas.DataFrame,
               production_data: pandas.DataFrame, _: Dict):
     self.wi = BaseWidgetInfo(
         type="big_graph",
         title=self.title,
         size=2,
         params={
             "data": [{
                 "marker": {
                     "color": "#ed0400"
                 },
                 "type": "bar",
                 "x": reference_data[0].tolist(),
                 "y": reference_data[1].tolist()
             }],
             "layout": {
                 "template": {
                     "data": {
                         "bar": [{
                             "error_x": {
                                 "color": "#2a3f5f"
                             },
                             "error_y": {
                                 "color": "#2a3f5f"
                             },
                             "marker": {
                                 "line": {
                                     "color": "#E5ECF6",
                                     "width": 0.5
                                 }
                             },
                             "type": "bar"
                         }],
                         "barpolar": [{
                             "marker": {
                                 "line": {
                                     "color": "#E5ECF6",
                                     "width": 0.5
                                 }
                             },
                             "type": "barpolar"
                         }],
                         "carpet": [{
                             "aaxis": {
                                 "endlinecolor": "#2a3f5f",
                                 "gridcolor": "white",
                                 "linecolor": "white",
                                 "minorgridcolor": "white",
                                 "startlinecolor": "#2a3f5f"
                             },
                             "baxis": {
                                 "endlinecolor": "#2a3f5f",
                                 "gridcolor": "white",
                                 "linecolor": "white",
                                 "minorgridcolor": "white",
                                 "startlinecolor": "#2a3f5f"
                             },
                             "type": "carpet"
                         }],
                         "choropleth": [{
                             "colorbar": {
                                 "outlinewidth": 0,
                                 "ticks": ""
                             },
                             "type": "choropleth"
                         }],
                         "contour": [{
                             "colorbar": {
                                 "outlinewidth": 0,
                                 "ticks": ""
                             },
                             "colorscale": [[0.0, "#0d0887"],
                                            [0.1111111111111111, "#46039f"],
                                            [0.2222222222222222, "#7201a8"],
                                            [0.3333333333333333, "#9c179e"],
                                            [0.4444444444444444, "#bd3786"],
                                            [0.5555555555555556, "#d8576b"],
                                            [0.6666666666666666, "#ed7953"],
                                            [0.7777777777777778, "#fb9f3a"],
                                            [0.8888888888888888, "#fdca26"],
                                            [1.0, "#f0f921"]],
                             "type":
                             "contour"
                         }],
                         "contourcarpet": [{
                             "colorbar": {
                                 "outlinewidth": 0,
                                 "ticks": ""
                             },
                             "type": "contourcarpet"
                         }],
                         "heatmap": [{
                             "colorbar": {
                                 "outlinewidth": 0,
                                 "ticks": ""
                             },
                             "colorscale": [[0.0, "#0d0887"],
                                            [0.1111111111111111, "#46039f"],
                                            [0.2222222222222222, "#7201a8"],
                                            [0.3333333333333333, "#9c179e"],
                                            [0.4444444444444444, "#bd3786"],
                                            [0.5555555555555556, "#d8576b"],
                                            [0.6666666666666666, "#ed7953"],
                                            [0.7777777777777778, "#fb9f3a"],
                                            [0.8888888888888888, "#fdca26"],
                                            [1.0, "#f0f921"]],
                             "type":
                             "heatmap"
                         }],
                         "heatmapgl": [{
                             "colorbar": {
                                 "outlinewidth": 0,
                                 "ticks": ""
                             },
                             "colorscale": [[0.0, "#0d0887"],
                                            [0.1111111111111111, "#46039f"],
                                            [0.2222222222222222, "#7201a8"],
                                            [0.3333333333333333, "#9c179e"],
                                            [0.4444444444444444, "#bd3786"],
                                            [0.5555555555555556, "#d8576b"],
                                            [0.6666666666666666, "#ed7953"],
                                            [0.7777777777777778, "#fb9f3a"],
                                            [0.8888888888888888, "#fdca26"],
                                            [1.0, "#f0f921"]],
                             "type":
                             "heatmapgl"
                         }],
                         "histogram": [{
                             "marker": {
                                 "colorbar": {
                                     "outlinewidth": 0,
                                     "ticks": ""
                                 }
                             },
                             "type": "histogram"
                         }],
                         "histogram2d": [{
                             "colorbar": {
                                 "outlinewidth": 0,
                                 "ticks": ""
                             },
                             "colorscale": [[0.0, "#0d0887"],
                                            [0.1111111111111111, "#46039f"],
                                            [0.2222222222222222, "#7201a8"],
                                            [0.3333333333333333, "#9c179e"],
                                            [0.4444444444444444, "#bd3786"],
                                            [0.5555555555555556, "#d8576b"],
                                            [0.6666666666666666, "#ed7953"],
                                            [0.7777777777777778, "#fb9f3a"],
                                            [0.8888888888888888, "#fdca26"],
                                            [1.0, "#f0f921"]],
                             "type":
                             "histogram2d"
                         }],
                         "histogram2dcontour": [{
                             "colorbar": {
                                 "outlinewidth": 0,
                                 "ticks": ""
                             },
                             "colorscale": [[0.0, "#0d0887"],
                                            [0.1111111111111111, "#46039f"],
                                            [0.2222222222222222, "#7201a8"],
                                            [0.3333333333333333, "#9c179e"],
                                            [0.4444444444444444, "#bd3786"],
                                            [0.5555555555555556, "#d8576b"],
                                            [0.6666666666666666, "#ed7953"],
                                            [0.7777777777777778, "#fb9f3a"],
                                            [0.8888888888888888, "#fdca26"],
                                            [1.0, "#f0f921"]],
                             "type":
                             "histogram2dcontour"
                         }],
                         "mesh3d": [{
                             "colorbar": {
                                 "outlinewidth": 0,
                                 "ticks": ""
                             },
                             "type": "mesh3d"
                         }],
                         "parcoords": [{
                             "line": {
                                 "colorbar": {
                                     "outlinewidth": 0,
                                     "ticks": ""
                                 }
                             },
                             "type": "parcoords"
                         }],
                         "pie": [{
                             "automargin": True,
                             "type": "pie"
                         }],
                         "scatter": [{
                             "marker": {
                                 "colorbar": {
                                     "outlinewidth": 0,
                                     "ticks": ""
                                 }
                             },
                             "type": "scatter"
                         }],
                         "scatter3d": [{
                             "line": {
                                 "colorbar": {
                                     "outlinewidth": 0,
                                     "ticks": ""
                                 }
                             },
                             "marker": {
                                 "colorbar": {
                                     "outlinewidth": 0,
                                     "ticks": ""
                                 }
                             },
                             "type": "scatter3d"
                         }],
                         "scattercarpet": [{
                             "marker": {
                                 "colorbar": {
                                     "outlinewidth": 0,
                                     "ticks": ""
                                 }
                             },
                             "type": "scattercarpet"
                         }],
                         "scattergeo": [{
                             "marker": {
                                 "colorbar": {
                                     "outlinewidth": 0,
                                     "ticks": ""
                                 }
                             },
                             "type": "scattergeo"
                         }],
                         "scattergl": [{
                             "marker": {
                                 "colorbar": {
                                     "outlinewidth": 0,
                                     "ticks": ""
                                 }
                             },
                             "type": "scattergl"
                         }],
                         "scattermapbox": [{
                             "marker": {
                                 "colorbar": {
                                     "outlinewidth": 0,
                                     "ticks": ""
                                 }
                             },
                             "type": "scattermapbox"
                         }],
                         "scatterpolar": [{
                             "marker": {
                                 "colorbar": {
                                     "outlinewidth": 0,
                                     "ticks": ""
                                 }
                             },
                             "type": "scatterpolar"
                         }],
                         "scatterpolargl": [{
                             "marker": {
                                 "colorbar": {
                                     "outlinewidth": 0,
                                     "ticks": ""
                                 }
                             },
                             "type": "scatterpolargl"
                         }],
                         "scatterternary": [{
                             "marker": {
                                 "colorbar": {
                                     "outlinewidth": 0,
                                     "ticks": ""
                                 }
                             },
                             "type": "scatterternary"
                         }],
                         "surface": [{
                             "colorbar": {
                                 "outlinewidth": 0,
                                 "ticks": ""
                             },
                             "colorscale": [[0.0, "#0d0887"],
                                            [0.1111111111111111, "#46039f"],
                                            [0.2222222222222222, "#7201a8"],
                                            [0.3333333333333333, "#9c179e"],
                                            [0.4444444444444444, "#bd3786"],
                                            [0.5555555555555556, "#d8576b"],
                                            [0.6666666666666666, "#ed7953"],
                                            [0.7777777777777778, "#fb9f3a"],
                                            [0.8888888888888888, "#fdca26"],
                                            [1.0, "#f0f921"]],
                             "type":
                             "surface"
                         }],
                         "table": [{
                             "cells": {
                                 "fill": {
                                     "color": "#EBF0F8"
                                 },
                                 "line": {
                                     "color": "white"
                                 }
                             },
                             "header": {
                                 "fill": {
                                     "color": "#C8D4E3"
                                 },
                                 "line": {
                                     "color": "white"
                                 }
                             },
                             "type": "table"
                         }]
                     },
                     "layout": {
                         "annotationdefaults": {
                             "arrowcolor": "#2a3f5f",
                             "arrowhead": 0,
                             "arrowwidth": 1
                         },
                         "coloraxis": {
                             "colorbar": {
                                 "outlinewidth": 0,
                                 "ticks": ""
                             }
                         },
                         "colorscale": {
                             "diverging":
                             [[0, "#8e0152"], [0.1, "#c51b7d"],
                              [0.2, "#de77ae"], [0.3, "#f1b6da"],
                              [0.4, "#fde0ef"], [0.5, "#f7f7f7"],
                              [0.6, "#e6f5d0"], [0.7, "#b8e186"],
                              [0.8, "#7fbc41"], [0.9, "#4d9221"],
                              [1, "#276419"]],
                             "sequential": [[0.0, "#0d0887"],
                                            [0.1111111111111111, "#46039f"],
                                            [0.2222222222222222, "#7201a8"],
                                            [0.3333333333333333, "#9c179e"],
                                            [0.4444444444444444, "#bd3786"],
                                            [0.5555555555555556, "#d8576b"],
                                            [0.6666666666666666, "#ed7953"],
                                            [0.7777777777777778, "#fb9f3a"],
                                            [0.8888888888888888, "#fdca26"],
                                            [1.0, "#f0f921"]],
                             "sequentialminus":
                             [[0.0, "#0d0887"],
                              [0.1111111111111111, "#46039f"],
                              [0.2222222222222222, "#7201a8"],
                              [0.3333333333333333, "#9c179e"],
                              [0.4444444444444444, "#bd3786"],
                              [0.5555555555555556, "#d8576b"],
                              [0.6666666666666666, "#ed7953"],
                              [0.7777777777777778, "#fb9f3a"],
                              [0.8888888888888888, "#fdca26"],
                              [1.0, "#f0f921"]]
                         },
                         "colorway": [
                             "#636efa", "#EF553B", "#00cc96", "#ab63fa",
                             "#FFA15A", "#19d3f3", "#FF6692", "#B6E880",
                             "#FF97FF", "#FECB52"
                         ],
                         "font": {
                             "color": "#2a3f5f"
                         },
                         "geo": {
                             "bgcolor": "white",
                             "lakecolor": "white",
                             "landcolor": "#E5ECF6",
                             "showlakes": True,
                             "showland": True,
                             "subunitcolor": "white"
                         },
                         "hoverlabel": {
                             "align": "left"
                         },
                         "hovermode":
                         "closest",
                         "mapbox": {
                             "style": "light"
                         },
                         "paper_bgcolor":
                         "white",
                         "plot_bgcolor":
                         "#E5ECF6",
                         "polar": {
                             "angularaxis": {
                                 "gridcolor": "white",
                                 "linecolor": "white",
                                 "ticks": ""
                             },
                             "bgcolor": "#E5ECF6",
                             "radialaxis": {
                                 "gridcolor": "white",
                                 "linecolor": "white",
                                 "ticks": ""
                             }
                         },
                         "scene": {
                             "xaxis": {
                                 "backgroundcolor": "#E5ECF6",
                                 "gridcolor": "white",
                                 "gridwidth": 2,
                                 "linecolor": "white",
                                 "showbackground": True,
                                 "ticks": "",
                                 "zerolinecolor": "white"
                             },
                             "yaxis": {
                                 "backgroundcolor": "#E5ECF6",
                                 "gridcolor": "white",
                                 "gridwidth": 2,
                                 "linecolor": "white",
                                 "showbackground": True,
                                 "ticks": "",
                                 "zerolinecolor": "white"
                             },
                             "zaxis": {
                                 "backgroundcolor": "#E5ECF6",
                                 "gridcolor": "white",
                                 "gridwidth": 2,
                                 "linecolor": "white",
                                 "showbackground": True,
                                 "ticks": "",
                                 "zerolinecolor": "white"
                             }
                         },
                         "shapedefaults": {
                             "line": {
                                 "color": "#2a3f5f"
                             }
                         },
                         "ternary": {
                             "aaxis": {
                                 "gridcolor": "white",
                                 "linecolor": "white",
                                 "ticks": ""
                             },
                             "baxis": {
                                 "gridcolor": "white",
                                 "linecolor": "white",
                                 "ticks": ""
                             },
                             "bgcolor": "#E5ECF6",
                             "caxis": {
                                 "gridcolor": "white",
                                 "linecolor": "white",
                                 "ticks": ""
                             }
                         },
                         "title": {
                             "x": 0.05
                         },
                         "xaxis": {
                             "automargin": True,
                             "gridcolor": "white",
                             "linecolor": "white",
                             "ticks": "",
                             "title": {
                                 "standoff": 15
                             },
                             "zerolinecolor": "white",
                             "zerolinewidth": 2
                         },
                         "yaxis": {
                             "automargin": True,
                             "gridcolor": "white",
                             "linecolor": "white",
                             "ticks": "",
                             "title": {
                                 "standoff": 15
                             },
                             "zerolinecolor": "white",
                             "zerolinewidth": 2
                         }
                     }
                 },
                 "xaxis": {
                     "title": {
                         "text": "Features"
                     }
                 },
                 "yaxis": {
                     "range": [-1, 1],
                     "showticklabels": True,
                     "title": {
                         "text": "Correlation"
                     }
                 }
             }
         },
         alerts=[],
         insights=[],
         details="",
         alertsPosition="row",
         alertStats=AlertStats(),
         additionalGraphs=[],
     )
Ejemplo n.º 13
0
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        """Build a tabbed scatter widget of per-label values and store it in ``self.wi``.

        One tab is produced for every entry of the prediction columns. Each
        tab holds two marker traces taken from ``production_data``: the rows
        whose target equals the label, and all remaining rows (named
        ``'other'``). The y values come from the column named after the label
        (the axis is titled "Probability"); the x values are random numbers
        from ``np.random.random``, so the figure is not reproducible unless
        the NumPy global seed is fixed.

        Args:
            reference_data: Only inspected when ``column_mapping`` is falsy,
                to check whether the default ``'target'`` / ``'prediction'``
                columns exist.
            production_data: Frame that is plotted. It is cleaned IN PLACE:
                +/-inf values are replaced by NaN and every row containing a
                NaN is dropped. May be ``None``.
            column_mapping: Optional dict-like; only the ``'target'`` and
                ``'prediction'`` keys are read here. ``'prediction'`` must be
                an iterable of column names, one per label.

        Side effects:
            Sets ``self.wi`` to a ``BaseWidgetInfo`` of type ``"tabbed_graph"``,
            or to ``None`` when ``production_data``, the target column or the
            per-label prediction columns are unavailable.
        """
        if column_mapping:
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
        else:
            # No mapping supplied: fall back to the default column names, but
            # only when the reference frame actually contains them.
            known_columns = reference_data.columns
            target_column = 'target' if 'target' in known_columns else None
            prediction_column = 'prediction' if 'prediction' in known_columns else None

        # The loop below needs one column per label. A bare string (e.g. the
        # auto-detected 'prediction') would be iterated character by
        # character, so it is treated as "no per-label columns available".
        has_label_columns = (prediction_column is not None
                             and not isinstance(prediction_column, str))

        if production_data is None or target_column is None or not has_label_columns:
            self.wi = None
            return

        # NOTE: mutates the caller's frame; kept in place for backward
        # compatibility with code that relies on the cleaned data.
        production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        production_data.dropna(axis=0, how='any', inplace=True)

        target_values = production_data[target_column]
        graphs = []

        for label in prediction_column:
            # Split the rows once per label instead of re-filtering per trace.
            label_rows = production_data[target_values == label]
            other_rows = production_data[target_values != label]

            fig = go.Figure()

            fig.add_trace(
                go.Scatter(x=np.random.random(label_rows.shape[0]),
                           y=label_rows[label],
                           mode='markers',
                           name=str(label),
                           marker=dict(size=6, color=red)))

            fig.add_trace(
                go.Scatter(x=np.random.random(other_rows.shape[0]),
                           y=other_rows[label],
                           mode='markers',
                           name='other',
                           marker=dict(size=6, color=grey)))

            # The x axis carries no information (random positions), so its
            # tick labels are hidden.
            fig.update_layout(yaxis_title="Probability",
                              xaxis=dict(range=(-2, 3),
                                         showticklabels=False))

            # Round-trip through JSON to obtain plain dict/list structures.
            fig_json = json.loads(fig.to_json())

            graphs.append({
                "id": "tab_" + str(label),
                "title": str(label),
                "graph": {
                    "data": fig_json["data"],
                    "layout": fig_json["layout"],
                }
            })

        self.wi = BaseWidgetInfo(
            title=self.title,
            type="tabbed_graph",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=1,
            params={"graphs": graphs},
            additionalGraphs=[],
        )
Ejemplo n.º 14
0
 def get_info(self) -> BaseWidgetInfo:
     return BaseWidgetInfo(
         title=self.title,
         type="big_table",
         details="",
         alertStats=AlertStats(),
         alerts=[],
         alertsPosition="row",
         insights=[],
         size=2,
         params={
             "columns": [{
                 "title": "Feature",
                 "field": "f1"
             }, {
                 "title": "Data drift",
                 "field": "f2"
             }, {
                 "title": "Distribution",
                 "field": "f3",
                 "type": "histogram",
                 "options": {
                     "xField": "x",
                     "yField": "y"
                 }
             }, {
                 "title":
                 "Distribution shift (similarity test at 95% confidence level)",
                 "field": "f4"
             }, {
                 "title": "Alerts",
                 "field": "f5"
             }],
             "data": [{
                 "details": {
                     "parts": [{
                         "title": "Data drift",
                         "id": "season_drift"
                     }, {
                         "title": "Data distribution",
                         "id": "season_distr"
                     }],
                     "insights": []
                 },
                 "f1": "season",
                 "f2": "Detected",
                 "f3": {
                     "x":
                     [0.0, 0.0, 0.0, 0.0, 0.0, 1000.0, 0.0, 0.0, 0.0, 0.0],
                     "y": [
                         3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4,
                         4.5
                     ]
                 },
                 "f4": "Rejected",
                 "f5": " "
             }, {
                 "details": {
                     "parts": [{
                         "title": "Data drift",
                         "id": "holiday_drift"
                     }, {
                         "title": "Data distribution",
                         "id": "holiday_distr"
                     }],
                     "insights": []
                 },
                 "f1": "holiday",
                 "f2": "Not Detected",
                 "f3": {
                     "x":
                     [976.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 24.0],
                     "y": [
                         0.0, 0.1, 0.2, 0.30000000000000004, 0.4, 0.5,
                         0.6000000000000001, 0.7000000000000001, 0.8, 0.9,
                         1.0
                     ]
                 },
                 "f4": "Not rejected",
                 "f5": " "
             }, {
                 "details": {
                     "parts": [{
                         "title": "Data drift",
                         "id": "workingday_drift"
                     }, {
                         "title": "Data distribution",
                         "id": "workingday_distr"
                     }],
                     "insights": []
                 },
                 "f1": "workingday",
                 "f2": "Not Detected",
                 "f3": {
                     "x":
                     [312.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 688.0],
                     "y": [
                         0.0, 0.1, 0.2, 0.30000000000000004, 0.4, 0.5,
                         0.6000000000000001, 0.7000000000000001, 0.8, 0.9,
                         1.0
                     ]
                 },
                 "f4": "Not rejected",
                 "f5": " "
             }, {
                 "details": {
                     "parts": [{
                         "title": "Data drift",
                         "id": "weather_drift"
                     }, {
                         "title": "Data distribution",
                         "id": "weather_distr"
                     }],
                     "insights": []
                 },
                 "f1": "weather",
                 "f2": "Detected",
                 "f3": {
                     "x": [
                         566.0, 0.0, 0.0, 0.0, 0.0, 382.0, 0.0, 0.0, 0.0,
                         52.0
                     ],
                     "y": [
                         1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.2,
                         2.4000000000000004, 2.6, 2.8, 3.0
                     ]
                 },
                 "f4": "Rejected",
                 "f5": " "
             }, {
                 "details": {
                     "parts": [{
                         "title": "Data drift",
                         "id": "temp_drift"
                     }, {
                         "title": "Data distribution",
                         "id": "temp_distr"
                     }],
                     "insights": []
                 },
                 "f1": "temp",
                 "f2": "Detected",
                 "f3": {
                     "x": [
                         7.0, 55.0, 197.0, 182.0, 307.0, 93.0, 73.0, 46.0,
                         32.0, 8.0
                     ],
                     "y": [
                         6.56, 8.61, 10.66, 12.709999999999999,
                         14.759999999999998, 16.81, 18.86,
                         20.909999999999997, 22.959999999999997,
                         25.009999999999998, 27.06
                     ]
                 },
                 "f4": "Rejected",
                 "f5": " "
             }, {
                 "details": {
                     "parts": [{
                         "title": "Data drift",
                         "id": "atemp_drift"
                     }, {
                         "title": "Data distribution",
                         "id": "atemp_distr"
                     }],
                     "insights": []
                 },
                 "f1": "atemp",
                 "f2": "Detected",
                 "f3": {
                     "x": [
                         12.0, 84.0, 193.0, 237.0, 132.0, 183.0, 73.0, 61.0,
                         8.0, 17.0
                     ],
                     "y": [
                         9.09, 11.286999999999999, 13.484,
                         15.681000000000001, 17.878, 20.075, 22.272, 24.469,
                         26.666, 28.863, 31.06
                     ]
                 },
                 "f4": "Rejected",
                 "f5": " "
             }, {
                 "details": {
                     "parts": [{
                         "title": "Data drift",
                         "id": "humidity_drift"
                     }, {
                         "title": "Data distribution",
                         "id": "humidity_distr"
                     }],
                     "insights": []
                 },
                 "f1": "humidity",
                 "f2": "Detected",
                 "f3": {
                     "x": [
                         7.0, 5.0, 59.0, 144.0, 188.0, 180.0, 80.0, 149.0,
                         105.0, 83.0
                     ],
                     "y": [
                         16.0, 24.4, 32.8, 41.2, 49.6, 58.0, 66.4,
                         74.80000000000001, 83.2, 91.60000000000001, 100.0
                     ]
                 },
                 "f4": "Rejected",
                 "f5": " "
             }, {
                 "details": {
                     "parts": [{
                         "title": "Data drift",
                         "id": "windspeed_drift"
                     }, {
                         "title": "Data distribution",
                         "id": "windspeed_distr"
                     }],
                     "insights": []
                 },
                 "f1": "windspeed",
                 "f2": "Not Detected",
                 "f3": {
                     "x": [
                         117.0, 193.0, 201.0, 271.0, 112.0, 57.0, 39.0, 9.0,
                         0.0, 1.0
                     ],
                     "y": [
                         0.0, 4.30006, 8.60012, 12.90018, 17.20024,
                         21.500300000000003, 25.80036, 30.10042, 34.40048,
                         38.700540000000004, 43.0006
                     ]
                 },
                 "f4": "Not rejected",
                 "f5": " "
             }, {
                 "details": {
                     "parts": [{
                         "title": "Data drift",
                         "id": "month_drift"
                     }, {
                         "title": "Data distribution",
                         "id": "month_distr"
                     }],
                     "insights": []
                 },
                 "f1": "month",
                 "f2": "Detected",
                 "f3": {
                     "x": [
                         89.0, 0.0, 0.0, 0.0, 0.0, 455.0, 0.0, 0.0, 0.0,
                         456.0
                     ],
                     "y": [
                         10.0, 10.2, 10.4, 10.6, 10.8, 11.0, 11.2, 11.4,
                         11.6, 11.8, 12.0
                     ]
                 },
                 "f4": "Rejected",
                 "f5": " "
             }, {
                 "details": {
                     "parts": [{
                         "title": "Data drift",
                         "id": "hour_drift"
                     }, {
                         "title": "Data distribution",
                         "id": "hour_distr"
                     }],
                     "insights": []
                 },
                 "f1": "hour",
                 "f2": "Not Detected",
                 "f3": {
                     "x": [
                         123.0, 81.0, 82.0, 126.0, 84.0, 84.0, 126.0, 84.0,
                         84.0, 126.0
                     ],
                     "y": [
                         0.0, 2.3, 4.6, 6.8999999999999995, 9.2, 11.5,
                         13.799999999999999, 16.099999999999998, 18.4, 20.7,
                         23.0
                     ]
                 },
                 "f4": "Not rejected",
                 "f5": " "
             }, {
                 "details": {
                     "parts": [{
                         "title": "Data drift",
                         "id": "year_drift"
                     }, {
                         "title": "Data distribution",
                         "id": "year_distr"
                     }],
                     "insights": []
                 },
                 "f1": "year",
                 "f2": "Detected",
                 "f3": {
                     "x":
                     [0.0, 0.0, 0.0, 0.0, 0.0, 1000.0, 0.0, 0.0, 0.0, 0.0],
                     "y": [
                         2011.5, 2011.6, 2011.7, 2011.8, 2011.9, 2012.0,
                         2012.1, 2012.2, 2012.3, 2012.4, 2012.5
                     ]
                 },
                 "f4": "Rejected",
                 "f5": " "
             }, {
                 "details": {
                     "parts": [{
                         "title": "Data drift",
                         "id": "week_day_drift"
                     }, {
                         "title": "Data distribution",
                         "id": "week_day_distr"
                     }],
                     "insights": []
                 },
                 "f1": "week_day",
                 "f2": "Not Detected",
                 "f3": {
                     "x": [
                         144.0, 137.0, 0.0, 144.0, 0.0, 143.0, 144.0, 0.0,
                         144.0, 144.0
                     ],
                     "y": [
                         1.0, 1.6, 2.2, 2.8, 3.4, 4.0, 4.6, 5.2, 5.8,
                         6.3999999999999995, 7.0
                     ]
                 },
                 "f4": "Not rejected",
                 "f5": " "
             }]
         },
         additionalGraphs=[
             AdditionalGraphInfo(
                 "holiday_drift", {
                     "data": [{
                         "marker": {
                             "color": "#4d4d4d",
                             "size": 6
                         },
                         "mode":
                         "markers",
                         "name":
                         "Production",
                         "type":
                         "scatter",
                         "x": [
                             "2012-10-16", "2012-10-17", "2012-10-18",
                             "2012-10-19", "2012-11-01", "2012-11-02",
                             "2012-11-03", "2012-11-04", "2012-11-05",
                             "2012-11-06", "2012-11-07", "2012-11-08",
                             "2012-11-09", "2012-11-10", "2012-11-11",
                             "2012-11-12", "2012-11-13", "2012-11-14",
                             "2012-11-15", "2012-11-16", "2012-11-17",
                             "2012-11-18", "2012-11-19", "2012-12-01",
                             "2012-12-02", "2012-12-03", "2012-12-04",
                             "2012-12-05", "2012-12-06", "2012-12-07",
                             "2012-12-08", "2012-12-09", "2012-12-10",
                             "2012-12-11", "2012-12-12", "2012-12-13",
                             "2012-12-14", "2012-12-15", "2012-12-16",
                             "2012-12-17", "2012-12-18", "2012-12-19"
                         ],
                         "y": [
                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0
                         ]
                     }],
                     "layout": {
                         "legend": {
                             "orientation": "h",
                             "x": 1,
                             "xanchor": "right",
                             "y": 1.02,
                             "yanchor": "bottom"
                         },
                         "shapes": [{
                             "fillcolor": "LightGreen",
                             "layer": "below",
                             "line": {
                                 "width": 0
                             },
                             "opacity": 0.5,
                             "type": "rect",
                             "x0": 0,
                             "x1": 1,
                             "xref": "paper",
                             "y0": -0.13886233657597655,
                             "y1": 0.19692424230124458,
                             "yref": "y"
                         }, {
                             "line": {
                                 "color": "Green",
                                 "width": 3
                             },
                             "name": "Reference",
                             "type": "line",
                             "x0": 0,
                             "x1": 1,
                             "xref": "paper",
                             "y0": 0.02903095286263403,
                             "y1": 0.02903095286263403,
                             "yref": "y"
                         }],
                         "showlegend":
                         True,
                         "template": {
                             "data": {
                                 "bar": [{
                                     "error_x": {
                                         "color": "#2a3f5f"
                                     },
                                     "error_y": {
                                         "color": "#2a3f5f"
                                     },
                                     "marker": {
                                         "line": {
                                             "color": "#E5ECF6",
                                             "width": 0.5
                                         }
                                     },
                                     "type": "bar"
                                 }],
                                 "barpolar": [{
                                     "marker": {
                                         "line": {
                                             "color": "#E5ECF6",
                                             "width": 0.5
                                         }
                                     },
                                     "type": "barpolar"
                                 }],
                                 "carpet": [{
                                     "aaxis": {
                                         "endlinecolor": "#2a3f5f",
                                         "gridcolor": "white",
                                         "linecolor": "white",
                                         "minorgridcolor": "white",
                                         "startlinecolor": "#2a3f5f"
                                     },
                                     "baxis": {
                                         "endlinecolor": "#2a3f5f",
                                         "gridcolor": "white",
                                         "linecolor": "white",
                                         "minorgridcolor": "white",
                                         "startlinecolor": "#2a3f5f"
                                     },
                                     "type": "carpet"
                                 }],
                                 "choropleth": [{
                                     "colorbar": {
                                         "outlinewidth": 0,
                                         "ticks": ""
                                     },
                                     "type": "choropleth"
                                 }],
                                 "contour": [{
                                     "colorbar": {
                                         "outlinewidth": 0,
                                         "ticks": ""
                                     },
                                     "colorscale":
                                     [[0.0, "#0d0887"],
                                      [0.1111111111111111, "#46039f"],
                                      [0.2222222222222222, "#7201a8"],
                                      [0.3333333333333333, "#9c179e"],
                                      [0.4444444444444444, "#bd3786"],
                                      [0.5555555555555556, "#d8576b"],
                                      [0.6666666666666666, "#ed7953"],
                                      [0.7777777777777778, "#fb9f3a"],
                                      [0.8888888888888888, "#fdca26"],
                                      [1.0, "#f0f921"]],
                                     "type":
                                     "contour"
                                 }],
                                 "contourcarpet": [{
                                     "colorbar": {
                                         "outlinewidth": 0,
                                         "ticks": ""
                                     },
                                     "type": "contourcarpet"
                                 }],
                                 "heatmap": [{
                                     "colorbar": {
                                         "outlinewidth": 0,
                                         "ticks": ""
                                     },
                                     "colorscale":
                                     [[0.0, "#0d0887"],
                                      [0.1111111111111111, "#46039f"],
                                      [0.2222222222222222, "#7201a8"],
                                      [0.3333333333333333, "#9c179e"],
                                      [0.4444444444444444, "#bd3786"],
                                      [0.5555555555555556, "#d8576b"],
                                      [0.6666666666666666, "#ed7953"],
                                      [0.7777777777777778, "#fb9f3a"],
                                      [0.8888888888888888, "#fdca26"],
                                      [1.0, "#f0f921"]],
                                     "type":
                                     "heatmap"
                                 }],
                                 "heatmapgl": [{
                                     "colorbar": {
                                         "outlinewidth": 0,
                                         "ticks": ""
                                     },
                                     "colorscale":
                                     [[0.0, "#0d0887"],
                                      [0.1111111111111111, "#46039f"],
                                      [0.2222222222222222, "#7201a8"],
                                      [0.3333333333333333, "#9c179e"],
                                      [0.4444444444444444, "#bd3786"],
                                      [0.5555555555555556, "#d8576b"],
                                      [0.6666666666666666, "#ed7953"],
                                      [0.7777777777777778, "#fb9f3a"],
                                      [0.8888888888888888, "#fdca26"],
                                      [1.0, "#f0f921"]],
                                     "type":
                                     "heatmapgl"
                                 }],
                                 "histogram": [{
                                     "marker": {
                                         "colorbar": {
                                             "outlinewidth": 0,
                                             "ticks": ""
                                         }
                                     },
                                     "type": "histogram"
                                 }],
                                 "histogram2d": [{
                                     "colorbar": {
                                         "outlinewidth": 0,
                                         "ticks": ""
                                     },
                                     "colorscale":
                                     [[0.0, "#0d0887"],
                                      [0.1111111111111111, "#46039f"],
                                      [0.2222222222222222, "#7201a8"],
                                      [0.3333333333333333, "#9c179e"],
                                      [0.4444444444444444, "#bd3786"],
                                      [0.5555555555555556, "#d8576b"],
                                      [0.6666666666666666, "#ed7953"],
                                      [0.7777777777777778, "#fb9f3a"],
                                      [0.8888888888888888, "#fdca26"],
                                      [1.0, "#f0f921"]],
                                     "type":
                                     "histogram2d"
                                 }],
                                 "histogram2dcontour": [{
                                     "colorbar": {
                                         "outlinewidth": 0,
                                         "ticks": ""
                                     },
                                     "colorscale":
                                     [[0.0, "#0d0887"],
                                      [0.1111111111111111, "#46039f"],
                                      [0.2222222222222222, "#7201a8"],
                                      [0.3333333333333333, "#9c179e"],
                                      [0.4444444444444444, "#bd3786"],
                                      [0.5555555555555556, "#d8576b"],
                                      [0.6666666666666666, "#ed7953"],
                                      [0.7777777777777778, "#fb9f3a"],
                                      [0.8888888888888888, "#fdca26"],
                                      [1.0, "#f0f921"]],
                                     "type":
                                     "histogram2dcontour"
                                 }],
                                 "mesh3d": [{
                                     "colorbar": {
                                         "outlinewidth": 0,
                                         "ticks": ""
                                     },
                                     "type": "mesh3d"
                                 }],
                                 "parcoords": [{
                                     "line": {
                                         "colorbar": {
                                             "outlinewidth": 0,
                                             "ticks": ""
                                         }
                                     },
                                     "type": "parcoords"
                                 }],
                                 "pie": [{
                                     "automargin": True,
                                     "type": "pie"
                                 }],
                                 "scatter": [{
                                     "marker": {
                                         "colorbar": {
                                             "outlinewidth": 0,
                                             "ticks": ""
                                         }
                                     },
                                     "type": "scatter"
                                 }],
                                 "scatter3d": [{
                                     "line": {
                                         "colorbar": {
                                             "outlinewidth": 0,
                                             "ticks": ""
                                         }
                                     },
                                     "marker": {
                                         "colorbar": {
                                             "outlinewidth": 0,
                                             "ticks": ""
                                         }
                                     },
                                     "type": "scatter3d"
                                 }],
                                 "scattercarpet": [{
                                     "marker": {
                                         "colorbar": {
                                             "outlinewidth": 0,
                                             "ticks": ""
                                         }
                                     },
                                     "type": "scattercarpet"
                                 }],
                                 "scattergeo": [{
                                     "marker": {
                                         "colorbar": {
                                             "outlinewidth": 0,
                                             "ticks": ""
                                         }
                                     },
                                     "type": "scattergeo"
                                 }],
                                 "scattergl": [{
                                     "marker": {
                                         "colorbar": {
                                             "outlinewidth": 0,
                                             "ticks": ""
                                         }
                                     },
                                     "type": "scattergl"
                                 }],
                                 "scattermapbox": [{
                                     "marker": {
                                         "colorbar": {
                                             "outlinewidth": 0,
                                             "ticks": ""
                                         }
                                     },
                                     "type": "scattermapbox"
                                 }],
                                 "scatterpolar": [{
                                     "marker": {
                                         "colorbar": {
                                             "outlinewidth": 0,
                                             "ticks": ""
                                         }
                                     },
                                     "type": "scatterpolar"
                                 }],
                                 "scatterpolargl": [{
                                     "marker": {
                                         "colorbar": {
                                             "outlinewidth": 0,
                                             "ticks": ""
                                         }
                                     },
                                     "type": "scatterpolargl"
                                 }],
                                 "scatterternary": [{
                                     "marker": {
                                         "colorbar": {
                                             "outlinewidth": 0,
                                             "ticks": ""
                                         }
                                     },
                                     "type": "scatterternary"
                                 }],
                                 "surface": [{
                                     "colorbar": {
                                         "outlinewidth": 0,
                                         "ticks": ""
                                     },
                                     "colorscale":
                                     [[0.0, "#0d0887"],
                                      [0.1111111111111111, "#46039f"],
                                      [0.2222222222222222, "#7201a8"],
                                      [0.3333333333333333, "#9c179e"],
                                      [0.4444444444444444, "#bd3786"],
                                      [0.5555555555555556, "#d8576b"],
                                      [0.6666666666666666, "#ed7953"],
                                      [0.7777777777777778, "#fb9f3a"],
                                      [0.8888888888888888, "#fdca26"],
                                      [1.0, "#f0f921"]],
                                     "type":
                                     "surface"
                                 }],
                                 "table": [{
                                     "cells": {
                                         "fill": {
                                             "color": "#EBF0F8"
                                         },
                                         "line": {
                                             "color": "white"
                                         }
                                     },
                                     "header": {
                                         "fill": {
                                             "color": "#C8D4E3"
                                         },
                                         "line": {
                                             "color": "white"
                                         }
                                     },
                                     "type": "table"
                                 }]
                             },
                             "layout": {
                                 "annotationdefaults": {
                                     "arrowcolor": "#2a3f5f",
                                     "arrowhead": 0,
                                     "arrowwidth": 1
                                 },
                                 "coloraxis": {
                                     "colorbar": {
                                         "outlinewidth": 0,
                                         "ticks": ""
                                     }
                                 },
                                 "colorscale": {
                                     "diverging":
                                     [[0, "#8e0152"], [0.1, "#c51b7d"],
                                      [0.2, "#de77ae"], [0.3, "#f1b6da"],
                                      [0.4, "#fde0ef"], [0.5, "#f7f7f7"],
                                      [0.6, "#e6f5d0"], [0.7, "#b8e186"],
                                      [0.8, "#7fbc41"], [0.9, "#4d9221"],
                                      [1, "#276419"]],
                                     "sequential":
                                     [[0.0, "#0d0887"],
                                      [0.1111111111111111, "#46039f"],
                                      [0.2222222222222222, "#7201a8"],
                                      [0.3333333333333333, "#9c179e"],
                                      [0.4444444444444444, "#bd3786"],
                                      [0.5555555555555556, "#d8576b"],
                                      [0.6666666666666666, "#ed7953"],
                                      [0.7777777777777778, "#fb9f3a"],
                                      [0.8888888888888888, "#fdca26"],
                                      [1.0, "#f0f921"]],
                                     "sequentialminus":
                                     [[0.0, "#0d0887"],
                                      [0.1111111111111111, "#46039f"],
                                      [0.2222222222222222, "#7201a8"],
                                      [0.3333333333333333, "#9c179e"],
                                      [0.4444444444444444, "#bd3786"],
                                      [0.5555555555555556, "#d8576b"],
                                      [0.6666666666666666, "#ed7953"],
                                      [0.7777777777777778, "#fb9f3a"],
                                      [0.8888888888888888, "#fdca26"],
                                      [1.0, "#f0f921"]]
                                 },
                                 "colorway": [
                                     "#636efa", "#EF553B", "#00cc96",
                                     "#ab63fa", "#FFA15A", "#19d3f3",
                                     "#FF6692", "#B6E880", "#FF97FF",
                                     "#FECB52"
                                 ],
                                 "font": {
                                     "color": "#2a3f5f"
                                 },
                                 "geo": {
                                     "bgcolor": "white",
                                     "lakecolor": "white",
                                     "landcolor": "#E5ECF6",
                                     "showlakes": True,
                                     "showland": True,
                                     "subunitcolor": "white"
                                 },
                                 "hoverlabel": {
                                     "align": "left"
                                 },
                                 "hovermode":
                                 "closest",
                                 "mapbox": {
                                     "style": "light"
                                 },
                                 "paper_bgcolor":
                                 "white",
                                 "plot_bgcolor":
                                 "#E5ECF6",
                                 "polar": {
                                     "angularaxis": {
                                         "gridcolor": "white",
                                         "linecolor": "white",
                                         "ticks": ""
                                     },
                                     "bgcolor": "#E5ECF6",
                                     "radialaxis": {
                                         "gridcolor": "white",
                                         "linecolor": "white",
                                         "ticks": ""
                                     }
                                 },
                                 "scene": {
                                     "xaxis": {
                                         "backgroundcolor": "#E5ECF6",
                                         "gridcolor": "white",
                                         "gridwidth": 2,
                                         "linecolor": "white",
                                         "showbackground": True,
                                         "ticks": "",
                                         "zerolinecolor": "white"
                                     },
                                     "yaxis": {
                                         "backgroundcolor": "#E5ECF6",
                                         "gridcolor": "white",
                                         "gridwidth": 2,
                                         "linecolor": "white",
                                         "showbackground": True,
                                         "ticks": "",
                                         "zerolinecolor": "white"
                                     },
                                     "zaxis": {
                                         "backgroundcolor": "#E5ECF6",
                                         "gridcolor": "white",
                                         "gridwidth": 2,
                                         "linecolor": "white",
                                         "showbackground": True,
                                         "ticks": "",
                                         "zerolinecolor": "white"
                                     }
                                 },
                                 "shapedefaults": {
                                     "line": {
                                         "color": "#2a3f5f"
                                     }
                                 },
                                 "ternary": {
                                     "aaxis": {
                                         "gridcolor": "white",
                                         "linecolor": "white",
                                         "ticks": ""
                                     },
                                     "baxis": {
                                         "gridcolor": "white",
                                         "linecolor": "white",
                                         "ticks": ""
                                     },
                                     "bgcolor": "#E5ECF6",
                                     "caxis": {
                                         "gridcolor": "white",
                                         "linecolor": "white",
                                         "ticks": ""
                                     }
                                 },
                                 "title": {
                                     "x": 0.05
                                 },
                                 "xaxis": {
                                     "automargin": True,
                                     "gridcolor": "white",
                                     "linecolor": "white",
                                     "ticks": "",
                                     "title": {
                                         "standoff": 15
                                     },
                                     "zerolinecolor": "white",
                                     "zerolinewidth": 2
                                 },
                                 "yaxis": {
                                     "automargin": True,
                                     "gridcolor": "white",
                                     "linecolor": "white",
                                     "ticks": "",
                                     "title": {
                                         "standoff": 15
                                     },
                                     "zerolinecolor": "white",
                                     "zerolinewidth": 2
                                 }
                             }
                         },
                         "xaxis": {
                             "title": {
                                 "text": "Timestamp"
                             }
                         },
                         "yaxis": {
                             "title": {
                                 "text": "holiday"
                             }
                         }
                     }
                 })
         ],
     )
    def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping): 
        if column_mapping:
            date_column = column_mapping.get('datetime')
            id_column = column_mapping.get('id')
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            num_feature_names = column_mapping.get('numerical_features')
            target_names = column_mapping.get('target_names')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [name for name in num_feature_names if is_numeric_dtype(reference_data[name])] 

            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                cat_feature_names = [name for name in cat_feature_names if is_numeric_dtype(reference_data[name])] 
        
        else:
            date_column = 'datetime' if 'datetime' in reference_data.columns else None
            id_column = None
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

            utility_columns = [date_column, id_column, target_column, prediction_column]

            target_names = None

            num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns))
            cat_feature_names = list(set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns))

        if prediction_column is not None and target_column is not None:
            binaraizer = preprocessing.LabelBinarizer()
            binaraizer.fit(reference_data[target_column])
            binaraized_target = binaraizer.transform(reference_data[target_column])
            if production_data is not None:
                ref_array_prediction = reference_data[prediction_column].to_numpy()
                ref_prediction_ids = np.argmax(ref_array_prediction, axis=-1)
                ref_prediction_labels = [prediction_column[x] for x in ref_prediction_ids]
                reference_data['prediction_labels'] = ref_prediction_labels

                prod_array_prediction = production_data[prediction_column].to_numpy()
                prod_prediction_ids = np.argmax(prod_array_prediction, axis=-1)
                prod_prediction_labels = [prediction_column[x] for x in prod_prediction_ids]
                production_data['prediction_labels'] = prod_prediction_labels

                additional_graphs_data = []
                params_data = []

                for feature_name in num_feature_names + cat_feature_names: 
                    #add data for table in params
                    labels = prediction_column

                    params_data.append(
                        {
                            "details": {
                                    "parts": [{"title":"All", "id":"All" + "_" + str(feature_name)}] + [{"title":str(label), "id": feature_name + "_" + str(label)} for label in labels],
                                    "insights": []
                                },
                            "f1": feature_name
                        }
                        )

                    #create confusion based plots 
                    reference_data['dataset'] = 'Reference'
                    production_data['dataset'] = 'Current'
                    merged_data = pd.concat([reference_data, production_data])

                    fig = px.histogram(merged_data, x=feature_name, color=target_column, facet_col="dataset", histnorm = '',
                        category_orders={"dataset": ["Reference", "Current"]})

                    fig_json  = json.loads(fig.to_json())

                    #write plot data in table as additional data
                    additional_graphs_data.append(
                        AdditionalGraphInfo(
                            "All" + "_" + str(feature_name),
                            {
                                "data" : fig_json['data'],
                                "layout" : fig_json['layout']
                            }, 
                        )
                    )

                    for label in labels:
                        fig = make_subplots(rows=1, cols=2, subplot_titles=("Reference", "Current"))

                        #REF 
                        fig.add_trace(go.Scatter(
                            x = reference_data[reference_data[target_column] == label][feature_name],
                            y = reference_data[reference_data[target_column] == label][label],
                            mode = 'markers',
                            name = str(label) + ' (ref)',
                            marker=dict(
                                size=6,
                                color=red 
                                )
                            ),
                            row=1, col=1
                        )

                        fig.add_trace(go.Scatter(
                            x = reference_data[reference_data[target_column] != label][feature_name],
                            y = reference_data[reference_data[target_column] != label][label],
                            mode = 'markers',
                            name = 'other (ref)',
                            marker=dict(
                                size=6,
                                color=grey 
                                )
                            ),
                            row=1, col=1
                        )


                        fig.update_layout(
                            xaxis_title=feature_name,
                            yaxis_title='Probability',
                            xaxis = dict(
                                showticklabels=True
                            ),
                             yaxis = dict(
                                range=(0, 1),
                                showticklabels=True
                            )
                        )

                        #PROD Prediction
                        fig.add_trace(go.Scatter(
                            x = production_data[production_data[target_column] == label][feature_name],
                            y = production_data[production_data[target_column] == label][label],
                            mode = 'markers',
                            name = str(label) + ' (curr)',
                            marker=dict(
                                size=6,
                                color=red #set color equal to a variable
                                )
                            ),
                            row=1, col=2
                        )

                        fig.add_trace(go.Scatter(
                            x = production_data[production_data[target_column] != label][feature_name],
                            y = production_data[production_data[target_column] != label][label],
                            mode = 'markers',
                            name = 'other (curr)',
                            marker=dict(
                                size=6,
                                color=grey #set color equal to a variable
                                )
                            ),
                            row=1, col=2
                        )

                        fig.update_layout(
                            xaxis_title=feature_name,
                            yaxis_title='Probability',
                            xaxis = dict(
                                showticklabels=True
                            ),
                             yaxis = dict(
                                range=(0, 1),
                                showticklabels=True
                            )
                        )

                        # Update xaxis properties
                        fig.update_xaxes(title_text=feature_name, showgrid=True, row=1, col=1)
                        fig.update_xaxes(title_text=feature_name, showgrid=True, row=1, col=2)

                        # Update yaxis properties
                        fig.update_yaxes(title_text="Probability", showgrid=True, row=1, col=1)
                        fig.update_yaxes(title_text="Probability", showgrid=True, row=1, col=2)

                        fig_json  = json.loads(fig.to_json())

                        #write plot data in table as additional data
                        additional_graphs_data.append(
                            AdditionalGraphInfo(
                                feature_name + "_" + str(label),
                                {
                                    "data" : fig_json['data'],
                                    "layout" : fig_json['layout']
                                }, 
                            )
                        )

                self.wi = BaseWidgetInfo(
                    title=self.title,
                    type="big_table",
                    details="",
                    alertStats=AlertStats(),
                    alerts=[],
                    alertsPosition="row",
                    insights=[],
                    size=2,
                    params={
                        "rowsPerPage" : min(len(num_feature_names) + len(cat_feature_names), 10),
                        "columns": [
                            {
                                "title": "Feature",
                                "field": "f1"
                            }
                        ],
                        "data": params_data
                    },
                    additionalGraphs=additional_graphs_data
                )

            else:
                ref_array_prediction = reference_data[prediction_column].to_numpy()
                ref_prediction_ids = np.argmax(ref_array_prediction, axis=-1)
                ref_prediction_labels = [prediction_column[x] for x in ref_prediction_ids]
                reference_data['prediction_labels'] = ref_prediction_labels

                additional_graphs_data = []
                params_data = []

                for feature_name in num_feature_names + cat_feature_names: 
                    #add data for table in params
                    labels = prediction_column

                    params_data.append(
                        {
                            "details": {
                                    "parts": [{"title":"All", "id":"All" + "_" + str(feature_name)}] + [{"title":str(label), "id": feature_name + "_" + str(label)} for label in labels],
                                    "insights": []
                                },
                            "f1": feature_name
                        }
                        )

                    #create confusion based plots 
                    fig = px.histogram(reference_data, x=feature_name, color=target_column, histnorm = '')

                    fig_json  = json.loads(fig.to_json())

                    #write plot data in table as additional data
                    additional_graphs_data.append(
                        AdditionalGraphInfo(
                            "All" + "_" + str(feature_name),
                            {
                                "data" : fig_json['data'],
                                "layout" : fig_json['layout']
                            }, 
                        )
                    )

                    for label in labels:

                        fig = go.Figure()

                        fig.add_trace(go.Scatter(
                            x = reference_data[reference_data[target_column] == label][feature_name],
                            y = reference_data[reference_data[target_column] == label][label],
                            mode = 'markers',
                            name = str(label),
                            marker=dict(
                                size=6,
                                color=red #set color equal to a variable
                            )
                        ))

                        fig.add_trace(go.Scatter(
                            x = reference_data[reference_data[target_column] != label][feature_name],
                            y = reference_data[reference_data[target_column] != label][label],
                            mode = 'markers',
                            name = 'other',
                            marker=dict(
                                size=6,
                                color=grey 
                            )
                        ))


                        fig.update_layout(
                            xaxis_title=feature_name,
                            yaxis_title='Probability',
                            xaxis = dict(
                                showticklabels=True
                            ),
                             yaxis = dict(
                                range=(0, 1),
                                showticklabels=True
                            )
                        )

                        fig_json  = json.loads(fig.to_json())

                        #write plot data in table as additional data
                        additional_graphs_data.append(
                            AdditionalGraphInfo(
                                feature_name + "_" + str(label),
                                {
                                    "data" : fig_json['data'],
                                    "layout" : fig_json['layout']
                                }, 
                            )
                        )

                self.wi = BaseWidgetInfo(
                    title=self.title,
                    type="big_table",
                    details="",
                    alertStats=AlertStats(),
                    alerts=[],
                    alertsPosition="row",
                    insights=[],
                    size=2,
                    params={
                        "rowsPerPage" : min(len(num_feature_names) + len(cat_feature_names), 10),
                        "columns": [
                            {
                                "title": "Feature",
                                "field": "f1"
                            }
                        ],
                        "data": params_data
                    },
                    additionalGraphs=additional_graphs_data
                )  
        else:
            self.wi = None
Ejemplo n.º 16
0
    def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
        """Build a tabbed widget of predicted-probability distributions, one tab per label.

        For every label in the prediction mapping, the reference rows are split
        into those whose target equals the label and all the others; the values
        stored in the label's own column are then plotted for both groups.

        Args:
            reference_data: Reference dataset. It is cleaned IN PLACE: infinite
                values are replaced with NaN and every row holding a NaN is
                dropped, so the caller's frame is modified.
            production_data: Only compared against ``None`` to choose the widget
                size (1 when present, 2 otherwise).
            column_mapping: Optional dict; only its 'target' and 'prediction'
                entries are read. When falsy, columns literally named 'target'
                and 'prediction' are looked up in ``reference_data``.

        Side effects:
            Sets ``self.wi`` to a "tabbed_graph" ``BaseWidgetInfo``, or to
            ``None`` when the target or the prediction column is unknown.
        """
        if column_mapping:
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
        else:
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

        # The feature-name bookkeeping that used to live here was never read by
        # this widget, and its ``np.object`` lookup fails on NumPy versions that
        # no longer ship that alias.
        if target_column is None or prediction_column is None:
            self.wi = None
            return

        # Drop rows that cannot be plotted (inf / NaN anywhere in the row).
        reference_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        reference_data.dropna(axis=0, how='any', inplace=True)

        # NOTE(review): prediction_column is iterated as a collection of label
        # column names. A plain string (e.g. the 'prediction' default above)
        # would be walked character by character - confirm what callers pass.
        graphs = []
        for label in prediction_column:
            is_label = reference_data[target_column] == label

            # Two groups per tab: rows of this class vs. every other row.
            pred_distr = ff.create_distplot(
                [
                    reference_data[is_label][label],
                    reference_data[~is_label][label],
                ],
                [str(label), "other"],
                colors=[red, grey],
                bin_size=0.05,
                show_curve=False,
                show_rug=True,
            )

            pred_distr.update_layout(
                xaxis_title="Probability",
                yaxis_title="Share",
                legend=dict(
                    orientation="h",
                    yanchor="bottom",
                    y=1.02,
                    xanchor="right",
                    x=1,
                ),
            )

            # Round-trip through JSON to get plain dicts/lists for the widget.
            pred_distr_json = json.loads(pred_distr.to_json())

            graphs.append({
                "id": "tab_" + str(label),
                "title": str(label),
                "graph": {
                    "data": pred_distr_json["data"],
                    "layout": pred_distr_json["layout"],
                },
            })

        self.wi = BaseWidgetInfo(
            title=self.title,
            type="tabbed_graph",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=1 if production_data is not None else 2,
            params={
                "graphs": graphs
            },
            additionalGraphs=[],
        )
Ejemplo n.º 17
0
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        """Build the target drift widget: chi-square test plus overlaid histograms.

        The finite target values of both datasets are counted per distinct
        value and compared with a chi-square test; drift is reported as
        "detected" when the p-value is below 0.05.

        Args:
            reference_data: Reference dataset; supplies the expected target
                frequencies.
            production_data: Current dataset; supplies the observed target
                frequencies. It is indexed unconditionally, so it must not be
                ``None`` when a target column is known.
            column_mapping: Optional dict; only its 'target' entry is read. When
                falsy, a column literally named 'target' is looked up in
                ``reference_data``.

        Side effects:
            Sets ``self.wi`` to a "big_graph" ``BaseWidgetInfo`` whose title
            carries the test verdict and p-value, or to ``None`` when no target
            column is known.
        """
        if column_mapping:
            target_column = column_mapping.get('target')
        else:
            target_column = 'target' if 'target' in reference_data.columns else None

        # The feature-name bookkeeping that used to live here was never read by
        # this widget, and its ``np.object`` lookup fails on NumPy versions that
        # no longer ship that alias.
        if target_column is None:
            self.wi = None
            return

        # calculate output drift on finite values only
        ref_target = reference_data[target_column]
        prod_target = production_data[target_column]
        ref_counts = ref_target[np.isfinite(ref_target)].value_counts().to_dict()
        prod_counts = prod_target[np.isfinite(prod_target)].value_counts().to_dict()

        # Align both count vectors on the union of observed values; a value
        # missing from one dataset counts as zero there.
        keys = sorted(set(ref_counts) | set(prod_counts))
        f_obs = np.array([prod_counts.get(key, 0) for key in keys], dtype=float)
        f_exp = np.array([ref_counts.get(key, 0) for key in keys], dtype=float)

        # scipy's chisquare takes (observed, expected) and expects both vectors
        # to have the same total, so the reference counts are rescaled to the
        # production sample size before testing.
        ref_total = f_exp.sum()
        if ref_total:
            f_exp = f_exp * (f_obs.sum() / ref_total)

        target_p_value = chisquare(f_obs, f_exp)[1]

        target_sim_test = "detected" if target_p_value < 0.05 else "not detected"

        # plot output distributions
        fig = go.Figure()

        fig.add_trace(
            go.Histogram(x=reference_data[target_column],
                         marker_color=grey,
                         opacity=0.6,
                         nbinsx=10,
                         name='Reference',
                         histnorm='probability'))

        fig.add_trace(
            go.Histogram(x=production_data[target_column],
                         marker_color=red,
                         opacity=0.6,
                         nbinsx=10,
                         name='Production',
                         histnorm='probability'))

        fig.update_layout(legend=dict(orientation="h",
                                      yanchor="bottom",
                                      y=1.02,
                                      xanchor="right",
                                      x=1),
                          xaxis_title=target_column,
                          yaxis_title="Share")

        # Round-trip through JSON to get plain dicts/lists for the widget.
        target_drift_json = json.loads(fig.to_json())

        self.wi = BaseWidgetInfo(
            title="Target Drift: " + target_sim_test + ", p_value=" +
            str(round(target_p_value, 6)),
            type="big_graph",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=2,
            params={
                "data": target_drift_json['data'],
                "layout": target_drift_json['layout']
            },
            additionalGraphs=[],
        )
Ejemplo n.º 18
0
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        """Build the data drift table: one row and two detail graphs per feature.

        Numerical features are compared with a two-sample Kolmogorov-Smirnov
        test, categorical features with a chi-square test on value counts.
        A feature counts as drifted when its p-value is below 0.05.

        Args:
            reference_data: Reference dataset (baseline distributions, and the
                mean / standard deviation band of the drift plot).
            production_data: Current dataset; indexed unconditionally, so it
                must not be ``None``.
            column_mapping: Optional dict; the 'datetime',
                'numerical_features' and 'categorical_features' entries are
                read. When falsy, features are derived from the dtypes of
                ``reference_data`` minus the 'datetime', 'target' and
                'prediction' columns.

        Side effects:
            Sets ``self.wi`` to a "big_table" ``BaseWidgetInfo``.
        """

        def numeric_only(names):
            # Declared features are kept only when the reference column is
            # numeric; the statistics below call np.isfinite on them.
            if names is None:
                return []
            return [
                name for name in names
                if is_numeric_dtype(reference_data[name])
            ]

        def finite(series):
            # Values that are neither NaN nor +/-inf.
            return series[np.isfinite(series)]

        def chi_square_p_value(ref_values, prod_values):
            # Align both count vectors on the union of observed values; a value
            # missing from one dataset counts as zero there.
            ref_counts = ref_values.value_counts().to_dict()
            prod_counts = prod_values.value_counts().to_dict()
            keys = sorted(set(ref_counts) | set(prod_counts))
            f_obs = np.array([prod_counts.get(key, 0) for key in keys],
                             dtype=float)
            f_exp = np.array([ref_counts.get(key, 0) for key in keys],
                             dtype=float)
            # scipy's chisquare takes (observed, expected) and expects both
            # vectors to have the same total, so the reference counts are
            # rescaled to the production sample size before testing.
            ref_total = f_exp.sum()
            if ref_total:
                f_exp = f_exp * (f_obs.sum() / ref_total)
            return chisquare(f_obs, f_exp)[1]

        if column_mapping:
            date_column = column_mapping.get('datetime')
            num_feature_names = numeric_only(
                column_mapping.get('numerical_features'))
            cat_feature_names = numeric_only(
                column_mapping.get('categorical_features'))
        else:
            date_column = 'datetime' if 'datetime' in reference_data.columns else None
            id_column = None  # no default id column
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

            utility_columns = {
                date_column, id_column, target_column, prediction_column
            }

            # Keep the frame's column order so the table is deterministic.
            num_feature_names = [
                name
                for name in reference_data.select_dtypes([np.number]).columns
                if name not in utility_columns
            ]
            # NOTE(review): object-dtype columns picked here are later passed
            # to np.isfinite, which presumably rejects object arrays - confirm
            # whether categorical features are expected to be numeric codes.
            cat_feature_names = [
                name
                for name in reference_data.select_dtypes([object]).columns
                if name not in utility_columns
            ]

        # set params data: numerical rows first, then categorical rows
        params_data = []
        drifted_features_count = 0
        typed_features = ([(name, 'num') for name in num_feature_names] +
                          [(name, 'cat') for name in cat_feature_names])
        for feature_name, feature_type in typed_features:
            ref_values = finite(reference_data[feature_name])
            prod_values = finite(production_data[feature_name])

            # Small in-table histograms (bin edges as x, densities as y).
            prod_small_hist = np.histogram(prod_values, bins=10, density=True)
            ref_small_hist = np.histogram(ref_values, bins=10, density=True)

            if feature_type == 'num':
                p_value = ks_2samp(reference_data[feature_name],
                                   production_data[feature_name])[1]
            else:
                p_value = chi_square_p_value(ref_values, prod_values)

            drifted = p_value < 0.05
            distr_sim_test = "Detected" if drifted else "Not Detected"
            drifted_features_count += 1 if drifted else 0

            params_data.append({
                "details": {
                    "parts": [{
                        "title": "Data drift",
                        "id": feature_name + "_drift",
                        "type": "widget"
                    }, {
                        "title": "Data distribution",
                        "id": feature_name + "_distr"
                    }],
                    "insights": []
                },
                "f1": feature_name,
                "f6": feature_type,
                "f3": {
                    "x": list(ref_small_hist[1]),
                    "y": list(ref_small_hist[0])
                },
                "f4": {
                    "x": list(prod_small_hist[1]),
                    "y": list(prod_small_hist[0])
                },
                "f2": distr_sim_test,
                "f5": round(p_value, 6)
            })

        # set additionalGraphs: ids must match the "parts" ids of the rows
        additional_graphs_data = []
        for feature_name in num_feature_names + cat_feature_names:

            # plot distributions
            fig = go.Figure()
            fig.add_trace(
                go.Histogram(x=reference_data[feature_name],
                             marker_color=grey,
                             opacity=0.6,
                             nbinsx=10,
                             name='Reference',
                             histnorm='probability'))

            fig.add_trace(
                go.Histogram(x=production_data[feature_name],
                             marker_color=red,
                             opacity=0.6,
                             nbinsx=10,
                             name='Current',
                             histnorm='probability'))

            fig.update_layout(legend=dict(orientation="h",
                                          yanchor="bottom",
                                          y=1.02,
                                          xanchor="right",
                                          x=1),
                              xaxis_title=feature_name,
                              yaxis_title="Share")

            distr_figure = json.loads(fig.to_json())

            # plot drift: current values over a reference mean +/- std band
            ref_values = finite(reference_data[feature_name])
            reference_mean = np.mean(ref_values)
            reference_std = np.std(ref_values, ddof=1)
            x_title = "Timestamp" if date_column else "Index"

            fig = go.Figure()

            fig.add_trace(
                go.Scatter(x=production_data[date_column]
                           if date_column else production_data.index,
                           y=production_data[feature_name],
                           mode='markers',
                           name='Current',
                           marker=dict(size=6, color=grey)))

            fig.update_layout(
                xaxis_title=x_title,
                yaxis_title=feature_name,
                showlegend=True,
                legend=dict(orientation="h",
                            yanchor="bottom",
                            y=1.02,
                            xanchor="right",
                            x=1),
                shapes=[
                    dict(
                        type="rect",
                        # x spans the whole plot width (paper coordinates)
                        xref="paper",
                        # y is expressed in data units of the y axis
                        yref="y",
                        x0=0,
                        y0=reference_mean - reference_std,
                        x1=1,
                        y1=reference_mean + reference_std,
                        fillcolor="LightGreen",
                        opacity=0.5,
                        layer="below",
                        line_width=0,
                    ),
                    dict(
                        type="line",
                        name='Reference',
                        xref="paper",
                        yref="y",
                        x0=0,
                        y0=reference_mean,
                        x1=1,
                        y1=reference_mean,
                        line=dict(color="Green", width=3)),
                ])

            drift_figure = json.loads(fig.to_json())

            # add distributions data
            additional_graphs_data.append(
                AdditionalGraphInfo(feature_name + '_distr', {
                    "data": distr_figure['data'],
                    "layout": distr_figure['layout']
                }))

            # add drift data
            additional_graphs_data.append(
                AdditionalGraphInfo(
                    feature_name + '_drift', {
                        "title": "",
                        "size": 2,
                        "text": "",
                        "type": "big_graph",
                        "params": {
                            "data": drift_figure['data'],
                            "layout": drift_figure['layout']
                        }
                    }))

        feature_count = len(num_feature_names) + len(cat_feature_names)

        self.wi = BaseWidgetInfo(
            title="Data Drift: drift detected for " +
            str(drifted_features_count) + " out of " +
            str(feature_count) + " features",
            type="big_table",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=2,
            params={
                "rowsPerPage":
                min(feature_count, 10),
                "columns": [{
                    "title": "Feature",
                    "field": "f1"
                }, {
                    "title": "Type",
                    "field": "f6"
                }, {
                    "title": "Reference Distribution",
                    "field": "f3",
                    "type": "histogram",
                    "options": {
                        "xField": "x",
                        "yField": "y"
                    }
                }, {
                    "title": "Current Distribution",
                    "field": "f4",
                    "type": "histogram",
                    "options": {
                        "xField": "x",
                        "yField": "y"
                    }
                }, {
                    "title": "Data drift",
                    "field": "f2"
                }, {
                    "title": "P-Value for Similarity Test",
                    "field": "f5",
                    "sort": "asc"
                }],
                "data":
                params_data
            },
            additionalGraphs=additional_graphs_data)
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        """Build the error-bias table widget and store it in ``self.wi``.

        The prediction error is ``prediction - target``.  Rows whose error is
        at or below the 5% quantile are labelled "Underestimation", rows whose
        error is at or above the 95% quantile "Overestimation", and the rest
        "Majority".  Every feature gets one table row (means for numerical
        features, most frequent values for categorical ones, plus a range
        indicator) and one histogram coloured by error group.

        Args:
            reference_data: Reference dataset.  Mutated in place: +/-inf is
                replaced by NaN and every row containing NaN is dropped.
            production_data: Optional current dataset, cleaned in place the
                same way.  When given, the table carries a REF and a CURR
                column group and the histograms are faceted by dataset.
            column_mapping: Optional mapping read through ``.get`` for the
                keys 'target', 'prediction', 'numerical_features' and
                'categorical_features'.  When falsy, the columns named
                'datetime', 'target' and 'prediction' are looked up in
                ``reference_data`` and features are derived from dtypes.

        Returns:
            None.  The widget is stored in ``self.wi``; ``self.wi`` is set to
            ``None`` when the target or prediction column is unknown.
        """

        def numeric_only(names):
            # Keep only the listed columns whose reference dtype is numeric.
            if names is None:
                return []
            return [
                name for name in names
                if is_numeric_dtype(reference_data[name])
            ]

        if column_mapping:
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            num_feature_names = numeric_only(
                column_mapping.get('numerical_features'))
            # NOTE(review): categorical features are also restricted to
            # numeric dtypes, so string-valued categories are silently
            # dropped.  Kept as-is because the other widgets in this file do
            # the same -- confirm whether that is intended.
            cat_feature_names = numeric_only(
                column_mapping.get('categorical_features'))
        else:
            columns = reference_data.columns
            date_column = 'datetime' if 'datetime' in columns else None
            id_column = None
            target_column = 'target' if 'target' in columns else None
            prediction_column = 'prediction' if 'prediction' in columns else None

            utility_columns = {
                date_column, id_column, target_column, prediction_column
            }

            num_feature_names = list(
                set(reference_data.select_dtypes([np.number]).columns) -
                utility_columns)
            # The builtin ``object`` replaces ``np.object``, an alias that was
            # removed from NumPy in release 1.24.
            cat_feature_names = list(
                set(reference_data.select_dtypes([object]).columns) -
                utility_columns)

        # Without both columns there is no error to analyse; report "no
        # widget" instead of failing on ``reference_data[None]``.
        if target_column is None or prediction_column is None:
            self.wi = None
            return

        params_data = []
        additional_graphs_data = []

        def drop_non_finite_rows(frame):
            # In-place cleaning: infinities become NaN, NaN rows are removed.
            frame.replace([np.inf, -np.inf], np.nan, inplace=True)
            frame.dropna(axis=0, how='any', inplace=True)

        def error_bounds(frame):
            # Return the error series with its 5% and 95% quantiles.
            error = frame[prediction_column] - frame[target_column]
            return error, np.quantile(error, .05), np.quantile(error, .95)

        def bias_labels(error, low, high):
            # One label per row, in row order.
            return [
                'Underestimation' if value <= low else
                'Majority' if value < high else 'Overestimation'
                for value in error
            ]

        def numeric_summary(frame, error, low, high, feature_name):
            # Group means plus the over/under gap as a percentage of the
            # feature's full value range.
            values = frame[feature_name]
            under = np.mean(values[error <= low])
            over = np.mean(values[error >= high])
            if over == under:
                spread = 0
            else:
                spread = 100 * abs(over - under) / (np.max(values) -
                                                    np.min(values))
            return {
                'overall': np.mean(values),
                'under': under,
                'expected': np.mean(values[(error > low) & (error < high)]),
                'over': over,
                'range': spread,
            }

        def categorical_summary(frame, error, low, high, feature_name):
            # Most frequent value overall and in both tails; the flag is 1
            # when either tail disagrees with the overall value.
            values = frame[feature_name]
            overall = values.value_counts().idxmax()
            under = values[error <= low].value_counts().idxmax()
            over = values[error >= high].value_counts().idxmax()
            differs = 1 if (overall != under) or (over != overall) else 0
            return overall, under, over, differs

        def histogram_json(frame, feature_name, by_dataset):
            # Histogram coloured by error group, as a JSON-compatible dict.
            bias_order = ["Underestimation", "Overestimation", "Majority"]
            if by_dataset:
                figure = px.histogram(
                    frame,
                    x=feature_name,
                    color='Error bias',
                    facet_col="dataset",
                    histnorm='percent',
                    barmode='overlay',
                    category_orders={
                        "dataset": ["Reference", "Current"],
                        "Error bias": bias_order
                    })
            else:
                figure = px.histogram(
                    frame,
                    x=feature_name,
                    color='Error bias',
                    histnorm='percent',
                    barmode='overlay',
                    category_orders={"Error bias": bias_order})
            return json.loads(figure.to_json())

        def add_row(feature_name, feature_type, values, figure):
            # ``values`` fill the fields f3, f4, ... in order.
            graph_id = feature_name + "_hist"
            row = {
                "details": {
                    "parts": [{
                        "title": "Error bias",
                        "id": graph_id
                    }],
                    "insights": []
                },
                "f1": feature_name,
                "f2": feature_type,
            }
            for position, value in enumerate(values, start=3):
                row["f%d" % position] = value
            params_data.append(row)
            additional_graphs_data.append(
                AdditionalGraphInfo(graph_id, {
                    "data": figure['data'],
                    "layout": figure['layout']
                }))

        def table_columns(titles):
            # Fields are numbered f1..fN; the table sorts on the last one.
            specs = [{
                "title": title,
                "field": "f%d" % position
            } for position, title in enumerate(titles, start=1)]
            specs[-1]["sort"] = "desc"
            return specs

        if production_data is not None:
            drop_non_finite_rows(production_data)
            drop_non_finite_rows(reference_data)

            ref_error, ref_low, ref_high = error_bounds(reference_data)
            prod_error, prod_low, prod_high = error_bounds(production_data)

            # The helper columns exist only long enough to build the merged
            # plotting frame; the callers' frames get them removed again.
            reference_data['dataset'] = 'Reference'
            reference_data['Error bias'] = bias_labels(
                ref_error, ref_low, ref_high)
            production_data['dataset'] = 'Current'
            production_data['Error bias'] = bias_labels(
                prod_error, prod_low, prod_high)
            merged_data = pd.concat([reference_data, production_data])
            reference_data.drop(['dataset', 'Error bias'],
                                axis=1,
                                inplace=True)
            production_data.drop(['dataset', 'Error bias'],
                                 axis=1,
                                 inplace=True)

            for feature_name in num_feature_names:
                ref = numeric_summary(reference_data, ref_error, ref_low,
                                      ref_high, feature_name)
                prod = numeric_summary(production_data, prod_error, prod_low,
                                       prod_high, feature_name)
                add_row(feature_name, 'num', [
                    round(ref['expected'], 2),
                    round(ref['under'], 2),
                    round(ref['over'], 2),
                    round(ref['range'], 2),
                    round(prod['expected'], 2),
                    round(prod['under'], 2),
                    round(prod['over'], 2),
                    round(prod['range'], 2),
                ], histogram_json(merged_data, feature_name, True))

            for feature_name in cat_feature_names:
                ref_overall, ref_under, ref_over, ref_differs = \
                    categorical_summary(reference_data, ref_error, ref_low,
                                        ref_high, feature_name)
                prod_overall, prod_under, prod_over, prod_differs = \
                    categorical_summary(production_data, prod_error, prod_low,
                                        prod_high, feature_name)
                # NOTE(review): the "Majority" cells of categorical rows show
                # the overall most frequent value, not the majority group's.
                add_row(feature_name, 'cat', [
                    str(ref_overall),
                    str(ref_under),
                    str(ref_over),
                    # int, matching f10 and the reference-only table (this
                    # cell used to be the string "0"/"1").
                    int(ref_differs),
                    str(prod_overall),
                    str(prod_under),
                    str(prod_over),
                    int(prod_differs),
                ], histogram_json(merged_data, feature_name, True))

            columns = table_columns([
                "Feature", "Type", "REF: Majority", "REF: Under", "REF: Over",
                "REF: Range(%)", "CURR: Majority", "CURR: Under",
                "CURR: Over", "CURR: Range(%)"
            ])

        else:
            drop_non_finite_rows(reference_data)
            error, low, high = error_bounds(reference_data)

            reference_data['Error bias'] = bias_labels(error, low, high)
            try:
                for feature_name in num_feature_names:
                    ref = numeric_summary(reference_data, error, low, high,
                                          feature_name)
                    # NOTE(review): here "Majority" shows the overall mean,
                    # whereas the two-dataset table shows the majority-group
                    # mean -- confirm which one is intended.
                    add_row(feature_name, 'num', [
                        round(ref['overall'], 2),
                        round(ref['under'], 2),
                        round(ref['over'], 2),
                        round(ref['range'], 2),
                    ], histogram_json(reference_data, feature_name, False))

                for feature_name in cat_feature_names:
                    ref_overall, ref_under, ref_over, ref_differs = \
                        categorical_summary(reference_data, error, low, high,
                                            feature_name)
                    add_row(feature_name, 'cat', [
                        str(ref_overall),
                        str(ref_under),
                        str(ref_over),
                        int(ref_differs),
                    ], histogram_json(reference_data, feature_name, False))
            finally:
                # Never leave the helper column on the caller's frame, even
                # when building a row fails.
                reference_data.drop('Error bias', axis=1, inplace=True)

            columns = table_columns([
                "Feature", "Type", "Majority", "Underestimation",
                "Overestimation", "Range(%)"
            ])

        self.wi = BaseWidgetInfo(
            title=self.title,
            type="big_table",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=2,
            params={
                "rowsPerPage":
                min(len(num_feature_names) + len(cat_feature_names), 10),
                "columns": columns,
                "data": params_data
            },
            additionalGraphs=additional_graphs_data)
Ejemplo n.º 20
0
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        """Build the error-over-time line plot and store it in ``self.wi``.

        The plot shows ``prediction - target`` for the production data,
        together with a horizontal zero line, against the datetime column
        when one is known and against the frame index otherwise.

        Args:
            reference_data: Reference dataset.  Only consulted to detect the
                'datetime', 'target' and 'prediction' columns when no column
                mapping is supplied.
            production_data: Current dataset to plot.  Mutated in place:
                +/-inf is replaced by NaN and every row containing NaN is
                dropped.
            column_mapping: Optional mapping read through ``.get`` for the
                keys 'datetime', 'target' and 'prediction'.

        Returns:
            None.  The widget is stored in ``self.wi``; ``self.wi`` is set to
            ``None`` when there is no production data or when the target or
            prediction column is unknown.
        """
        # Feature-name lists are deliberately not derived here: the plot
        # never used them, and deriving them touched ``np.object``, which
        # was removed from NumPy in release 1.24.
        if column_mapping:
            date_column = column_mapping.get('datetime')
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
        else:
            known_columns = reference_data.columns
            date_column = 'datetime' if 'datetime' in known_columns else None
            target_column = 'target' if 'target' in known_columns else None
            prediction_column = 'prediction' if 'prediction' in known_columns else None

        if (production_data is None or target_column is None
                or prediction_column is None):
            self.wi = None
            return

        production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        production_data.dropna(axis=0, how='any', inplace=True)

        # Both traces share the same x axis.
        x_values = production_data[
            date_column] if date_column else production_data.index

        pred_actual_time = go.Figure()

        # ``red`` is not defined in this method; presumably a module-level
        # colour constant -- verify against the top of the file.
        error_trace = go.Scatter(
            x=x_values,
            y=production_data[prediction_column] -
            production_data[target_column],
            mode='lines',
            name='Predicted - Actual',
            marker=dict(size=6, color=red))

        # Reference line at zero error, kept out of the legend.
        zero_trace = go.Scatter(
            x=x_values,
            y=[0] * production_data.shape[0],
            mode='lines',
            opacity=0.5,
            marker=dict(
                size=6,
                color='green',
            ),
            showlegend=False,
        )

        pred_actual_time.add_trace(error_trace)
        pred_actual_time.add_trace(zero_trace)

        pred_actual_time.update_layout(
            xaxis_title="Timestamp" if date_column else "Index",
            yaxis_title="Error",
            legend=dict(orientation="h",
                        yanchor="bottom",
                        y=1.02,
                        xanchor="right",
                        x=1))

        pred_actual_time_json = json.loads(pred_actual_time.to_json())

        self.wi = BaseWidgetInfo(
            title=self.title,
            type="big_graph",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=1,
            params={
                "data": pred_actual_time_json['data'],
                "layout": pred_actual_time_json['layout']
            },
            additionalGraphs=[],
        )
Ejemplo n.º 21
0
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        if column_mapping:
            date_column = column_mapping.get('datetime')
            id_column = column_mapping.get('id')
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            num_feature_names = column_mapping.get('numerical_features')
            #target_names = column_mapping.get('target_names')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [
                    name for name in num_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                cat_feature_names = [
                    name for name in cat_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

        else:
            date_column = 'datetime' if 'datetime' in reference_data.columns else None
            id_column = None
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

            utility_columns = [
                date_column, id_column, target_column, prediction_column
            ]

            num_feature_names = list(
                set(reference_data.select_dtypes([np.number]).columns) -
                set(utility_columns))
            cat_feature_names = list(
                set(reference_data.select_dtypes([np.object]).columns) -
                set(utility_columns))

            #target_names = None

        if production_data is not None and target_column is not None and prediction_column is not None:
            production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
            production_data.dropna(axis=0, how='any', inplace=True)

            #array_prediction = reference_data[prediction_column].to_numpy()

            #prediction_ids = np.argmax(array_prediction, axis=-1)
            #prediction_labels = [prediction_column[x] for x in prediction_ids]
            if len(prediction_column) <= 2:
                binaraizer = preprocessing.LabelBinarizer()
                binaraizer.fit(reference_data[target_column])
                binaraized_target = pd.DataFrame(
                    binaraizer.transform(production_data[target_column]))
                binaraized_target.columns = ['target']

                params_data = []

                step_size = 0.05
                binded = list(
                    zip(binaraized_target['target'].tolist(),
                        production_data[prediction_column[0]].tolist()))
                binded.sort(key=lambda item: item[1], reverse=True)

                data_size = len(binded)
                target_class_size = sum([x[0] for x in binded])

                #result = pd.DataFrame(columns = ['Top(%)', 'Count', 'TP', 'FP', 'precision', 'recall'])

                offset = max(round(data_size * step_size), 1)
                for step in np.arange(offset, data_size + offset, offset):
                    count = min(step, data_size)
                    prob = round(binded[min(step, data_size - 1)][1], 2)
                    top = round(100.0 * min(step, data_size) / data_size, 1)
                    tp = sum([x[0] for x in binded[:min(step, data_size)]])
                    fp = count - tp
                    precision = round(100.0 * tp / count, 1)
                    recall = round(100.0 * tp / target_class_size, 1)

                    params_data.append({
                        'f1': float(top),
                        'f2': int(count),
                        'f3': float(prob),
                        'f4': int(tp),
                        'f5': int(fp),
                        'f6': float(precision),
                        'f7': float(recall)
                    })

                self.wi = BaseWidgetInfo(title=self.title,
                                         type="big_table",
                                         details="",
                                         alertStats=AlertStats(),
                                         alerts=[],
                                         alertsPosition="row",
                                         insights=[],
                                         size=1,
                                         params={
                                             "rowsPerPage":
                                             21,
                                             "columns": [{
                                                 "title": "Top(%)",
                                                 "field": "f1",
                                                 "sort": "asc"
                                             }, {
                                                 "title": "Count",
                                                 "field": "f2",
                                             }, {
                                                 "title": "Prob",
                                                 "field": "f3",
                                             }, {
                                                 "title": "TP",
                                                 "field": "f4"
                                             }, {
                                                 "title": "FP",
                                                 "field": "f5"
                                             }, {
                                                 "title": "Precision",
                                                 "field": "f6"
                                             }, {
                                                 "title": "Recall",
                                                 "field": "f7"
                                             }],
                                             "data":
                                             params_data
                                         },
                                         additionalGraphs=[])
            else:
                binaraizer = preprocessing.LabelBinarizer()
                binaraizer.fit(reference_data[target_column])
                binaraized_target = pd.DataFrame(
                    binaraizer.transform(production_data[target_column]))
                binaraized_target.columns = prediction_column

                #create tables
                tabs = []

                for label in prediction_column:
                    params_data = []

                    step_size = 0.05
                    binded = list(
                        zip(binaraized_target[label].tolist(),
                            production_data[label].tolist()))
                    binded.sort(key=lambda item: item[1], reverse=True)

                    data_size = len(binded)
                    target_class_size = sum([x[0] for x in binded])

                    #result = pd.DataFrame(columns = ['Top(%)', 'Count', 'TP', 'FP', 'precision', 'recall'])

                    offset = max(round(data_size * step_size), 1)
                    for step in np.arange(offset, data_size + offset, offset):
                        count = min(step, data_size)
                        prob = round(binded[min(step, data_size - 1)][1], 2)
                        top = round(100.0 * min(step, data_size) / data_size,
                                    1)
                        tp = sum([x[0] for x in binded[:min(step, data_size)]])
                        fp = count - tp
                        precision = round(100.0 * tp / count, 1)
                        recall = round(100.0 * tp / target_class_size, 1)

                        params_data.append({
                            'f1': float(top),
                            'f2': int(count),
                            'f3': float(prob),
                            'f4': int(tp),
                            'f5': int(fp),
                            'f6': float(precision),
                            'f7': float(recall)
                        })

                    tabs.append(
                        TabInfo(id=label,
                                title=label,
                                widget=BaseWidgetInfo(title="",
                                                      type="big_table",
                                                      details="",
                                                      alertStats=AlertStats(),
                                                      alerts=[],
                                                      alertsPosition="row",
                                                      insights=[],
                                                      size=2,
                                                      params={
                                                          "rowsPerPage":
                                                          21,
                                                          "columns": [{
                                                              "title":
                                                              "Top(%)",
                                                              "field":
                                                              "f1",
                                                              "sort":
                                                              "asc"
                                                          }, {
                                                              "title":
                                                              "Count",
                                                              "field":
                                                              "f2",
                                                          }, {
                                                              "title":
                                                              "Prob",
                                                              "field":
                                                              "f3",
                                                          }, {
                                                              "title":
                                                              "TP",
                                                              "field":
                                                              "f4"
                                                          }, {
                                                              "title":
                                                              "FP",
                                                              "field":
                                                              "f5"
                                                          }, {
                                                              "title":
                                                              "Precision",
                                                              "field":
                                                              "f6"
                                                          }, {
                                                              "title":
                                                              "Recall",
                                                              "field":
                                                              "f7"
                                                          }],
                                                          "data":
                                                          params_data
                                                      },
                                                      additionalGraphs=[])))

                self.wi = BaseWidgetInfo(type="tabs",
                                         title=self.title,
                                         size=1,
                                         details="",
                                         tabs=tabs)
        else:
            self.wi = None
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        """Build a heatmap of per-class quality metrics for production data.

        Resolves the target and prediction column names, cleans
        ``production_data`` and stores the resulting ``big_graph`` widget in
        ``self.wi``. ``self.wi`` is set to ``None`` when there is no
        production data or either column name cannot be resolved.

        Args:
            reference_data: Reference frame. Only inspected for the default
                ``'target'`` / ``'prediction'`` columns when no mapping is
                given.
            production_data: Frame holding the target and prediction columns.
                It is modified in place: infinities become NaN and every row
                containing a NaN is dropped. May be ``None``.
            column_mapping: Optional dict-like with ``'target'``,
                ``'prediction'`` and ``'target_names'`` entries.

        Returns:
            None. The result is written to ``self.wi``.
        """
        if column_mapping:
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            target_names = column_mapping.get('target_names')
        else:
            # No mapping supplied: fall back to the conventional column names
            # when the reference frame has them.
            columns = reference_data.columns
            target_column = 'target' if 'target' in columns else None
            prediction_column = 'prediction' if 'prediction' in columns else None
            target_names = None

        if (production_data is None or target_column is None
                or prediction_column is None):
            self.wi = None
            return

        production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        production_data.dropna(axis=0, how='any', inplace=True)

        metrics_matrix = metrics.classification_report(
            production_data[target_column],
            production_data[prediction_column],
            output_dict=True)
        metrics_frame = pd.DataFrame(metrics_matrix)

        # Drop the last row and the last three columns of the report.
        # NOTE(review): assuming scikit-learn's report layout these are the
        # `support` row and the accuracy / macro avg / weighted avg columns.
        z = metrics_frame.iloc[:-1, :-3].values
        x = target_names if target_names else metrics_frame.columns.tolist()[:-3]
        y = ['precision', 'recall', 'f1-score']

        # The annotated heatmap needs the cell labels as strings.
        z_text = [[str(round(value, 3)) for value in row] for row in z]

        fig = ff.create_annotated_heatmap(z,
                                          x=x,
                                          y=y,
                                          annotation_text=z_text,
                                          colorscale='bluered',
                                          showscale=True)
        fig.update_layout(xaxis_title="Class", yaxis_title="Metric")

        metrics_matrix_json = json.loads(fig.to_json())

        self.wi = BaseWidgetInfo(
            title=self.title,
            type="big_graph",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=1,
            params={
                "data": metrics_matrix_json['data'],
                "layout": metrics_matrix_json['layout']
            },
            additionalGraphs=[],
        )
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        """Build a table of features with a target/prediction scatter plot each.

        For every numerical and categorical feature a two-panel figure
        ("Reference" / "Production") is created that plots the prediction
        and/or target column against the feature. The figures are attached to
        a ``big_table`` widget stored in ``self.wi``. ``self.wi`` is set to
        ``None`` when neither a target nor a prediction column is available.

        Args:
            reference_data: Reference frame; also used to infer columns and
                feature lists when no mapping is given.
            production_data: Production frame, plotted in the second panel.
            column_mapping: Optional dict-like with ``'target'``,
                ``'prediction'``, ``'numerical_features'`` and
                ``'categorical_features'`` entries.

        Returns:
            None. The result is written to ``self.wi``.
        """
        if column_mapping:
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')

            num_feature_names = column_mapping.get('numerical_features')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [
                    name for name in num_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                # NOTE(review): categorical features are also filtered by
                # numeric dtype, which looks like a copy-paste of the
                # numerical filter. Behaviour kept; confirm the intent.
                cat_feature_names = [
                    name for name in cat_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]
        else:
            columns = reference_data.columns
            date_column = 'datetime' if 'datetime' in columns else None
            id_column = None
            target_column = 'target' if 'target' in columns else None
            prediction_column = 'prediction' if 'prediction' in columns else None

            utility_columns = {
                date_column, id_column, target_column, prediction_column
            }

            num_feature_names = list(
                set(reference_data.select_dtypes([np.number]).columns) -
                utility_columns)
            # `np.object` was removed in NumPy 1.24; the builtin `object`
            # selects the same dtype.
            cat_feature_names = list(
                set(reference_data.select_dtypes([object]).columns) -
                utility_columns)

        if prediction_column is None and target_column is None:
            self.wi = None
            return

        feature_names = num_feature_names + cat_feature_names

        # (y-axis column, legend label, marker colour), in drawing order.
        plotted_series = (
            (prediction_column, 'Prediction', grey),
            (target_column, 'Target', red),
        )
        # (frame, legend suffix, subplot column).
        panels = (
            (reference_data, 'ref', 1),
            (production_data, 'prod', 2),
        )

        additional_graphs_data = []
        params_data = []
        for feature_name in feature_names:
            graph_id = feature_name + '_values'

            # Table row; its details part points at the figure built below.
            params_data.append({
                "details": {
                    "parts": [{
                        "title": "Feature values",
                        "id": graph_id
                    }],
                    "insights": []
                },
                "f1": feature_name
            })

            fig = make_subplots(rows=1,
                                cols=2,
                                subplot_titles=("Reference", "Production"))

            for frame, suffix, panel_col in panels:
                for series_column, label, colour in plotted_series:
                    if series_column is None:
                        continue
                    fig.add_trace(go.Scatter(
                        x=frame[feature_name],
                        y=frame[series_column],
                        mode='markers',
                        name=label + ' (' + suffix + ')',
                        marker=dict(size=6, color=colour)),
                                  row=1,
                                  col=panel_col)

                fig.update_xaxes(title_text=feature_name,
                                 showgrid=True,
                                 row=1,
                                 col=panel_col)
                fig.update_yaxes(title_text="Value",
                                 showgrid=True,
                                 row=1,
                                 col=panel_col)

            fig_json = json.loads(fig.to_json())

            additional_graphs_data.append(
                AdditionalGraphInfo(graph_id, {
                    "data": fig_json['data'],
                    "layout": fig_json['layout']
                }))

        self.wi = BaseWidgetInfo(
            title=self.title,
            type="big_table",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=2,
            params={
                "rowsPerPage": min(len(feature_names), 10),
                "columns": [{
                    "title": "Feature",
                    "field": "f1"
                }],
                "data": params_data
            },
            additionalGraphs=additional_graphs_data)
Ejemplo n.º 24
0
    def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
        """Compute classification quality counters on the reference data.

        Stores a ``counter`` widget with accuracy, macro precision / recall /
        F1, ROC AUC and log loss in ``self.wi``. ``self.wi`` is set to
        ``None`` when the target or prediction columns cannot be resolved.

        Args:
            reference_data: Frame holding the target and prediction columns.
                It is modified in place: infinities become NaN and every row
                containing a NaN is dropped.
            production_data: Not used by this widget.
            column_mapping: Optional dict-like with ``'target'`` and
                ``'prediction'`` entries. The prediction entry is indexed as
                a sequence of column names (presumably one probability
                column per class label, to be confirmed against callers).

        Returns:
            None. The result is written to ``self.wi``.
        """
        if column_mapping:
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
        else:
            columns = reference_data.columns
            target_column = 'target' if 'target' in columns else None
            prediction_column = 'prediction' if 'prediction' in columns else None

        if target_column is None or prediction_column is None:
            self.wi = None
            return

        reference_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        reference_data.dropna(axis=0, how='any', inplace=True)

        target = reference_data[target_column]

        binarizer = preprocessing.LabelBinarizer()
        binarizer.fit(target)
        binarized_target = binarizer.transform(target)

        array_prediction = reference_data[prediction_column].to_numpy()

        # The predicted label is the name of the column with the highest
        # value in each row.
        prediction_ids = np.argmax(array_prediction, axis=-1)
        prediction_labels = [prediction_column[idx] for idx in prediction_ids]

        if len(prediction_column) > 2:
            roc_auc = metrics.roc_auc_score(binarized_target, array_prediction, average='macro')
            log_loss = metrics.log_loss(binarized_target, array_prediction)
        else:
            # NOTE(review): the original author marked both calls below as
            # problematic. They always score the FIRST prediction column,
            # which may not be the class the binarizer encodes as 1; verify.
            first_class_scores = reference_data[prediction_column[0]]
            roc_auc = metrics.roc_auc_score(binarized_target, first_class_scores, average='macro')
            log_loss = metrics.log_loss(binarized_target, first_class_scores)

        accuracy_score = metrics.accuracy_score(target, prediction_labels)
        avg_precision = metrics.precision_score(target, prediction_labels, average='macro')
        avg_recall = metrics.recall_score(target, prediction_labels, average='macro')
        avg_f1 = metrics.f1_score(target, prediction_labels, average='macro')

        counters = [
            {"value": str(round(score, 3)), "label": label}
            for label, score in (
                ("Accuracy", accuracy_score),
                ("Precision", avg_precision),
                ("Recall", avg_recall),
                ("F1", avg_f1),
                ("ROC AUC", roc_auc),
                ("LogLoss", log_loss),
            )
        ]

        self.wi = BaseWidgetInfo(
            title=self.title,
            type="counter",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=2,
            params={"counters": counters},
            additionalGraphs=[],
        )
Ejemplo n.º 25
0
    def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
        """Compute classification quality counters on the production data.

        Stores a ``counter`` widget with accuracy and macro-averaged
        precision, recall and F1 in ``self.wi``. ``self.wi`` is set to
        ``None`` when there is no production data or the target / prediction
        columns cannot be resolved.

        Args:
            reference_data: Reference frame. Only inspected for the default
                ``'target'`` / ``'prediction'`` columns when no mapping is
                given.
            production_data: Frame holding the target and prediction columns.
                It is modified in place: infinities become NaN and every row
                containing a NaN is dropped. May be ``None``.
            column_mapping: Optional dict-like with ``'target'`` and
                ``'prediction'`` entries.

        Returns:
            None. The result is written to ``self.wi``.
        """
        if column_mapping:
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
        else:
            columns = reference_data.columns
            target_column = 'target' if 'target' in columns else None
            prediction_column = 'prediction' if 'prediction' in columns else None

        if production_data is None or target_column is None or prediction_column is None:
            self.wi = None
            return

        production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        production_data.dropna(axis=0, how='any', inplace=True)

        y_true = production_data[target_column]
        y_pred = production_data[prediction_column]

        accuracy_score = metrics.accuracy_score(y_true, y_pred)
        avg_precision = metrics.precision_score(y_true, y_pred, average='macro')
        avg_recall = metrics.recall_score(y_true, y_pred, average='macro')
        avg_f1 = metrics.f1_score(y_true, y_pred, average='macro')

        counters = [
            {"value": str(round(score, 3)), "label": label}
            for label, score in (
                ("Accuracy", accuracy_score),
                ("Precision", avg_precision),
                ("Recall", avg_recall),
                ("F1", avg_f1),
            )
        ]

        self.wi = BaseWidgetInfo(
            title=self.title,
            type="counter",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=2,
            params={"counters": counters},
            additionalGraphs=[],
        )
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        """Build the title counter of the regression performance report.

        Stores a ``counter`` widget whose single label names the target
        column in ``self.wi``. ``self.wi`` is set to ``None`` when the target
        or prediction column cannot be resolved.

        Args:
            reference_data: Reference frame. Only inspected for the default
                ``'target'`` / ``'prediction'`` columns when no mapping is
                given.
            production_data: Not used by this widget.
            column_mapping: Optional dict-like with ``'target'`` and
                ``'prediction'`` entries.

        Returns:
            None. The result is written to ``self.wi``.
        """
        if column_mapping:
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
        else:
            columns = reference_data.columns
            target_column = 'target' if 'target' in columns else None
            prediction_column = 'prediction' if 'prediction' in columns else None

        if target_column is None or prediction_column is None:
            self.wi = None
            return

        report_label = ("Regression Model Performance Report. Target:'" +
                        target_column + "'")

        self.wi = BaseWidgetInfo(
            title=self.title,
            type="counter",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=2,
            params={
                "counters": [{
                    "value": "",
                    "label": report_label
                }]
            },
            additionalGraphs=[],
        )
Ejemplo n.º 27
0
    def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
        """Compute regression error counters on the reference data.

        Stores a ``counter`` widget with the mean error, mean absolute error
        and mean absolute percentage error in ``self.wi``. Each value is
        followed by its sample standard deviation (``ddof=1``) in
        parentheses. ``self.wi`` is set to ``None`` when the target or
        prediction column cannot be resolved.

        Args:
            reference_data: Frame holding the target and prediction columns.
                It is modified in place: infinities become NaN and every row
                containing a NaN is dropped.
            production_data: Not used by this widget.
            column_mapping: Optional dict-like with ``'target'`` and
                ``'prediction'`` entries.

        Returns:
            None. The result is written to ``self.wi``.
        """
        if column_mapping:
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
        else:
            columns = reference_data.columns
            target_column = 'target' if 'target' in columns else None
            prediction_column = 'prediction' if 'prediction' in columns else None

        if target_column is None or prediction_column is None:
            self.wi = None
            return

        reference_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        reference_data.dropna(axis=0, how='any', inplace=True)

        # Fetch the columns only after the in-place cleaning above.
        target = reference_data[target_column]
        prediction = reference_data[prediction_column]

        signed_err = prediction - target
        me = np.mean(signed_err)
        sde = np.std(signed_err, ddof=1)

        abs_err = [abs(actual - predicted)
                   for actual, predicted in zip(target, prediction)]
        mae = np.mean(abs_err)
        sdae = np.std(abs_err, ddof=1)

        # NOTE(review): the percentage error divides by the target value, so
        # a target of exactly 0 is not handled here; confirm the desired
        # behaviour for such rows.
        abs_perc_err = [100 * abs(actual - predicted) / actual
                        for actual, predicted in zip(target, prediction)]
        mape = np.mean(abs_perc_err)
        sdape = np.std(abs_perc_err, ddof=1)

        counters = [
            {
                "value": str(round(mean, 2)) + " (" + str(round(std, 2)) + ")",
                "label": label
            }
            for label, mean, std in (
                ("ME", me, sde),
                ("MAE", mae, sdae),
                ("MAPE", mape, sdape),
            )
        ]

        self.wi = BaseWidgetInfo(
            title="Reference: Model Quality (+/- std)",
            type="counter",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=2,
            params={"counters": counters},
            additionalGraphs=[],
        )
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        """Build a Q-Q plot of the production prediction error.

        The error (prediction minus target) is passed to ``probplot`` against
        the normal distribution. The sample quantiles are drawn as markers
        and the fitted line as a line trace. The ``big_graph`` widget is
        stored in ``self.wi``. ``self.wi`` is set to ``None`` when there is
        no production data or the target / prediction columns cannot be
        resolved.

        Args:
            reference_data: Reference frame. Only inspected for the default
                ``'target'`` / ``'prediction'`` columns when no mapping is
                given.
            production_data: Frame holding the target and prediction columns.
                It is modified in place: infinities become NaN and every row
                containing a NaN is dropped. May be ``None``.
            column_mapping: Optional dict-like with ``'target'`` and
                ``'prediction'`` entries.

        Returns:
            None. The result is written to ``self.wi``.
        """
        if column_mapping:
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
        else:
            columns = reference_data.columns
            target_column = 'target' if 'target' in columns else None
            prediction_column = 'prediction' if 'prediction' in columns else None

        if (production_data is None or target_column is None
                or prediction_column is None):
            self.wi = None
            return

        production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        production_data.dropna(axis=0, how='any', inplace=True)

        error = production_data[prediction_column] - production_data[
            target_column]

        # First element: (theoretical quantiles, ordered sample values).
        # Second element starts with the slope and intercept of the fit.
        (theoretical_q, sample_q), fit_line = probplot(error,
                                                       dist="norm",
                                                       plot=None)
        slope, intercept = fit_line[0], fit_line[1]

        # Evaluate the fitted line over the range of theoretical quantiles.
        theoretical_q_x = np.linspace(theoretical_q[0], theoretical_q[-1],
                                      100)

        sample_quantile_trace = go.Scatter(x=theoretical_q,
                                           y=sample_q,
                                           mode='markers',
                                           name='Dataset Quantiles',
                                           marker=dict(size=6, color=red))

        theoretical_quantile_trace = go.Scatter(
            x=theoretical_q_x,
            y=slope * theoretical_q_x + intercept,
            mode='lines',
            name='Theoretical Quantiles',
            marker=dict(size=6, color=grey))

        error_norm = go.Figure()
        error_norm.add_trace(sample_quantile_trace)
        error_norm.add_trace(theoretical_quantile_trace)

        error_norm.update_layout(xaxis_title="Theoretical Quantiles",
                                 yaxis_title="Dataset Quantiles",
                                 legend=dict(orientation="h",
                                             yanchor="bottom",
                                             y=1.02,
                                             xanchor="right",
                                             x=1))

        error_norm_json = json.loads(error_norm.to_json())

        self.wi = BaseWidgetInfo(
            title=self.title,
            type="big_graph",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=1,
            params={
                "data": error_norm_json['data'],
                "layout": error_norm_json['layout']
            },
            additionalGraphs=[],
        )
Ejemplo n.º 29
0
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        """Build the precision-recall curve widget from the reference data.

        The result is stored in ``self.wi``; nothing is returned.

        Args:
            reference_data: frame holding the target column and the
                prediction column(s). It is cleaned IN PLACE: +/-inf values
                are replaced by NaN and every row containing a NaN is dropped.
            production_data: only checked against ``None`` to choose the
                widget size (1 when present, 2 otherwise).
            column_mapping: optional mapping read with ``.get`` for the keys
                ``'target'`` and ``'prediction'``. When falsy, the columns
                named ``'target'`` / ``'prediction'`` are used if they exist
                in ``reference_data``.

        Behaviour:
            * target or prediction unresolved -> ``self.wi = None``.
            * one or two prediction columns -> a single ``big_graph`` widget
              scored with the first prediction column.
            * more prediction columns -> a ``tabbed_graph`` widget with one
              precision-recall curve per prediction column.
        """
        if column_mapping:
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
        else:
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

        if target_column is None or prediction_column is None:
            self.wi = None
            return

        # A bare column name would otherwise be measured and iterated
        # character by character below; treat it as a one-column list.
        if isinstance(prediction_column, str):
            prediction_column = [prediction_column]

        # Side effect kept from the original implementation: the caller's
        # frame is cleaned in place.
        reference_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        reference_data.dropna(axis=0, how='any', inplace=True)

        def pr_curve_json(y_true, y_score):
            """Return the plotly JSON dict of one precision-recall curve."""
            precision, recall, _ = metrics.precision_recall_curve(
                y_true, y_score)
            fig = go.Figure()
            # Recall goes on x and precision on y so the data matches the
            # axis titles set just below.
            fig.add_trace(
                go.Scatter(x=recall,
                           y=precision,
                           mode='lines',
                           name='PR',
                           marker=dict(
                               size=6,
                               color=red,
                           )))
            fig.update_layout(yaxis_title="Precision",
                              xaxis_title="Recall",
                              showlegend=True)
            return json.loads(fig.to_json())

        binarizer = preprocessing.LabelBinarizer()
        binarizer.fit(reference_data[target_column])
        binarized_target = pd.DataFrame(
            binarizer.transform(reference_data[target_column]))
        widget_size = 1 if production_data is not None else 2

        if len(prediction_column) <= 2:
            binarized_target.columns = ['target']
            # NOTE(review): the original code flagged this line as a
            # "problem" - the scores of the FIRST prediction column are
            # compared with the binarized target; confirm that this column
            # really is the score of the class the binarizer encodes as 1.
            fig_json = pr_curve_json(binarized_target['target'],
                                     reference_data[prediction_column[0]])

            self.wi = BaseWidgetInfo(
                title=self.title,
                type="big_graph",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=widget_size,
                params={
                    "data": fig_json['data'],
                    "layout": fig_json['layout']
                },
                additionalGraphs=[],
            )
            return

        # NOTE(review): assumes the prediction columns are listed in the
        # same order as the binarizer's output columns - confirm with caller.
        binarized_target.columns = prediction_column

        graphs = []
        for label in prediction_column:
            fig_json = pr_curve_json(binarized_target[label],
                                     reference_data[label])
            graphs.append({
                "id": "tab_" + str(label),
                "title": str(label),
                "graph": {
                    "data": fig_json["data"],
                    "layout": fig_json["layout"],
                }
            })

        self.wi = BaseWidgetInfo(
            title=self.title,
            type="tabbed_graph",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=widget_size,
            params={"graphs": graphs},
            additionalGraphs=[],
        )
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        """Build the confusion-matrix heatmap widget from the reference data.

        The result is stored in ``self.wi``; nothing is returned.

        Args:
            reference_data: frame holding the target column and the
                prediction columns. It is cleaned IN PLACE: +/-inf values are
                replaced by NaN and every row containing a NaN is dropped.
            production_data: only checked against ``None`` to choose the
                widget size (1 when present, 2 otherwise).
            column_mapping: optional mapping read with ``.get`` for the keys
                ``'target'`` and ``'prediction'``. When falsy, the columns
                named ``'target'`` / ``'prediction'`` are used if they exist
                in ``reference_data``.

        The predicted label of a row is the name of the prediction column
        holding the row-wise maximum. When the target or the prediction
        cannot be resolved, ``self.wi`` is set to ``None``.
        """
        if column_mapping:
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
        else:
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

        if target_column is None or prediction_column is None:
            self.wi = None
            return

        # Side effect kept from the original implementation: the caller's
        # frame is cleaned in place.
        reference_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        reference_data.dropna(axis=0, how='any', inplace=True)

        # NOTE(review): this expects `prediction_column` to be a list of
        # column names; a single column name given as a string does not
        # survive the argmax/iteration below - confirm the intended input.
        array_prediction = reference_data[prediction_column].to_numpy()
        prediction_ids = np.argmax(array_prediction, axis=-1)
        prediction_labels = [prediction_column[x] for x in prediction_ids]

        # Use one label list for both the matrix and the heatmap axes. It
        # must cover predicted classes that never occur in the target,
        # otherwise the matrix would be larger than the axis labels.
        actual_values = reference_data[target_column]
        labels = sorted(set(actual_values) | set(prediction_labels))

        conf_matrix = metrics.confusion_matrix(actual_values,
                                               prediction_labels,
                                               labels=labels)
        z = conf_matrix.astype(int)

        # Cell annotations have to be strings.
        z_text = [[str(cell) for cell in row] for row in z]

        fig = ff.create_annotated_heatmap(z,
                                          x=labels,
                                          y=labels,
                                          annotation_text=z_text,
                                          colorscale='bluered',
                                          showscale=True)

        fig.update_layout(xaxis_title="Predicted value",
                          yaxis_title="Actual value")

        conf_matrix_json = json.loads(fig.to_json())

        self.wi = BaseWidgetInfo(
            title=self.title,
            type="big_graph",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=1 if production_data is not None else 2,
            params={
                "data": conf_matrix_json['data'],
                "layout": conf_matrix_json['layout']
            },
            additionalGraphs=[],
        )