def calculate(self, reference_data: pandas.DataFrame, production_data: pandas.DataFrame, _: Dict):
    self.wi = BaseWidgetInfo(
        type="counter",
        title=self.title,
        size=2,
        params={
            "counters": [{
                "value": "7 out of 12 features",
                "label": "Data Drift Detected"
            }]
        },
        alerts=[],
        insights=[],
        details="",
        alertsPosition="row",
        alertStats=AlertStats(),
        additionalGraphs=[],
    )
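# A minimal sketch (not part of the widget above) of deriving the counter text
# from actual per-feature drift results instead of hardcoding the string.
# `drift_flags` is a hypothetical dict produced by a per-feature drift test.
drift_flags = {"temp": True, "humidity": False, "windspeed": True}

n_drifted = sum(drift_flags.values())
counter_value = f"{n_drifted} out of {len(drift_flags)} features"  # "2 out of 3 features"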
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        target_names = column_mapping.get('target_names')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [name for name in num_feature_names
                                 if is_numeric_dtype(reference_data[name])]
        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            cat_feature_names = [name for name in cat_feature_names
                                 if is_numeric_dtype(reference_data[name])]
    else:
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

        utility_columns = [date_column, id_column, target_column, prediction_column]
        num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns)
                                 - set(utility_columns))
        # np.object was removed from recent NumPy; the builtin `object` selects the same dtype
        cat_feature_names = list(set(reference_data.select_dtypes([object]).columns)
                                 - set(utility_columns))
        target_names = None

    if target_column is not None and prediction_column is not None:
        reference_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        reference_data.dropna(axis=0, how='any', inplace=True)

        binarizer = preprocessing.LabelBinarizer()
        binarizer.fit(reference_data[target_column])
        binarized_target = binarizer.transform(reference_data[target_column])

        # prediction_column is a list of per-class probability columns;
        # argmax over them recovers the predicted label
        array_prediction = reference_data[prediction_column].to_numpy()
        prediction_ids = np.argmax(array_prediction, axis=-1)
        prediction_labels = [prediction_column[x] for x in prediction_ids]

        # plot precision/recall/f1 (and ROC AUC for multiclass) as a heatmap
        metrics_matrix = metrics.classification_report(reference_data[target_column],
                                                       prediction_labels, output_dict=True)
        metrics_frame = pd.DataFrame(metrics_matrix)
        z = metrics_frame.iloc[:-1, :-3].values
        x = prediction_column
        y = ['precision', 'recall', 'f1-score']

        if len(prediction_column) > 2:
            roc_aucs = metrics.roc_auc_score(binarized_target, array_prediction, average=None)
            z = np.append(z, [roc_aucs], axis=0)
            y.append('roc-auc')

        # convert each element of z to a string for the cell annotations
        z_text = [[str(round(v, 3)) for v in row] for row in z]

        # set up figure
        fig = ff.create_annotated_heatmap(z, y=y, x=x, annotation_text=z_text,
                                          colorscale='bluered', showscale=True)
        fig.update_layout(xaxis_title="Class", yaxis_title="Metric")

        metrics_matrix_json = json.loads(fig.to_json())

        self.wi = BaseWidgetInfo(
            title=self.title,
            type="big_graph",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=1 if production_data is not None else 2,
            params={
                "data": metrics_matrix_json['data'],
                "layout": metrics_matrix_json['layout']
            },
            additionalGraphs=[],
        )
    else:
        self.wi = None
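# A minimal standalone sketch of the metrics-heatmap step above, using
# hypothetical labels. classification_report(output_dict=True) becomes a frame
# whose last row is support and whose last three columns are accuracy and the
# macro / weighted averages, hence the iloc[:-1, :-3] slice.
import json
import pandas as pd
import plotly.figure_factory as ff
from sklearn import metrics

y_true = ["cat", "dog", "cat", "bird", "dog", "bird"]
y_pred = ["cat", "dog", "bird", "bird", "dog", "cat"]

frame = pd.DataFrame(metrics.classification_report(y_true, y_pred, output_dict=True))
z = frame.iloc[:-1, :-3].values
x = frame.columns.tolist()[:-3]          # class labels
y = ["precision", "recall", "f1-score"]  # metric names

z_text = [[str(round(v, 3)) for v in row] for row in z]
fig = ff.create_annotated_heatmap(z, x=x, y=y, annotation_text=z_text,
                                  colorscale="bluered", showscale=True)
payload = json.loads(fig.to_json())      # same dict shape the widget consumes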
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [name for name in num_feature_names
                                 if is_numeric_dtype(reference_data[name])]
        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            cat_feature_names = [name for name in cat_feature_names
                                 if is_numeric_dtype(reference_data[name])]
    else:
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

        utility_columns = [date_column, id_column, target_column, prediction_column]
        num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns)
                                 - set(utility_columns))
        cat_feature_names = list(set(reference_data.select_dtypes([object]).columns)
                                 - set(utility_columns))

    if target_column is not None:
        # calculate correlations between the numerical features and the target
        ref_target_corr = reference_data[num_feature_names + [target_column]].corr()[target_column]
        prod_target_corr = production_data[num_feature_names + [target_column]].corr()[target_column]

        # plot target correlations; both frames share the same columns,
        # so the reference index labels both bar traces
        target_corr = go.Figure()
        target_corr.add_trace(go.Bar(y=ref_target_corr, x=ref_target_corr.index,
                                     marker_color=grey, name='Reference'))
        target_corr.add_trace(go.Bar(y=prod_target_corr, x=ref_target_corr.index,
                                     marker_color=red, name='Production'))
        target_corr.update_layout(xaxis_title="Features", yaxis_title="Correlation",
                                  yaxis=dict(range=(-1, 1), showticklabels=True))

        target_corr_json = json.loads(target_corr.to_json())

        self.wi = BaseWidgetInfo(
            title=self.title,
            type="big_graph",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=1,
            params={
                "data": target_corr_json['data'],
                "layout": target_corr_json['layout']
            },
            additionalGraphs=[],
        )
    else:
        self.wi = None
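# A minimal standalone sketch of the correlation comparison above, on
# hypothetical data. Indexing the production series with the reference index
# guarantees the paired bars describe the same feature.
import numpy as np
import pandas as pd
import plotly.graph_objs as go

rng = np.random.default_rng(0)
reference = pd.DataFrame({"f1": rng.normal(size=100), "f2": rng.normal(size=100)})
reference["target"] = reference["f1"] * 2 + rng.normal(size=100)
production = reference.sample(frac=1.0, replace=True, random_state=1)

ref_corr = reference.corr()["target"].drop("target")
prod_corr = production.corr()["target"].drop("target")

fig = go.Figure()
fig.add_trace(go.Bar(x=ref_corr.index, y=ref_corr, name="Reference"))
fig.add_trace(go.Bar(x=ref_corr.index, y=prod_corr[ref_corr.index], name="Production"))
fig.update_layout(yaxis=dict(range=(-1, 1)))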
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [name for name in num_feature_names
                                 if is_numeric_dtype(reference_data[name])]
        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            cat_feature_names = [name for name in cat_feature_names
                                 if is_numeric_dtype(reference_data[name])]
    else:
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

        utility_columns = [date_column, id_column, target_column, prediction_column]
        num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns)
                                 - set(utility_columns))
        cat_feature_names = list(set(reference_data.select_dtypes([object]).columns)
                                 - set(utility_columns))

    reference_data.replace([np.inf, -np.inf], np.nan, inplace=True)
    reference_data.dropna(axis=0, how='any', inplace=True)

    if target_column is not None and prediction_column is not None:
        error = reference_data[prediction_column] - reference_data[target_column]

        quantile_5 = np.quantile(error, .05)
        quantile_95 = np.quantile(error, .95)

        # signed mean error and its standard deviation, overall and per segment:
        # the 5% most negative errors (underestimation), the middle 90%
        # (expected error), and the 5% most positive errors (overestimation)
        mae = np.mean(error)
        mae_under = np.mean(error[error <= quantile_5])
        mae_exp = np.mean(error[(error > quantile_5) & (error < quantile_95)])
        mae_over = np.mean(error[error >= quantile_95])

        sd = np.std(error, ddof=1)
        sd_under = np.std(error[error <= quantile_5], ddof=1)
        sd_exp = np.std(error[(error > quantile_5) & (error < quantile_95)], ddof=1)
        sd_over = np.std(error[error >= quantile_95], ddof=1)

        self.wi = BaseWidgetInfo(
            title="Reference Data: Error Bias",
            type="counter",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=2,
            params={
                "counters": [
                    {"value": str(round(mae, 2)) + " (" + str(round(sd, 2)) + ")",
                     "label": "Overall"},
                    {"value": str(round(mae_exp, 2)) + " (" + str(round(sd_exp, 2)) + ")",
                     "label": "Expected error"},
                    {"value": str(round(mae_under, 2)) + " (" + str(round(sd_under, 2)) + ")",
                     "label": "Underestimation"},
                    {"value": str(round(mae_over, 2)) + " (" + str(round(sd_over, 2)) + ")",
                     "label": "Overestimation"}
                ]
            },
            additionalGraphs=[]
        )
    else:
        self.wi = None
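# A minimal standalone sketch of the quantile-based error segmentation above,
# on hypothetical errors: the 5th and 95th percentiles split the errors into
# underestimation, expected error, and overestimation segments.
import numpy as np

error = np.random.default_rng(0).normal(loc=0.5, scale=2.0, size=1000)

q05, q95 = np.quantile(error, 0.05), np.quantile(error, 0.95)
segments = {
    "Overall": error,
    "Expected error": error[(error > q05) & (error < q95)],
    "Underestimation": error[error <= q05],   # most negative errors
    "Overestimation": error[error >= q95],    # most positive errors
}
for label, seg in segments.items():
    print(f"{label}: {np.mean(seg):.2f} ({np.std(seg, ddof=1):.2f})")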
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [name for name in num_feature_names
                                 if is_numeric_dtype(reference_data[name])]
        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            cat_feature_names = [name for name in cat_feature_names
                                 if is_numeric_dtype(reference_data[name])]
    else:
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

        utility_columns = [date_column, id_column, target_column, prediction_column]
        num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns)
                                 - set(utility_columns))
        cat_feature_names = list(set(reference_data.select_dtypes([object]).columns)
                                 - set(utility_columns))

    if prediction_column is not None:
        # plot prediction drift against the reference mean +/- one standard deviation
        reference_mean = np.mean(reference_data[prediction_column])
        reference_std = np.std(reference_data[prediction_column], ddof=1)
        x_title = "Timestamp" if date_column else "Index"

        pred_values = go.Figure()
        pred_values.add_trace(go.Scatter(
            x=reference_data[date_column] if date_column else reference_data.index,
            y=reference_data[prediction_column],
            mode='markers', name='Reference', marker=dict(size=6, color=grey)))
        pred_values.add_trace(go.Scatter(
            x=production_data[date_column] if date_column else production_data.index,
            y=production_data[prediction_column],
            mode='markers', name='Current', marker=dict(size=6, color=red)))
        pred_values.update_layout(
            xaxis_title=x_title,
            yaxis_title='Prediction Value',
            showlegend=True,
            legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
            shapes=[
                # xref="paper" stretches both shapes across the full plot width;
                # the y coordinates are in data units
                dict(type="rect", xref="paper", yref="y",
                     x0=0, y0=reference_mean - reference_std,
                     x1=1, y1=reference_mean + reference_std,
                     fillcolor="LightGreen", opacity=0.5, layer="below", line_width=0),
                dict(type="line", name='Reference', xref="paper", yref="y",
                     x0=0, y0=reference_mean, x1=1, y1=reference_mean,
                     line=dict(color="Green", width=3)),
            ])

        pred_values_json = json.loads(pred_values.to_json())

        self.wi = BaseWidgetInfo(
            title=self.title,
            type="big_graph",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=1,
            params={
                "data": pred_values_json['data'],
                "layout": pred_values_json['layout']
            },
            additionalGraphs=[],
        )
    else:
        self.wi = None
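# A minimal standalone sketch of the mean +/- std band above, on hypothetical
# prediction series; paper-referenced shapes span the full plot width
# regardless of the x-axis data.
import numpy as np
import plotly.graph_objs as go

rng = np.random.default_rng(0)
ref_pred = rng.normal(10, 2, size=200)
cur_pred = rng.normal(11, 2, size=50)

mean, std = np.mean(ref_pred), np.std(ref_pred, ddof=1)

fig = go.Figure()
fig.add_trace(go.Scatter(y=ref_pred, mode="markers", name="Reference"))
fig.add_trace(go.Scatter(y=cur_pred, mode="markers", name="Current"))
fig.update_layout(shapes=[
    dict(type="rect", xref="paper", yref="y", x0=0, x1=1,
         y0=mean - std, y1=mean + std, fillcolor="LightGreen",
         opacity=0.5, layer="below", line_width=0),
    dict(type="line", xref="paper", yref="y", x0=0, x1=1,
         y0=mean, y1=mean, line=dict(color="Green", width=3)),
])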
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [name for name in num_feature_names
                                 if is_numeric_dtype(reference_data[name])]
        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            cat_feature_names = [name for name in cat_feature_names
                                 if is_numeric_dtype(reference_data[name])]
    else:
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

        utility_columns = [date_column, id_column, target_column, prediction_column]
        num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns)
                                 - set(utility_columns))
        cat_feature_names = list(set(reference_data.select_dtypes([object]).columns)
                                 - set(utility_columns))

    if production_data is not None:
        if target_column is not None and prediction_column is not None:
            production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
            production_data.dropna(axis=0, how='any', inplace=True)

            prod_error = production_data[prediction_column] - production_data[target_column]
            prod_quantile_5 = np.quantile(prod_error, .05)
            prod_quantile_95 = np.quantile(prod_error, .95)

            production_data['dataset'] = 'Production'
            production_data['Error bias'] = list(map(
                lambda x: 'Underestimation' if x <= prod_quantile_5
                else 'Majority' if x < prod_quantile_95
                else 'Overestimation',
                prod_error))

            # plot predicted vs actual values, colored by error bias segment
            pred_actual = go.Figure()
            pred_actual.add_trace(go.Scatter(
                x=production_data[production_data['Error bias'] == 'Underestimation'][target_column],
                y=production_data[production_data['Error bias'] == 'Underestimation'][prediction_column],
                mode='markers',
                name='Underestimation',
                marker=dict(color='#6574f7', showscale=False)))
            pred_actual.add_trace(go.Scatter(
                x=production_data[production_data['Error bias'] == 'Overestimation'][target_column],
                y=production_data[production_data['Error bias'] == 'Overestimation'][prediction_column],
                mode='markers',
                name='Overestimation',
                marker=dict(color='#ee5540', showscale=False)))
            pred_actual.add_trace(go.Scatter(
                x=production_data[production_data['Error bias'] == 'Majority'][target_column],
                y=production_data[production_data['Error bias'] == 'Majority'][prediction_column],
                mode='markers',
                name='Majority',
                marker=dict(color='#1acc98', showscale=False)))
            pred_actual.update_layout(
                xaxis_title="Actual value",
                yaxis_title="Predicted value",
                xaxis=dict(showticklabels=True),
                yaxis=dict(showticklabels=True),
            )

            pred_actual_json = json.loads(pred_actual.to_json())

            self.wi = BaseWidgetInfo(
                title=self.title,
                type="big_graph",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=1,
                params={
                    "data": pred_actual_json['data'],
                    "layout": pred_actual_json['layout']
                },
                additionalGraphs=[],
            )
        else:
            self.wi = None
    else:
        self.wi = None
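# A minimal standalone sketch of the error-bias labeling above, on
# hypothetical per-row errors (predicted - actual).
import numpy as np
import pandas as pd

error = pd.Series(np.random.default_rng(0).normal(size=500))

q05, q95 = error.quantile(0.05), error.quantile(0.95)
bias = error.map(lambda e: "Underestimation" if e <= q05
                 else "Overestimation" if e >= q95 else "Majority")
print(bias.value_counts())  # roughly 5% / 90% / 5%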
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [name for name in num_feature_names
                                 if is_numeric_dtype(reference_data[name])]
        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            cat_feature_names = [name for name in cat_feature_names
                                 if is_numeric_dtype(reference_data[name])]
    else:
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

        utility_columns = [date_column, id_column, target_column, prediction_column]
        num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns)
                                 - set(utility_columns))
        cat_feature_names = list(set(reference_data.select_dtypes([object]).columns)
                                 - set(utility_columns))

    if production_data is not None:
        if target_column is not None and prediction_column is not None:
            production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
            production_data.dropna(axis=0, how='any', inplace=True)

            # plot the distribution of prediction errors
            error = production_data[prediction_column] - production_data[target_column]

            error_distr = go.Figure()
            error_distr.add_trace(go.Histogram(x=error, marker_color=red,
                                               name='error distribution',
                                               histnorm='percent'))
            error_distr.update_layout(
                xaxis_title="Error (Predicted - Actual)",
                yaxis_title="Percentage",
            )

            error_distr_json = json.loads(error_distr.to_json())

            self.wi = BaseWidgetInfo(
                title=self.title,
                type="big_graph",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=1,
                params={
                    "data": error_distr_json['data'],
                    "layout": error_distr_json['layout']
                },
                additionalGraphs=[],
            )
        else:
            self.wi = None
    else:
        self.wi = None
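# A minimal standalone sketch of the percent-normalized histogram above, on
# hypothetical errors; histnorm="percent" rescales bar heights to sum to 100,
# which keeps histograms comparable across datasets of different sizes.
import numpy as np
import plotly.graph_objs as go

error = np.random.default_rng(0).normal(size=1000)

fig = go.Figure(go.Histogram(x=error, histnorm="percent", name="error distribution"))
fig.update_layout(xaxis_title="Error (Predicted - Actual)", yaxis_title="Percentage")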
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [name for name in num_feature_names
                                 if is_numeric_dtype(reference_data[name])]
        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            cat_feature_names = [name for name in cat_feature_names
                                 if is_numeric_dtype(reference_data[name])]
    else:
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

        utility_columns = [date_column, id_column, target_column, prediction_column]
        num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns)
                                 - set(utility_columns))
        cat_feature_names = list(set(reference_data.select_dtypes([object]).columns)
                                 - set(utility_columns))

    if target_column is not None:
        # calculate target drift with a two-sample Kolmogorov-Smirnov test
        target_p_value = ks_2samp(reference_data[target_column],
                                  production_data[target_column])[1]
        target_sim_test = "detected" if target_p_value < 0.05 else "not detected"

        # plot target distributions
        target_distr = ff.create_distplot(
            [reference_data[target_column], production_data[target_column]],
            ["Reference", "Current"],
            colors=[grey, red],
            show_rug=True)
        target_distr.update_layout(xaxis_title="Value",
                                   yaxis_title="Share",
                                   legend=dict(orientation="h", yanchor="bottom",
                                               y=1.02, xanchor="right", x=1))

        target_drift_json = json.loads(target_distr.to_json())

        self.wi = BaseWidgetInfo(
            title="Target Drift: " + target_sim_test + ", p_value=" + str(round(target_p_value, 6)),
            type="big_graph",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=2,
            params={
                "data": target_drift_json['data'],
                "layout": target_drift_json['layout']
            },
            additionalGraphs=[],
        )
    else:
        self.wi = None
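# A minimal standalone sketch of the KS drift check above, on hypothetical
# reference/current samples; ks_2samp returns (statistic, p_value), and the
# widget treats p < 0.05 as drift detected.
import numpy as np
from scipy.stats import ks_2samp

rng = np.random.default_rng(0)
reference = rng.normal(0.0, 1, size=1000)
current = rng.normal(0.3, 1, size=1000)   # shifted distribution

p_value = ks_2samp(reference, current)[1]
verdict = "detected" if p_value < 0.05 else "not detected"
print(f"Target drift {verdict}, p_value={round(p_value, 6)}")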
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        target_names = column_mapping.get('target_names')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [name for name in num_feature_names
                                 if is_numeric_dtype(reference_data[name])]
        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            cat_feature_names = [name for name in cat_feature_names
                                 if is_numeric_dtype(reference_data[name])]
    else:
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

        utility_columns = [date_column, id_column, target_column, prediction_column]
        target_names = None
        num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns)
                                 - set(utility_columns))
        cat_feature_names = list(set(reference_data.select_dtypes([object]).columns)
                                 - set(utility_columns))

    if prediction_column is not None and target_column is not None:
        binarizer = preprocessing.LabelBinarizer()
        binarizer.fit(reference_data[target_column])
        binarized_target = binarizer.transform(reference_data[target_column])

        if production_data is not None:
            # decode class labels from the per-class probability columns
            ref_array_prediction = reference_data[prediction_column].to_numpy()
            ref_prediction_ids = np.argmax(ref_array_prediction, axis=-1)
            ref_prediction_labels = [prediction_column[x] for x in ref_prediction_ids]
            reference_data['prediction_labels'] = ref_prediction_labels

            prod_array_prediction = production_data[prediction_column].to_numpy()
            prod_prediction_ids = np.argmax(prod_array_prediction, axis=-1)
            prod_prediction_labels = [prediction_column[x] for x in prod_prediction_ids]
            production_data['prediction_labels'] = prod_prediction_labels

            additional_graphs_data = []
            params_data = []

            for feature_name in num_feature_names + cat_feature_names:
                # add data for the table in params; the "All" graph id is
                # feature-specific so table rows do not collide
                labels = prediction_column
                params_data.append({
                    "details": {
                        "parts": [{"title": "All", "id": "All" + "_" + str(feature_name)}] +
                                 [{"title": str(label), "id": feature_name + "_" + str(label)}
                                  for label in labels],
                        "insights": []
                    },
                    "f1": feature_name
                })

                # create confusion-based plots
                reference_data['dataset'] = 'Reference'
                production_data['dataset'] = 'Production'
                merged_data = pd.concat([reference_data, production_data])

                fig = px.histogram(merged_data, x=feature_name, color=target_column,
                                   facet_col="dataset", histnorm='',
                                   category_orders={"dataset": ["Reference", "Production"]})
                fig_json = json.loads(fig.to_json())

                # write plot data into the table as additional data
                additional_graphs_data.append(
                    AdditionalGraphInfo(
                        "All" + "_" + str(feature_name),
                        {"data": fig_json['data'], "layout": fig_json['layout']},
                    ))

                for label in labels:
                    def confusion_category(row):
                        # TP/FP/FN/TN with respect to the current label
                        if row[target_column] == label and row['prediction_labels'] == label:
                            return 'TP'
                        if row[target_column] != label and row['prediction_labels'] == label:
                            return 'FP'
                        if row[target_column] == label and row['prediction_labels'] != label:
                            return 'FN'
                        return 'TN'

                    merged_data['Confusion'] = merged_data.apply(confusion_category, axis=1)

                    fig = px.histogram(merged_data, x=feature_name, color='Confusion',
                                       facet_col="dataset", histnorm='',
                                       category_orders={"dataset": ["Reference", "Production"],
                                                        "Confusion": ["TP", "TN", "FP", "FN"]})
                    fig_json = json.loads(fig.to_json())

                    additional_graphs_data.append(
                        AdditionalGraphInfo(
                            feature_name + "_" + str(label),
                            {"data": fig_json['data'], "layout": fig_json['layout']},
                        ))

            self.wi = BaseWidgetInfo(
                title=self.title,
                type="big_table",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=2,
                params={
                    "rowsPerPage": min(len(num_feature_names) + len(cat_feature_names), 10),
                    "columns": [{"title": "Feature", "field": "f1"}],
                    "data": params_data
                },
                additionalGraphs=additional_graphs_data)
        else:
            # reference data only
            ref_array_prediction = reference_data[prediction_column].to_numpy()
            ref_prediction_ids = np.argmax(ref_array_prediction, axis=-1)
            ref_prediction_labels = [prediction_column[x] for x in ref_prediction_ids]
            reference_data['prediction_labels'] = ref_prediction_labels

            additional_graphs_data = []
            params_data = []

            for feature_name in num_feature_names + cat_feature_names:
                # add data for the table in params
                labels = prediction_column
                params_data.append({
                    "details": {
                        "parts": [{"title": "All", "id": "All" + "_" + str(feature_name)}] +
                                 [{"title": str(label), "id": feature_name + "_" + str(label)}
                                  for label in labels],
                        "insights": []
                    },
                    "f1": feature_name
                })

                # create confusion-based plots
                fig = px.histogram(reference_data, x=feature_name, color=target_column,
                                   histnorm='')
                fig_json = json.loads(fig.to_json())

                # write plot data into the table as additional data
                additional_graphs_data.append(
                    AdditionalGraphInfo(
                        "All" + "_" + str(feature_name),
                        {"data": fig_json['data'], "layout": fig_json['layout']},
                    ))

                for label in labels:
                    def confusion_category(row):
                        if row[target_column] == label and row['prediction_labels'] == label:
                            return 'TP'
                        if row[target_column] != label and row['prediction_labels'] == label:
                            return 'FP'
                        if row[target_column] == label and row['prediction_labels'] != label:
                            return 'FN'
                        return 'TN'

                    reference_data['Confusion'] = reference_data.apply(confusion_category, axis=1)

                    fig = px.histogram(reference_data, x=feature_name, color='Confusion',
                                       histnorm='',
                                       category_orders={"Confusion": ["TP", "TN", "FP", "FN"]})
                    fig_json = json.loads(fig.to_json())

                    additional_graphs_data.append(
                        AdditionalGraphInfo(
                            feature_name + "_" + str(label),
                            {"data": fig_json['data'], "layout": fig_json['layout']},
                        ))

            self.wi = BaseWidgetInfo(
                title=self.title,
                type="big_table",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=2,
                params={
                    "rowsPerPage": min(len(num_feature_names) + len(cat_feature_names), 10),
                    "columns": [{"title": "Feature", "field": "f1"}],
                    "data": params_data
                },
                additionalGraphs=additional_graphs_data)
    else:
        self.wi = None
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [name for name in num_feature_names
                                 if is_numeric_dtype(reference_data[name])]
        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            cat_feature_names = [name for name in cat_feature_names
                                 if is_numeric_dtype(reference_data[name])]
    else:
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

        utility_columns = [date_column, id_column, target_column, prediction_column]
        num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns)
                                 - set(utility_columns))
        cat_feature_names = list(set(reference_data.select_dtypes([object]).columns)
                                 - set(utility_columns))

    if prediction_column is not None and target_column is not None:
        additional_graphs_data = []
        params_data = []

        for feature_name in num_feature_names + cat_feature_names:
            # add data for the table in params
            params_data.append({
                "details": {
                    "parts": [
                        {"title": "Target", "id": feature_name + "_target_values"},
                        {"title": "Prediction", "id": feature_name + "_prediction_values"}
                    ],
                    "insights": []
                },
                "f1": feature_name
            })

            # create the target plot
            reference_data['dataset'] = 'Reference'
            production_data['dataset'] = 'Production'
            merged_data = pd.concat([reference_data, production_data])

            target_fig = px.histogram(merged_data, x=feature_name, color=target_column,
                                      facet_col="dataset",
                                      category_orders={"dataset": ["Reference", "Production"]})
            target_fig_json = json.loads(target_fig.to_json())

            # create the prediction plot
            pred_fig = px.histogram(merged_data, x=feature_name, color=prediction_column,
                                    facet_col="dataset",
                                    category_orders={"dataset": ["Reference", "Production"]})
            pred_fig_json = json.loads(pred_fig.to_json())

            # write plot data into the table as additional data
            additional_graphs_data.append(
                AdditionalGraphInfo(
                    feature_name + '_target_values',
                    {"data": target_fig_json['data'], "layout": target_fig_json['layout']},
                ))
            additional_graphs_data.append(
                AdditionalGraphInfo(
                    feature_name + '_prediction_values',
                    {"data": pred_fig_json['data'], "layout": pred_fig_json['layout']},
                ))

        self.wi = BaseWidgetInfo(
            title=self.title,
            type="big_table",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=2,
            params={
                "rowsPerPage": min(len(num_feature_names) + len(cat_feature_names), 10),
                "columns": [{"title": "Feature", "field": "f1"}],
                "data": params_data
            },
            additionalGraphs=additional_graphs_data
        )
    elif target_column is not None:
        additional_graphs_data = []
        params_data = []

        for feature_name in num_feature_names + cat_feature_names:
            # add data for the table in params
            params_data.append({
                "details": {
                    "parts": [
                        {"title": "Target", "id": feature_name + "_target_values"}
                    ],
                    "insights": []
                },
                "f1": feature_name
            })

            # create the target plot
            reference_data['dataset'] = 'Reference'
            production_data['dataset'] = 'Production'
            merged_data = pd.concat([reference_data, production_data])

            target_fig = px.histogram(merged_data, x=feature_name, color=target_column,
                                      facet_col="dataset",
                                      category_orders={"dataset": ["Reference", "Production"]})
            target_fig_json = json.loads(target_fig.to_json())

            # write plot data into the table as additional data
            additional_graphs_data.append(
                AdditionalGraphInfo(
                    feature_name + '_target_values',
                    {"data": target_fig_json['data'], "layout": target_fig_json['layout']},
                ))

        self.wi = BaseWidgetInfo(
            title=self.title,
            type="big_table",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=2,
            params={
                "rowsPerPage": min(len(num_feature_names) + len(cat_feature_names), 10),
                "columns": [{"title": "Feature", "field": "f1"}],
                "data": params_data
            },
            additionalGraphs=additional_graphs_data
        )
    elif prediction_column is not None:
        additional_graphs_data = []
        params_data = []

        for feature_name in num_feature_names + cat_feature_names:
            # add data for the table in params
            params_data.append({
                "details": {
                    "parts": [
                        {"title": "Prediction", "id": feature_name + "_prediction_values"}
                    ],
                    "insights": []
                },
                "f1": feature_name
            })

            # create the prediction plot
            reference_data['dataset'] = 'Reference'
            production_data['dataset'] = 'Production'
            merged_data = pd.concat([reference_data, production_data])

            prediction_fig = px.histogram(merged_data, x=feature_name, color=prediction_column,
                                          facet_col="dataset",
                                          category_orders={"dataset": ["Reference", "Production"]})
            prediction_fig_json = json.loads(prediction_fig.to_json())

            # write plot data into the table as additional data
            additional_graphs_data.append(
                AdditionalGraphInfo(
                    feature_name + '_prediction_values',
                    {"data": prediction_fig_json['data'], "layout": prediction_fig_json['layout']},
                ))

        self.wi = BaseWidgetInfo(
            title=self.title,
            type="big_table",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=2,
            params={
                "rowsPerPage": min(len(num_feature_names) + len(cat_feature_names), 10),
                "columns": [{"title": "Feature", "field": "f1"}],
                "data": params_data
            },
            additionalGraphs=additional_graphs_data
        )
    else:
        self.wi = None
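# A minimal standalone sketch of the faceted histogram above, on a
# hypothetical merged frame with a `dataset` marker column: facet_col draws
# side-by-side reference/production panels, and category_orders pins the
# panel order so Reference is always on the left.
import pandas as pd
import plotly.express as px

merged = pd.DataFrame({
    "feature": [1, 2, 2, 3, 1, 2, 3, 3],
    "target": ["a", "b", "a", "b", "a", "a", "b", "b"],
    "dataset": ["Reference"] * 4 + ["Production"] * 4,
})

fig = px.histogram(merged, x="feature", color="target", facet_col="dataset",
                   category_orders={"dataset": ["Reference", "Production"]})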
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        target_names = column_mapping.get('target_names')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [name for name in num_feature_names
                                 if is_numeric_dtype(reference_data[name])]
        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            cat_feature_names = [name for name in cat_feature_names
                                 if is_numeric_dtype(reference_data[name])]
    else:
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

        utility_columns = [date_column, id_column, target_column, prediction_column]
        num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns)
                                 - set(utility_columns))
        cat_feature_names = list(set(reference_data.select_dtypes([object]).columns)
                                 - set(utility_columns))
        target_names = None

    if production_data is not None and target_column is not None and prediction_column is not None:
        production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        production_data.dropna(axis=0, how='any', inplace=True)

        # plot per-class support as a bar chart; the last row of the report
        # frame is support, the last three columns are the report aggregates
        metrics_matrix = metrics.classification_report(production_data[target_column],
                                                       production_data[prediction_column],
                                                       output_dict=True)
        metrics_frame = pd.DataFrame(metrics_matrix)
        support = metrics_frame.iloc[-1:, :-3].values[0]

        fig = go.Figure()
        fig.add_trace(go.Bar(x=target_names if target_names
                             else metrics_frame.columns.tolist()[:-3],
                             y=support,
                             marker_color=red,
                             name='Support'))
        fig.update_layout(
            xaxis_title="Class",
            yaxis_title="Number of Objects",
        )

        support_bar_json = json.loads(fig.to_json())

        self.wi = BaseWidgetInfo(
            title=self.title,
            type="big_graph",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=1,
            params={
                "data": support_bar_json['data'],
                "layout": support_bar_json['layout']
            },
            additionalGraphs=[],
        )
    else:
        self.wi = None
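# A minimal standalone sketch of the support extraction above, on hypothetical
# labels: the report frame's last row is per-class support, and the last three
# columns (accuracy, macro avg, weighted avg) are dropped by the slice.
import pandas as pd
from sklearn import metrics

y_true = ["a", "b", "a", "b", "b"]
y_pred = ["a", "b", "b", "b", "b"]

frame = pd.DataFrame(metrics.classification_report(y_true, y_pred, output_dict=True))
support = frame.iloc[-1:, :-3].values[0]
classes = frame.columns.tolist()[:-3]
print(dict(zip(classes, support)))  # {'a': 2.0, 'b': 3.0}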
def calculate(self, reference_data: pandas.DataFrame, production_data: pandas.DataFrame, _: Dict):
    # the figure payload below was captured once from plotly's fig.to_json();
    # repeated fragments are hoisted into local variables for readability
    colorbar = {"outlinewidth": 0, "ticks": ""}
    sequential = [[0.0, "#0d0887"], [0.1111111111111111, "#46039f"],
                  [0.2222222222222222, "#7201a8"], [0.3333333333333333, "#9c179e"],
                  [0.4444444444444444, "#bd3786"], [0.5555555555555556, "#d8576b"],
                  [0.6666666666666666, "#ed7953"], [0.7777777777777778, "#fb9f3a"],
                  [0.8888888888888888, "#fdca26"], [1.0, "#f0f921"]]
    diverging = [[0, "#8e0152"], [0.1, "#c51b7d"], [0.2, "#de77ae"], [0.3, "#f1b6da"],
                 [0.4, "#fde0ef"], [0.5, "#f7f7f7"], [0.6, "#e6f5d0"], [0.7, "#b8e186"],
                 [0.8, "#7fbc41"], [0.9, "#4d9221"], [1, "#276419"]]
    template = {
        "data": {
            "bar": [{"error_x": {"color": "#2a3f5f"}, "error_y": {"color": "#2a3f5f"},
                     "marker": {"line": {"color": "#E5ECF6", "width": 0.5}}, "type": "bar"}],
            "barpolar": [{"marker": {"line": {"color": "#E5ECF6", "width": 0.5}},
                          "type": "barpolar"}],
            "carpet": [{"aaxis": {"endlinecolor": "#2a3f5f", "gridcolor": "white",
                                  "linecolor": "white", "minorgridcolor": "white",
                                  "startlinecolor": "#2a3f5f"},
                        "baxis": {"endlinecolor": "#2a3f5f", "gridcolor": "white",
                                  "linecolor": "white", "minorgridcolor": "white",
                                  "startlinecolor": "#2a3f5f"},
                        "type": "carpet"}],
            "choropleth": [{"colorbar": colorbar, "type": "choropleth"}],
            "contour": [{"colorbar": colorbar, "colorscale": sequential, "type": "contour"}],
            "contourcarpet": [{"colorbar": colorbar, "type": "contourcarpet"}],
            "heatmap": [{"colorbar": colorbar, "colorscale": sequential, "type": "heatmap"}],
            "heatmapgl": [{"colorbar": colorbar, "colorscale": sequential, "type": "heatmapgl"}],
            "histogram": [{"marker": {"colorbar": colorbar}, "type": "histogram"}],
            "histogram2d": [{"colorbar": colorbar, "colorscale": sequential,
                             "type": "histogram2d"}],
            "histogram2dcontour": [{"colorbar": colorbar, "colorscale": sequential,
                                    "type": "histogram2dcontour"}],
            "mesh3d": [{"colorbar": colorbar, "type": "mesh3d"}],
            "parcoords": [{"line": {"colorbar": colorbar}, "type": "parcoords"}],
            "pie": [{"automargin": True, "type": "pie"}],
            "scatter": [{"marker": {"colorbar": colorbar}, "type": "scatter"}],
            "scatter3d": [{"line": {"colorbar": colorbar}, "marker": {"colorbar": colorbar},
                           "type": "scatter3d"}],
            "scattercarpet": [{"marker": {"colorbar": colorbar}, "type": "scattercarpet"}],
            "scattergeo": [{"marker": {"colorbar": colorbar}, "type": "scattergeo"}],
            "scattergl": [{"marker": {"colorbar": colorbar}, "type": "scattergl"}],
            "scattermapbox": [{"marker": {"colorbar": colorbar}, "type": "scattermapbox"}],
            "scatterpolar": [{"marker": {"colorbar": colorbar}, "type": "scatterpolar"}],
            "scatterpolargl": [{"marker": {"colorbar": colorbar}, "type": "scatterpolargl"}],
            "scatterternary": [{"marker": {"colorbar": colorbar}, "type": "scatterternary"}],
            "surface": [{"colorbar": colorbar, "colorscale": sequential, "type": "surface"}],
            "table": [{"cells": {"fill": {"color": "#EBF0F8"}, "line": {"color": "white"}},
                       "header": {"fill": {"color": "#C8D4E3"}, "line": {"color": "white"}},
                       "type": "table"}]
        },
        "layout": {
            "annotationdefaults": {"arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1},
            "coloraxis": {"colorbar": colorbar},
            "colorscale": {"diverging": diverging, "sequential": sequential,
                           "sequentialminus": sequential},
            "colorway": ["#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A",
                         "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52"],
            "font": {"color": "#2a3f5f"},
            "geo": {"bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6",
                    "showlakes": True, "showland": True, "subunitcolor": "white"},
            "hoverlabel": {"align": "left"},
            "hovermode": "closest",
            "mapbox": {"style": "light"},
            "paper_bgcolor": "white",
            "plot_bgcolor": "#E5ECF6",
            "polar": {"angularaxis": {"gridcolor": "white", "linecolor": "white", "ticks": ""},
                      "bgcolor": "#E5ECF6",
                      "radialaxis": {"gridcolor": "white", "linecolor": "white", "ticks": ""}},
            "scene": {axis: {"backgroundcolor": "#E5ECF6", "gridcolor": "white",
                             "gridwidth": 2, "linecolor": "white", "showbackground": True,
                             "ticks": "", "zerolinecolor": "white"}
                      for axis in ("xaxis", "yaxis", "zaxis")},
            "shapedefaults": {"line": {"color": "#2a3f5f"}},
            "ternary": {"aaxis": {"gridcolor": "white", "linecolor": "white", "ticks": ""},
                        "baxis": {"gridcolor": "white", "linecolor": "white", "ticks": ""},
                        "bgcolor": "#E5ECF6",
                        "caxis": {"gridcolor": "white", "linecolor": "white", "ticks": ""}},
            "title": {"x": 0.05},
            "xaxis": {"automargin": True, "gridcolor": "white", "linecolor": "white",
                      "ticks": "", "title": {"standoff": 15}, "zerolinecolor": "white",
                      "zerolinewidth": 2},
            "yaxis": {"automargin": True, "gridcolor": "white", "linecolor": "white",
                      "ticks": "", "title": {"standoff": 15}, "zerolinecolor": "white",
                      "zerolinewidth": 2}
        }
    }

    self.wi = BaseWidgetInfo(
        type="big_graph",
        title=self.title,
        size=2,
        params={
            "data": [{
                "marker": {"color": "#ed0400"},
                "type": "bar",
                "x": reference_data[0].tolist(),
                "y": reference_data[1].tolist()
            }],
            "layout": {
                "template": template,
                "xaxis": {"title": {"text": "Features"}},
                "yaxis": {"range": [-1, 1], "showticklabels": True,
                          "title": {"text": "Correlation"}}
            }
        },
        alerts=[],
        insights=[],
        details="",
        alertsPosition="row",
        alertStats=AlertStats(),
        additionalGraphs=[],
    )
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [name for name in num_feature_names
                                 if is_numeric_dtype(reference_data[name])]
        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            cat_feature_names = [name for name in cat_feature_names
                                 if is_numeric_dtype(reference_data[name])]
    else:
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

        utility_columns = [date_column, id_column, target_column, prediction_column]
        num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns)
                                 - set(utility_columns))
        cat_feature_names = list(set(reference_data.select_dtypes([object]).columns)
                                 - set(utility_columns))

    if production_data is not None and target_column is not None and prediction_column is not None:
        production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        production_data.dropna(axis=0, how='any', inplace=True)

        # plot per-class predicted probabilities, one tab per class label
        graphs = []
        for label in prediction_column:
            fig = go.Figure()
            # random x jitter spreads the points; only the y axis
            # (predicted probability) carries information
            fig.add_trace(go.Scatter(
                x=np.random.random(
                    production_data[production_data[target_column] == label].shape[0]),
                y=production_data[production_data[target_column] == label][label],
                mode='markers',
                name=str(label),
                marker=dict(size=6, color=red)))
            fig.add_trace(go.Scatter(
                x=np.random.random(
                    production_data[production_data[target_column] != label].shape[0]),
                y=production_data[production_data[target_column] != label][label],
                mode='markers',
                name='other',
                marker=dict(size=6, color=grey)))
            fig.update_layout(yaxis_title="Probability",
                              xaxis=dict(range=(-2, 3), showticklabels=False))

            fig_json = json.loads(fig.to_json())
            graphs.append({
                "id": "tab_" + str(label),
                "title": str(label),
                "graph": {
                    "data": fig_json["data"],
                    "layout": fig_json["layout"],
                }
            })

        self.wi = BaseWidgetInfo(
            title=self.title,
            type="tabbed_graph",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=1,
            params={"graphs": graphs},
            additionalGraphs=[],
        )
    else:
        self.wi = None
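# A minimal standalone sketch of the jittered probability scatter above, on
# hypothetical probabilities for one class: random x values spread the points
# horizontally, and the x ticks are hidden since only y is meaningful.
import numpy as np
import plotly.graph_objs as go

rng = np.random.default_rng(0)
proba_cat = rng.uniform(size=200)          # predicted probability of "cat"
is_cat = rng.uniform(size=200) < 0.5       # true labels

fig = go.Figure()
fig.add_trace(go.Scatter(x=rng.random(is_cat.sum()), y=proba_cat[is_cat],
                         mode="markers", name="cat"))
fig.add_trace(go.Scatter(x=rng.random((~is_cat).sum()), y=proba_cat[~is_cat],
                         mode="markers", name="other"))
fig.update_layout(yaxis_title="Probability",
                  xaxis=dict(range=(-2, 3), showticklabels=False))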
def get_info(self) -> BaseWidgetInfo:
    # repeated payload fragments are hoisted into local helpers; the figure
    # fragments were captured once from plotly's fig.to_json()
    def table_row(feature, drift_detected, hist_x, hist_y, test_rejected):
        return {
            "details": {
                "parts": [{"title": "Data drift", "id": feature + "_drift"},
                          {"title": "Data distribution", "id": feature + "_distr"}],
                "insights": []
            },
            "f1": feature,
            "f2": "Detected" if drift_detected else "Not Detected",
            "f3": {"x": hist_x, "y": hist_y},
            "f4": "Rejected" if test_rejected else "Not rejected",
            "f5": " "
        }

    colorbar = {"outlinewidth": 0, "ticks": ""}
    sequential = [[0.0, "#0d0887"], [0.1111111111111111, "#46039f"],
                  [0.2222222222222222, "#7201a8"], [0.3333333333333333, "#9c179e"],
                  [0.4444444444444444, "#bd3786"], [0.5555555555555556, "#d8576b"],
                  [0.6666666666666666, "#ed7953"], [0.7777777777777778, "#fb9f3a"],
                  [0.8888888888888888, "#fdca26"], [1.0, "#f0f921"]]
    diverging = [[0, "#8e0152"], [0.1, "#c51b7d"], [0.2, "#de77ae"], [0.3, "#f1b6da"],
                 [0.4, "#fde0ef"], [0.5, "#f7f7f7"], [0.6, "#e6f5d0"], [0.7, "#b8e186"],
                 [0.8, "#7fbc41"], [0.9, "#4d9221"], [1, "#276419"]]
    template = {
        "data": {
            "bar": [{"error_x": {"color": "#2a3f5f"}, "error_y": {"color": "#2a3f5f"},
                     "marker": {"line": {"color": "#E5ECF6", "width": 0.5}}, "type": "bar"}],
            "barpolar": [{"marker": {"line": {"color": "#E5ECF6", "width": 0.5}},
                          "type": "barpolar"}],
            "carpet": [{"aaxis": {"endlinecolor": "#2a3f5f", "gridcolor": "white",
                                  "linecolor": "white", "minorgridcolor": "white",
                                  "startlinecolor": "#2a3f5f"},
                        "baxis": {"endlinecolor": "#2a3f5f", "gridcolor": "white",
                                  "linecolor": "white", "minorgridcolor": "white",
                                  "startlinecolor": "#2a3f5f"},
                        "type": "carpet"}],
            "choropleth": [{"colorbar": colorbar, "type": "choropleth"}],
            "contour": [{"colorbar": colorbar, "colorscale": sequential, "type": "contour"}],
            "contourcarpet": [{"colorbar": colorbar, "type": "contourcarpet"}],
            "heatmap": [{"colorbar": colorbar, "colorscale": sequential, "type": "heatmap"}],
            "heatmapgl": [{"colorbar": colorbar, "colorscale": sequential, "type": "heatmapgl"}],
            "histogram": [{"marker": {"colorbar": colorbar}, "type": "histogram"}],
            "histogram2d": [{"colorbar": colorbar, "colorscale": sequential,
                             "type": "histogram2d"}],
            "histogram2dcontour": [{"colorbar": colorbar, "colorscale": sequential,
                                    "type": "histogram2dcontour"}],
            "mesh3d": [{"colorbar": colorbar, "type": "mesh3d"}],
            "parcoords": [{"line": {"colorbar": colorbar}, "type": "parcoords"}],
            "pie": [{"automargin": True, "type": "pie"}],
            "scatter": [{"marker": {"colorbar": colorbar}, "type": "scatter"}],
            "scatter3d": [{"line": {"colorbar": colorbar}, "marker": {"colorbar": colorbar},
                           "type": "scatter3d"}],
            "scattercarpet": [{"marker": {"colorbar": colorbar}, "type": "scattercarpet"}],
            "scattergeo": [{"marker": {"colorbar": colorbar}, "type": "scattergeo"}],
            "scattergl": [{"marker": {"colorbar": colorbar}, "type": "scattergl"}],
            "scattermapbox": [{"marker": {"colorbar": colorbar}, "type": "scattermapbox"}],
            "scatterpolar": [{"marker": {"colorbar": colorbar}, "type": "scatterpolar"}],
            "scatterpolargl": [{"marker": {"colorbar": colorbar}, "type": "scatterpolargl"}],
            "scatterternary": [{"marker": {"colorbar": colorbar}, "type": "scatterternary"}],
            "surface": [{"colorbar": colorbar, "colorscale": sequential, "type": "surface"}],
            "table": [{"cells": {"fill": {"color": "#EBF0F8"}, "line": {"color": "white"}},
                       "header": {"fill": {"color": "#C8D4E3"}, "line": {"color": "white"}},
                       "type": "table"}]
        },
        "layout": {
            "annotationdefaults": {"arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1},
            "coloraxis": {"colorbar": colorbar},
            "colorscale": {"diverging": diverging, "sequential": sequential,
                           "sequentialminus": sequential},
            "colorway": ["#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A",
                         "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52"],
            "font": {"color": "#2a3f5f"},
            "geo": {"bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6",
                    "showlakes": True, "showland": True, "subunitcolor": "white"},
            "hoverlabel": {"align": "left"},
            "hovermode": "closest",
            "mapbox": {"style": "light"},
            "paper_bgcolor": "white",
            "plot_bgcolor": "#E5ECF6",
            "polar": {"angularaxis": {"gridcolor": "white", "linecolor": "white", "ticks": ""},
                      "bgcolor": "#E5ECF6",
                      "radialaxis": {"gridcolor": "white", "linecolor": "white", "ticks": ""}},
            "scene": {axis: {"backgroundcolor": "#E5ECF6", "gridcolor": "white",
                             "gridwidth": 2, "linecolor": "white", "showbackground": True,
                             "ticks": "", "zerolinecolor": "white"}
                      for axis in ("xaxis", "yaxis", "zaxis")},
            "shapedefaults": {"line": {"color": "#2a3f5f"}},
            "ternary": {"aaxis": {"gridcolor": "white", "linecolor": "white", "ticks": ""},
                        "baxis": {"gridcolor": "white", "linecolor": "white", "ticks": ""},
                        "bgcolor": "#E5ECF6",
                        "caxis": {"gridcolor": "white", "linecolor": "white", "ticks": ""}},
            "title": {"x": 0.05},
            "xaxis": {"automargin": True, "gridcolor": "white", "linecolor": "white",
                      "ticks": "", "title": {"standoff": 15}, "zerolinecolor": "white",
                      "zerolinewidth": 2},
            "yaxis": {"automargin": True, "gridcolor": "white", "linecolor": "white",
                      "ticks": "", "title": {"standoff": 15}, "zerolinecolor": "white",
                      "zerolinewidth": 2}
        }
    }

    return BaseWidgetInfo(
        title=self.title,
        type="big_table",
        details="",
        alertStats=AlertStats(),
        alerts=[],
        alertsPosition="row",
        insights=[],
        size=2,
        params={
            "columns": [
                {"title": "Feature", "field": "f1"},
                {"title": "Data drift", "field": "f2"},
                {"title": "Distribution", "field": "f3", "type": "histogram",
                 "options": {"xField": "x", "yField": "y"}},
                {"title": "Distribution shift (similarity test at 95% confidence level)",
                 "field": "f4"},
                {"title": "Alerts", "field": "f5"}
            ],
            "data": [
                table_row("season", True,
                          [0.0, 0.0, 0.0, 0.0, 0.0, 1000.0, 0.0, 0.0, 0.0, 0.0],
                          [3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5], True),
                table_row("holiday", False,
                          [976.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 24.0],
                          [0.0, 0.1, 0.2, 0.30000000000000004, 0.4, 0.5, 0.6000000000000001,
                           0.7000000000000001, 0.8, 0.9, 1.0], False),
                table_row("workingday", False,
                          [312.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 688.0],
                          [0.0, 0.1, 0.2, 0.30000000000000004, 0.4, 0.5, 0.6000000000000001,
                           0.7000000000000001, 0.8, 0.9, 1.0], False),
                table_row("weather", True,
                          [566.0, 0.0, 0.0, 0.0, 0.0, 382.0, 0.0, 0.0, 0.0, 52.0],
                          [1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.2, 2.4000000000000004, 2.6, 2.8,
                           3.0], True),
                table_row("temp", True,
                          [7.0, 55.0, 197.0, 182.0, 307.0, 93.0, 73.0, 46.0, 32.0, 8.0],
                          [6.56, 8.61, 10.66, 12.709999999999999, 14.759999999999998, 16.81,
                           18.86, 20.909999999999997, 22.959999999999997, 25.009999999999998,
                           27.06], True),
                table_row("atemp", True,
                          [12.0, 84.0, 193.0, 237.0, 132.0, 183.0, 73.0, 61.0, 8.0, 17.0],
                          [9.09, 11.286999999999999, 13.484, 15.681000000000001, 17.878,
                           20.075, 22.272, 24.469, 26.666, 28.863, 31.06], True),
                table_row("humidity", True,
                          [7.0, 5.0, 59.0, 144.0, 188.0, 180.0, 80.0, 149.0, 105.0, 83.0],
                          [16.0, 24.4, 32.8, 41.2, 49.6, 58.0, 66.4, 74.80000000000001, 83.2,
                           91.60000000000001, 100.0], True),
                table_row("windspeed", False,
                          [117.0, 193.0, 201.0, 271.0, 112.0, 57.0, 39.0, 9.0, 0.0, 1.0],
                          [0.0, 4.30006, 8.60012, 12.90018, 17.20024, 21.500300000000003,
                           25.80036, 30.10042, 34.40048, 38.700540000000004, 43.0006], False),
                table_row("month", True,
                          [89.0, 0.0, 0.0, 0.0, 0.0, 455.0, 0.0, 0.0, 0.0, 456.0],
                          [10.0, 10.2, 10.4, 10.6, 10.8, 11.0, 11.2, 11.4, 11.6, 11.8,
                           12.0], True),
                table_row("hour", False,
                          [123.0, 81.0, 82.0, 126.0, 84.0, 84.0, 126.0, 84.0, 84.0, 126.0],
                          [0.0, 2.3, 4.6, 6.8999999999999995, 9.2, 11.5, 13.799999999999999,
                           16.099999999999998, 18.4, 20.7, 23.0], False),
                table_row("year", True,
                          [0.0, 0.0, 0.0, 0.0, 0.0, 1000.0, 0.0, 0.0, 0.0, 0.0],
                          [2011.5, 2011.6, 2011.7, 2011.8, 2011.9, 2012.0, 2012.1, 2012.2,
                           2012.3, 2012.4, 2012.5], True),
                table_row("week_day", False,
                          [144.0, 137.0, 0.0, 144.0, 0.0, 143.0, 144.0, 0.0, 144.0, 144.0],
                          [1.0, 1.6, 2.2, 2.8, 3.4, 4.0, 4.6, 5.2, 5.8, 6.3999999999999995,
                           7.0], False)
            ]
        },
        additionalGraphs=[
            AdditionalGraphInfo(
                "holiday_drift",
                {
                    "data": [{
                        "marker": {"color": "#4d4d4d", "size": 6},
                        "mode": "markers",
                        "name": "Production",
                        "type": "scatter",
                        "x": ["2012-10-16", "2012-10-17", "2012-10-18", "2012-10-19",
                              "2012-11-01", "2012-11-02", "2012-11-03", "2012-11-04",
                              "2012-11-05", "2012-11-06", "2012-11-07", "2012-11-08",
                              "2012-11-09", "2012-11-10", "2012-11-11", "2012-11-12",
                              "2012-11-13", "2012-11-14", "2012-11-15", "2012-11-16",
                              "2012-11-17", "2012-11-18", "2012-11-19", "2012-12-01",
                              "2012-12-02", "2012-12-03", "2012-12-04", "2012-12-05",
                              "2012-12-06", "2012-12-07", "2012-12-08", "2012-12-09",
                              "2012-12-10", "2012-12-11", "2012-12-12", "2012-12-13",
                              "2012-12-14", "2012-12-15", "2012-12-16", "2012-12-17",
                              "2012-12-18", "2012-12-19"],
                        "y": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
                              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
                    }],
                    "layout": {
                        "legend": {"orientation": "h", "x": 1, "xanchor": "right",
                                   "y": 1.02, "yanchor": "bottom"},
                        "shapes": [
                            {"fillcolor": "LightGreen", "layer": "below",
                             "line": {"width": 0}, "opacity": 0.5, "type": "rect",
                             "x0": 0, "x1": 1, "xref": "paper",
                             "y0": -0.13886233657597655, "y1": 0.19692424230124458,
                             "yref": "y"},
                            {"line": {"color": "Green", "width": 3}, "name": "Reference",
                             "type": "line", "x0": 0, "x1": 1, "xref": "paper",
                             "y0": 0.02903095286263403, "y1": 0.02903095286263403,
                             "yref": "y"}
                        ],
                        "showlegend": True,
                        "template": template,
                        "xaxis": {"title": {"text": "Timestamp"}},
                        "yaxis": {"title": {"text": "holiday"}}
                    }
                })
        ],
    )
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') target_column = column_mapping.get('target') prediction_column = column_mapping.get('prediction') num_feature_names = column_mapping.get('numerical_features') target_names = column_mapping.get('target_names') if num_feature_names is None: num_feature_names = [] else: num_feature_names = [name for name in num_feature_names if is_numeric_dtype(reference_data[name])] cat_feature_names = column_mapping.get('categorical_features') if cat_feature_names is None: cat_feature_names = [] else: cat_feature_names = [name for name in cat_feature_names if is_numeric_dtype(reference_data[name])] else: date_column = 'datetime' if 'datetime' in reference_data.columns else None id_column = None target_column = 'target' if 'target' in reference_data.columns else None prediction_column = 'prediction' if 'prediction' in reference_data.columns else None utility_columns = [date_column, id_column, target_column, prediction_column] target_names = None num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns)) cat_feature_names = list(set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns)) if prediction_column is not None and target_column is not None: binaraizer = preprocessing.LabelBinarizer() binaraizer.fit(reference_data[target_column]) binaraized_target = binaraizer.transform(reference_data[target_column]) if production_data is not None: ref_array_prediction = reference_data[prediction_column].to_numpy() ref_prediction_ids = np.argmax(ref_array_prediction, axis=-1) ref_prediction_labels = [prediction_column[x] for x in ref_prediction_ids] reference_data['prediction_labels'] = ref_prediction_labels prod_array_prediction = production_data[prediction_column].to_numpy() prod_prediction_ids = np.argmax(prod_array_prediction, axis=-1) prod_prediction_labels = [prediction_column[x] for x in prod_prediction_ids] production_data['prediction_labels'] = prod_prediction_labels additional_graphs_data = [] params_data = [] for feature_name in num_feature_names + cat_feature_names: #add data for table in params labels = prediction_column params_data.append( { "details": { "parts": [{"title":"All", "id":"All" + "_" + str(feature_name)}] + [{"title":str(label), "id": feature_name + "_" + str(label)} for label in labels], "insights": [] }, "f1": feature_name } ) #create confusion based plots reference_data['dataset'] = 'Reference' production_data['dataset'] = 'Current' merged_data = pd.concat([reference_data, production_data]) fig = px.histogram(merged_data, x=feature_name, color=target_column, facet_col="dataset", histnorm = '', category_orders={"dataset": ["Reference", "Current"]}) fig_json = json.loads(fig.to_json()) #write plot data in table as additional data additional_graphs_data.append( AdditionalGraphInfo( "All" + "_" + str(feature_name), { "data" : fig_json['data'], "layout" : fig_json['layout'] }, ) ) for label in labels: fig = make_subplots(rows=1, cols=2, subplot_titles=("Reference", "Current")) #REF fig.add_trace(go.Scatter( x = reference_data[reference_data[target_column] == label][feature_name], y = reference_data[reference_data[target_column] == label][label], mode = 'markers', name = str(label) + ' (ref)', marker=dict( size=6, color=red ) ), row=1, col=1 ) fig.add_trace(go.Scatter( x = reference_data[reference_data[target_column] != 
label][feature_name], y = reference_data[reference_data[target_column] != label][label], mode = 'markers', name = 'other (ref)', marker=dict( size=6, color=grey ) ), row=1, col=1 ) fig.update_layout( xaxis_title=feature_name, yaxis_title='Probability', xaxis = dict( showticklabels=True ), yaxis = dict( range=(0, 1), showticklabels=True ) ) #PROD Prediction fig.add_trace(go.Scatter( x = production_data[production_data[target_column] == label][feature_name], y = production_data[production_data[target_column] == label][label], mode = 'markers', name = str(label) + ' (curr)', marker=dict( size=6, color=red #set color equal to a variable ) ), row=1, col=2 ) fig.add_trace(go.Scatter( x = production_data[production_data[target_column] != label][feature_name], y = production_data[production_data[target_column] != label][label], mode = 'markers', name = 'other (curr)', marker=dict( size=6, color=grey #set color equal to a variable ) ), row=1, col=2 ) fig.update_layout( xaxis_title=feature_name, yaxis_title='Probability', xaxis = dict( showticklabels=True ), yaxis = dict( range=(0, 1), showticklabels=True ) ) # Update xaxis properties fig.update_xaxes(title_text=feature_name, showgrid=True, row=1, col=1) fig.update_xaxes(title_text=feature_name, showgrid=True, row=1, col=2) # Update yaxis properties fig.update_yaxes(title_text="Probability", showgrid=True, row=1, col=1) fig.update_yaxes(title_text="Probability", showgrid=True, row=1, col=2) fig_json = json.loads(fig.to_json()) #write plot data in table as additional data additional_graphs_data.append( AdditionalGraphInfo( feature_name + "_" + str(label), { "data" : fig_json['data'], "layout" : fig_json['layout'] }, ) ) self.wi = BaseWidgetInfo( title=self.title, type="big_table", details="", alertStats=AlertStats(), alerts=[], alertsPosition="row", insights=[], size=2, params={ "rowsPerPage" : min(len(num_feature_names) + len(cat_feature_names), 10), "columns": [ { "title": "Feature", "field": "f1" } ], "data": params_data }, additionalGraphs=additional_graphs_data ) else: ref_array_prediction = reference_data[prediction_column].to_numpy() ref_prediction_ids = np.argmax(ref_array_prediction, axis=-1) ref_prediction_labels = [prediction_column[x] for x in ref_prediction_ids] reference_data['prediction_labels'] = ref_prediction_labels additional_graphs_data = [] params_data = [] for feature_name in num_feature_names + cat_feature_names: #add data for table in params labels = prediction_column params_data.append( { "details": { "parts": [{"title":"All", "id":"All" + "_" + str(feature_name)}] + [{"title":str(label), "id": feature_name + "_" + str(label)} for label in labels], "insights": [] }, "f1": feature_name } ) #create confusion based plots fig = px.histogram(reference_data, x=feature_name, color=target_column, histnorm = '') fig_json = json.loads(fig.to_json()) #write plot data in table as additional data additional_graphs_data.append( AdditionalGraphInfo( "All" + "_" + str(feature_name), { "data" : fig_json['data'], "layout" : fig_json['layout'] }, ) ) for label in labels: fig = go.Figure() fig.add_trace(go.Scatter( x = reference_data[reference_data[target_column] == label][feature_name], y = reference_data[reference_data[target_column] == label][label], mode = 'markers', name = str(label), marker=dict( size=6, color=red #set color equal to a variable ) )) fig.add_trace(go.Scatter( x = reference_data[reference_data[target_column] != label][feature_name], y = reference_data[reference_data[target_column] != label][label], mode = 'markers', name = 
'other', marker=dict( size=6, color=grey ) )) fig.update_layout( xaxis_title=feature_name, yaxis_title='Probability', xaxis = dict( showticklabels=True ), yaxis = dict( range=(0, 1), showticklabels=True ) ) fig_json = json.loads(fig.to_json()) #write plot data in table as additional data additional_graphs_data.append( AdditionalGraphInfo( feature_name + "_" + str(label), { "data" : fig_json['data'], "layout" : fig_json['layout'] }, ) ) self.wi = BaseWidgetInfo( title=self.title, type="big_table", details="", alertStats=AlertStats(), alerts=[], alertsPosition="row", insights=[], size=2, params={ "rowsPerPage" : min(len(num_feature_names) + len(cat_feature_names), 10), "columns": [ { "title": "Feature", "field": "f1" } ], "data": params_data }, additionalGraphs=additional_graphs_data ) else: self.wi = None
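The table widget above recovers predicted class labels from the per-class probability columns with an argmax over each row. A toy illustration of that conversion; the column names are hypothetical:

# Toy illustration of the argmax-based label recovery used above;
# the probability column names are hypothetical.
import numpy as np
import pandas as pd

prediction_column = ['setosa', 'versicolor', 'virginica']
frame = pd.DataFrame({
    'setosa': [0.8, 0.1],
    'versicolor': [0.1, 0.7],
    'virginica': [0.1, 0.2],
})
array_prediction = frame[prediction_column].to_numpy()
prediction_ids = np.argmax(array_prediction, axis=-1)               # [0, 1]
prediction_labels = [prediction_column[x] for x in prediction_ids]  # ['setosa', 'versicolor']
print(prediction_labels)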
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') target_column = column_mapping.get('target') prediction_column = column_mapping.get('prediction') num_feature_names = column_mapping.get('numerical_features') #target_names = column_mapping.get('target_names') if num_feature_names is None: num_feature_names = [] else: num_feature_names = [name for name in num_feature_names if is_numeric_dtype(reference_data[name])] cat_feature_names = column_mapping.get('categorical_features') if cat_feature_names is None: cat_feature_names = [] else: cat_feature_names = [name for name in cat_feature_names if is_numeric_dtype(reference_data[name])] else: date_column = 'datetime' if 'datetime' in reference_data.columns else None id_column = None target_column = 'target' if 'target' in reference_data.columns else None prediction_column = 'prediction' if 'prediction' in reference_data.columns else None utility_columns = [date_column, id_column, target_column, prediction_column] num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns)) cat_feature_names = list(set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns)) #target_names = None if target_column is not None and prediction_column is not None: reference_data.replace([np.inf, -np.inf], np.nan, inplace=True) reference_data.dropna(axis=0, how='any', inplace=True) array_prediction = reference_data[prediction_column].to_numpy() prediction_ids = np.argmax(array_prediction, axis=-1) prediction_labels = [prediction_column[x] for x in prediction_ids] #plot support bar graphs = [] for label in prediction_column: pred_distr = ff.create_distplot( [ reference_data[reference_data[target_column] == label][label], reference_data[reference_data[target_column] != label][label] ], [str(label), "other"], colors=[red, grey], bin_size = 0.05, show_curve = False, show_rug=True ) pred_distr.update_layout( xaxis_title = "Probability", yaxis_title = "Share", legend = dict( orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1 ) ) pred_distr_json = json.loads(pred_distr.to_json()) graphs.append({ "id": "tab_" + str(label), "title": str(label), "graph":{ "data":pred_distr_json["data"], "layout":pred_distr_json["layout"], } }) self.wi = BaseWidgetInfo( title=self.title, type="tabbed_graph", details="", alertStats=AlertStats(), alerts=[], alertsPosition="row", insights=[], size=1 if production_data is not None else 2, params={ "graphs": graphs }, additionalGraphs=[], ) else: self.wi = None
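A standalone sketch of the per-class probability distplot built above, on synthetic scores; the hex values stand in for the module-level red and grey colors:

# Minimal sketch of the per-class probability distribution plot,
# with synthetic probabilities standing in for real model scores.
import numpy as np
import plotly.figure_factory as ff

red, grey = "#ed0400", "#4d4d4d"
label_probs = np.random.beta(5, 2, 200)    # scores where the class is the true label
other_probs = np.random.beta(2, 5, 200)    # scores where it is not
pred_distr = ff.create_distplot(
    [label_probs, other_probs],
    ["label", "other"],
    colors=[red, grey],
    bin_size=0.05,
    show_curve=False,
    show_rug=True,
)
pred_distr.update_layout(xaxis_title="Probability", yaxis_title="Share")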
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [
                name for name in num_feature_names
                if is_numeric_dtype(reference_data[name])
            ]
        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            cat_feature_names = [
                name for name in cat_feature_names
                if is_numeric_dtype(reference_data[name])
            ]
    else:
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None
        utility_columns = [date_column, id_column, target_column, prediction_column]
        num_feature_names = list(
            set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns))
        # np.object is a deprecated alias; plain `object` selects string columns
        cat_feature_names = list(
            set(reference_data.select_dtypes([object]).columns) - set(utility_columns))

    if target_column is not None:
        # calculate output drift with a chi-square test on category counts
        ref_feature_vc = reference_data[target_column][
            np.isfinite(reference_data[target_column])].value_counts()
        prod_feature_vc = production_data[target_column][
            np.isfinite(production_data[target_column])].value_counts()
        keys = set(
            list(reference_data[target_column][
                np.isfinite(reference_data[target_column])].unique()) +
            list(production_data[target_column][
                np.isfinite(production_data[target_column])].unique()))
        ref_feature_dict = dict.fromkeys(keys, 0)
        for key, item in zip(ref_feature_vc.index, ref_feature_vc.values):
            ref_feature_dict[key] = item
        prod_feature_dict = dict.fromkeys(keys, 0)
        for key, item in zip(prod_feature_vc.index, prod_feature_vc.values):
            prod_feature_dict[key] = item
        f_exp = [value[1] for value in sorted(ref_feature_dict.items())]
        f_obs = [value[1] for value in sorted(prod_feature_dict.items())]
        # scipy.stats.chisquare expects the observed frequencies first, and
        # both arrays must sum to the same total, so rescale the expected ones
        f_exp = np.array(f_exp, dtype=float) * (np.sum(f_obs) / np.sum(f_exp))
        target_p_value = chisquare(f_obs, f_exp)[1]
        target_sim_test = "detected" if target_p_value < 0.05 else "not detected"

        # plot output distributions
        fig = go.Figure()
        fig.add_trace(
            go.Histogram(x=reference_data[target_column], marker_color=grey,
                         opacity=0.6, nbinsx=10, name='Reference',
                         histnorm='probability'))
        fig.add_trace(
            go.Histogram(x=production_data[target_column], marker_color=red,
                         opacity=0.6, nbinsx=10, name='Production',
                         histnorm='probability'))
        fig.update_layout(
            legend=dict(orientation="h", yanchor="bottom", y=1.02,
                        xanchor="right", x=1),
            xaxis_title=target_column,
            yaxis_title="Share")
        target_drift_json = json.loads(fig.to_json())

        self.wi = BaseWidgetInfo(
            title="Target Drift: " + target_sim_test + ", p_value=" +
                  str(round(target_p_value, 6)),
            type="big_graph",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=2,
            params={
                "data": target_drift_json['data'],
                "layout": target_drift_json['layout']
            },
            additionalGraphs=[],
        )
    else:
        self.wi = None
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') target_column = column_mapping.get('target') prediction_column = column_mapping.get('prediction') num_feature_names = column_mapping.get('numerical_features') if num_feature_names is None: num_feature_names = [] else: num_feature_names = [ name for name in num_feature_names if is_numeric_dtype(reference_data[name]) ] cat_feature_names = column_mapping.get('categorical_features') if cat_feature_names is None: cat_feature_names = [] else: cat_feature_names = [ name for name in cat_feature_names if is_numeric_dtype(reference_data[name]) ] else: date_column = 'datetime' if 'datetime' in reference_data.columns else None id_column = None target_column = 'target' if 'target' in reference_data.columns else None prediction_column = 'prediction' if 'prediction' in reference_data.columns else None utility_columns = [ date_column, id_column, target_column, prediction_column ] num_feature_names = list( set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns)) cat_feature_names = list( set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns)) #set params data params_data = [] drifted_fetures_count = 0 #plt.ioff() for feature_name in num_feature_names: # + cat_feature_names: #feature_names: prod_small_hist = np.histogram( production_data[feature_name][np.isfinite( production_data[feature_name])], bins=10, density=True) ref_small_hist = np.histogram( reference_data[feature_name][np.isfinite( reference_data[feature_name])], bins=10, density=True) feature_type = 'num' p_value = ks_2samp(reference_data[feature_name], production_data[feature_name])[1] distr_sim_test = "Detected" if p_value < 0.05 else "Not Detected" drifted_fetures_count += 1 if p_value < 0.05 else 0 params_data.append({ "details": { "parts": [{ "title": "Data drift", "id": feature_name + "_drift", "type": "widget" }, { "title": "Data distribution", "id": feature_name + "_distr" }], "insights": [] }, "f1": feature_name, "f6": feature_type, "f3": { "x": list(ref_small_hist[1]), "y": list(ref_small_hist[0]) }, "f4": { "x": list(prod_small_hist[1]), "y": list(prod_small_hist[0]) }, "f2": distr_sim_test, "f5": round(p_value, 6) }) for feature_name in cat_feature_names: #feature_names: prod_small_hist = np.histogram( production_data[feature_name][np.isfinite( production_data[feature_name])], bins=10, density=True) ref_small_hist = np.histogram( reference_data[feature_name][np.isfinite( reference_data[feature_name])], bins=10, density=True) feature_type = 'cat' #p_value = ks_2samp(reference_data[feature_name], production_data[feature_name])[1] #CHI2 to be implemented for cases with different categories ref_feature_vc = reference_data[feature_name][np.isfinite( reference_data[feature_name])].value_counts() prod_feature_vc = production_data[feature_name][np.isfinite( production_data[feature_name])].value_counts() keys = set( list(reference_data[feature_name][np.isfinite( reference_data[feature_name])].unique()) + list(production_data[feature_name][np.isfinite( production_data[feature_name])].unique())) ref_feature_dict = dict.fromkeys(keys, 0) for key, item in zip(ref_feature_vc.index, ref_feature_vc.values): ref_feature_dict[key] = item prod_feature_dict = dict.fromkeys(keys, 0) for key, item in zip(prod_feature_vc.index, prod_feature_vc.values): prod_feature_dict[key] = item f_exp = [value[1] for value in 
sorted(ref_feature_dict.items())] f_obs = [value[1] for value in sorted(prod_feature_dict.items())] p_value = chisquare(f_exp, f_obs)[1] distr_sim_test = "Detected" if p_value < 0.05 else "Not Detected" drifted_fetures_count += 1 if p_value < 0.05 else 0 params_data.append({ "details": { "parts": [{ "title": "Data drift", "id": feature_name + "_drift", "type": "widget" }, { "title": "Data distribution", "id": feature_name + "_distr" }], "insights": [] }, "f1": feature_name, "f6": feature_type, "f3": { "x": list(ref_small_hist[1]), "y": list(ref_small_hist[0]) }, "f4": { "x": list(prod_small_hist[1]), "y": list(prod_small_hist[0]) }, "f2": distr_sim_test, "f5": round(p_value, 6) }) #set additionalGraphs additional_graphs_data = [] for feature_name in num_feature_names + cat_feature_names: #feature_names: #plot distributions fig = go.Figure() fig.add_trace( go.Histogram(x=reference_data[feature_name], marker_color=grey, opacity=0.6, nbinsx=10, name='Reference', histnorm='probability')) fig.add_trace( go.Histogram(x=production_data[feature_name], marker_color=red, opacity=0.6, nbinsx=10, name='Current', histnorm='probability')) fig.update_layout(legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1), xaxis_title=feature_name, yaxis_title="Share") distr_figure = json.loads(fig.to_json()) #plot drift reference_mean = np.mean(reference_data[feature_name][np.isfinite( reference_data[feature_name])]) reference_std = np.std(reference_data[feature_name][np.isfinite( reference_data[feature_name])], ddof=1) x_title = "Timestamp" if date_column else "Index" fig = go.Figure() fig.add_trace( go.Scatter(x=production_data[date_column] if date_column else production_data.index, y=production_data[feature_name], mode='markers', name='Current', marker=dict(size=6, color=grey))) fig.update_layout( xaxis_title=x_title, yaxis_title=feature_name, showlegend=True, legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1), shapes=[ dict( type="rect", # x-reference is assigned to the x-values xref="paper", # y-reference is assigned to the plot paper [0,1] yref="y", x0=0, y0=reference_mean - reference_std, x1=1, y1=reference_mean + reference_std, fillcolor="LightGreen", opacity=0.5, layer="below", line_width=0, ), dict( type="line", name='Reference', xref="paper", yref="y", x0=0, #min(testset_agg_by_date.index), y0=reference_mean, x1=1, #max(testset_agg_by_date.index), y1=reference_mean, line=dict(color="Green", width=3)), ]) drift_figure = json.loads(fig.to_json()) #add distributions data additional_graphs_data.append( AdditionalGraphInfo(feature_name + '_distr', { "data": distr_figure['data'], "layout": distr_figure['layout'] })) #add drift data additional_graphs_data.append( AdditionalGraphInfo( feature_name + '_drift', { "title": "", "size": 2, "text": "", "type": "big_graph", "params": { "data": drift_figure['data'], "layout": drift_figure['layout'] } })) self.wi = BaseWidgetInfo( title="Data Drift: drift detected for " + str(drifted_fetures_count) + " out of " + str(len(num_feature_names) + len(cat_feature_names)) + " features", type="big_table", details="", alertStats=AlertStats(), alerts=[], alertsPosition="row", insights=[], size=2, params={ "rowsPerPage": min(len(num_feature_names) + len(cat_feature_names), 10), "columns": [{ "title": "Feature", "field": "f1" }, { "title": "Type", "field": "f6" }, { "title": "Reference Distribution", "field": "f3", "type": "histogram", "options": { "xField": "x", "yField": "y" } }, { "title": "Current Distribution", "field": "f4", 
"type": "histogram", "options": { "xField": "x", "yField": "y" } }, { "title": "Data drift", "field": "f2" }, { "title": "P-Value for Similarity Test", "field": "f5", "sort": "asc" }], "data": params_data }, additionalGraphs=additional_graphs_data)
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') target_column = column_mapping.get('target') prediction_column = column_mapping.get('prediction') num_feature_names = column_mapping.get('numerical_features') if num_feature_names is None: num_feature_names = [] else: num_feature_names = [ name for name in num_feature_names if is_numeric_dtype(reference_data[name]) ] cat_feature_names = column_mapping.get('categorical_features') if cat_feature_names is None: cat_feature_names = [] else: cat_feature_names = [ name for name in cat_feature_names if is_numeric_dtype(reference_data[name]) ] else: date_column = 'datetime' if 'datetime' in reference_data.columns else None id_column = None target_column = 'target' if 'target' in reference_data.columns else None prediction_column = 'prediction' if 'prediction' in reference_data.columns else None utility_columns = [ date_column, id_column, target_column, prediction_column ] num_feature_names = list( set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns)) cat_feature_names = list( set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns)) if production_data is not None: production_data.replace([np.inf, -np.inf], np.nan, inplace=True) production_data.dropna(axis=0, how='any', inplace=True) reference_data.replace([np.inf, -np.inf], np.nan, inplace=True) reference_data.dropna(axis=0, how='any', inplace=True) ref_error = reference_data[prediction_column] - reference_data[ target_column] prod_error = production_data[prediction_column] - production_data[ target_column] ref_quntile_5 = np.quantile(ref_error, .05) ref_quntile_95 = np.quantile(ref_error, .95) prod_quntile_5 = np.quantile(prod_error, .05) prod_quntile_95 = np.quantile(prod_error, .95) #create subplots reference_data['dataset'] = 'Reference' reference_data['Error bias'] = list( map( lambda x: 'Underestimation' if x <= ref_quntile_5 else 'Majority' if x < ref_quntile_95 else 'Overestimation', ref_error)) production_data['dataset'] = 'Current' production_data['Error bias'] = list( map( lambda x: 'Underestimation' if x <= prod_quntile_5 else 'Majority' if x < prod_quntile_95 else 'Overestimation', prod_error)) merged_data = pd.concat([reference_data, production_data]) reference_data.drop(['dataset', 'Error bias'], axis=1, inplace=True) production_data.drop(['dataset', 'Error bias'], axis=1, inplace=True) params_data = [] additional_graphs_data = [] for feature_name in num_feature_names: feature_type = 'num' ref_overal_value = np.mean(reference_data[feature_name]) ref_under_value = np.mean( reference_data[ref_error <= ref_quntile_5][feature_name]) ref_expected_value = np.mean( reference_data[(ref_error > ref_quntile_5) & (ref_error < ref_quntile_95)][feature_name]) ref_over_value = np.mean( reference_data[ref_error >= ref_quntile_95][feature_name]) ref_range_value = 0 if ref_over_value == ref_under_value else 100 * abs( ref_over_value - ref_under_value) / (np.max(reference_data[feature_name]) - np.min(reference_data[feature_name])) prod_overal_value = np.mean(production_data[feature_name]) prod_under_value = np.mean(production_data[ prod_error <= prod_quntile_5][feature_name]) prod_expected_value = np.mean(production_data[ (prod_error > prod_quntile_5) & (prod_error < prod_quntile_95)][feature_name]) prod_over_value = np.mean(production_data[ prod_error >= prod_quntile_95][feature_name]) prod_range_value = 0 if 
prod_over_value == prod_under_value else 100 * abs( prod_over_value - prod_under_value) / ( np.max(production_data[feature_name]) - np.min(production_data[feature_name])) feature_hist = px.histogram( merged_data, x=feature_name, color='Error bias', facet_col="dataset", histnorm='percent', barmode='overlay', category_orders={ "dataset": ["Reference", "Current"], "Error bias": ["Underestimation", "Overestimation", "Majority"] }) feature_hist_json = json.loads(feature_hist.to_json()) params_data.append({ "details": { "parts": [{ "title": "Error bias", "id": feature_name + "_hist" }], "insights": [] }, "f1": feature_name, "f2": feature_type, "f3": round(ref_expected_value, 2), "f4": round(ref_under_value, 2), "f5": round(ref_over_value, 2), "f6": round(ref_range_value, 2), "f7": round(prod_expected_value, 2), "f8": round(prod_under_value, 2), "f9": round(prod_over_value, 2), "f10": round(prod_range_value, 2) }) additional_graphs_data.append( AdditionalGraphInfo( feature_name + '_hist', { "data": feature_hist_json['data'], "layout": feature_hist_json['layout'] })) for feature_name in cat_feature_names: feature_type = 'cat' ref_overal_value = reference_data[feature_name].value_counts( ).idxmax() ref_under_value = reference_data[ref_error <= ref_quntile_5][ feature_name].value_counts().idxmax() #ref_expected_value = reference_data[(ref_error > ref_quntile_5) & (ref_error < ref_quntile_95)][feature_name].value_counts().idxmax() ref_over_value = reference_data[ref_error >= ref_quntile_95][ feature_name].value_counts().idxmax() ref_range_value = 1 if (ref_overal_value != ref_under_value) or (ref_over_value != ref_overal_value) \ or (ref_under_value != ref_overal_value) else 0 prod_overal_value = production_data[feature_name].value_counts( ).idxmax() prod_under_value = production_data[ prod_error <= prod_quntile_5][feature_name].value_counts( ).idxmax() #prod_expected_value = production_data[(prod_error > prod_quntile_5) & (prod_error < prod_quntile_95)][feature_name].value_counts().idxmax() prod_over_value = production_data[ prod_error >= prod_quntile_95][feature_name].value_counts( ).idxmax() prod_range_value = 1 if (prod_overal_value != prod_under_value) or (prod_over_value != prod_overal_value) \ or (prod_under_value != prod_overal_value) else 0 feature_hist = px.histogram( merged_data, x=feature_name, color='Error bias', facet_col="dataset", histnorm='percent', barmode='overlay', category_orders={ "dataset": ["Reference", "Current"], "Error bias": ["Underestimation", "Overestimation", "Majority"] }) feature_hist_json = json.loads(feature_hist.to_json()) params_data.append({ "details": { "parts": [{ "title": "Error bias", "id": feature_name + "_hist" }], "insights": [] }, "f1": feature_name, "f2": feature_type, "f3": str(ref_overal_value), "f4": str(ref_under_value), "f5": str(ref_over_value), "f6": str(ref_range_value), "f7": str(prod_overal_value), "f8": str(prod_under_value), "f9": str(prod_over_value), "f10": int(prod_range_value) }) additional_graphs_data.append( AdditionalGraphInfo( feature_name + '_hist', { "data": feature_hist_json['data'], "layout": feature_hist_json['layout'] })) self.wi = BaseWidgetInfo( title=self.title, type="big_table", details="", alertStats=AlertStats(), alerts=[], alertsPosition="row", insights=[], size=2, params={ "rowsPerPage": min(len(num_feature_names) + len(cat_feature_names), 10), "columns": [{ "title": "Feature", "field": "f1" }, { "title": "Type", "field": "f2" }, { "title": "REF: Majority", "field": "f3" }, { "title": "REF: Under", "field": "f4" }, { 
"title": "REF: Over", "field": "f5" }, { "title": "REF: Range(%)", "field": "f6" }, { "title": "CURR: Majority", "field": "f7" }, { "title": "CURR: Under", "field": "f8" }, { "title": "CURR: Over", "field": "f9" }, { "title": "CURR: Range(%)", "field": "f10", "sort": "desc" }], "data": params_data }, additionalGraphs=additional_graphs_data) else: reference_data.replace([np.inf, -np.inf], np.nan, inplace=True) reference_data.dropna(axis=0, how='any', inplace=True) error = reference_data[prediction_column] - reference_data[ target_column] quntile_5 = np.quantile(error, .05) quntile_95 = np.quantile(error, .95) reference_data['Error bias'] = reference_data['Error bias'] = list( map( lambda x: 'Underestimation' if x <= quntile_5 else 'Majority' if x < quntile_95 else 'Overestimation', error)) params_data = [] additional_graphs_data = [] for feature_name in num_feature_names: # + cat_feature_names: #feature_names: feature_type = 'num' ref_overal_value = np.mean(reference_data[feature_name]) ref_under_value = np.mean( reference_data[error <= quntile_5][feature_name]) #ref_expected_value = np.mean(reference_data[(error > quntile_5) & (error < quntile_95)][feature_name]) ref_over_value = np.mean( reference_data[error >= quntile_95][feature_name]) ref_range_value = 0 if ref_over_value == ref_under_value else 100 * abs( ref_over_value - ref_under_value) / (np.max(reference_data[feature_name]) - np.min(reference_data[feature_name])) hist = px.histogram( reference_data, x=feature_name, color='Error bias', histnorm='percent', barmode='overlay', category_orders={ "Error bias": ["Underestimation", "Overestimation", "Majority"] }) #hist_fig = px.histogram(reference_data, x=feature_name, color=target_column, facet_col="dataset", # category_orders={"dataset": ["Reference", "Production"]}) hist_figure = json.loads(hist.to_json()) params_data.append({ "details": { "parts": [{ "title": "Error bias", "id": feature_name + "_hist" }], "insights": [] }, "f1": feature_name, "f2": feature_type, "f3": round(ref_overal_value, 2), "f4": round(ref_under_value, 2), "f5": round(ref_over_value, 2), "f6": round(ref_range_value, 2) }) additional_graphs_data.append( AdditionalGraphInfo( feature_name + '_hist', { "data": hist_figure['data'], "layout": hist_figure['layout'] })) for feature_name in cat_feature_names: #feature_names: feature_type = 'cat' ref_overal_value = reference_data[feature_name].value_counts( ).idxmax() ref_under_value = reference_data[ error <= quntile_5][feature_name].value_counts().idxmax() #ref_expected_value = reference_data[(error > quntile_5) & (error < quntile_95)][feature_name].value_counts().idxmax() ref_over_value = reference_data[ error >= quntile_95][feature_name].value_counts().idxmax() ref_range_value = 1 if (ref_overal_value != ref_under_value) or (ref_over_value != ref_overal_value) \ or (ref_under_value != ref_overal_value) else 0 hist = px.histogram( reference_data, x=feature_name, color='Error bias', histnorm='percent', barmode='overlay', category_orders={ "Error bias": ["Underestimation", "Overestimation", "Majority"] }) #hist_fig = px.histogram(reference_data, x=feature_name, color=target_column, facet_col="dataset", # category_orders={"dataset": ["Reference", "Production"]}) hist_figure = json.loads(hist.to_json()) params_data.append({ "details": { "parts": [{ "title": "Error bias", "id": feature_name + "_hist" }], "insights": [] }, "f1": feature_name, "f2": feature_type, "f3": str(ref_overal_value), "f4": str(ref_under_value), "f5": str(ref_over_value), "f6": int(ref_range_value) }) 
additional_graphs_data.append( AdditionalGraphInfo( feature_name + '_hist', { "data": hist_figure['data'], "layout": hist_figure['layout'] })) reference_data.drop('Error bias', axis=1, inplace=True) self.wi = BaseWidgetInfo( title=self.title, type="big_table", details="", alertStats=AlertStats(), alerts=[], alertsPosition="row", insights=[], size=2, params={ "rowsPerPage": min(len(num_feature_names) + len(cat_feature_names), 10), "columns": [{ "title": "Feature", "field": "f1" }, { "title": "Type", "field": "f2" }, { "title": "Majority", "field": "f3" }, { "title": "Underestimation", "field": "f4" }, { "title": "Overestimation", "field": "f5" }, { "title": "Range(%)", "field": "f6", "sort": "desc" }], "data": params_data }, additionalGraphs=additional_graphs_data)
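The error-bias table above splits rows by the 5th and 95th percentiles of the prediction error. A compact sketch of that labelling; label_error_bias is a hypothetical helper name:

# Sketch of the quantile-based error-bias labelling: errors below the
# 5th percentile count as underestimation, above the 95th as
# overestimation, everything in between as the majority group.
import numpy as np
import pandas as pd


def label_error_bias(frame: pd.DataFrame, target_column: str, prediction_column: str) -> pd.Series:
    error = frame[prediction_column] - frame[target_column]
    quantile_5 = np.quantile(error, .05)
    quantile_95 = np.quantile(error, .95)
    return pd.Series(
        np.where(error <= quantile_5, 'Underestimation',
                 np.where(error < quantile_95, 'Majority', 'Overestimation')),
        index=frame.index,
    )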
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') target_column = column_mapping.get('target') prediction_column = column_mapping.get('prediction') num_feature_names = column_mapping.get('numerical_features') if num_feature_names is None: num_feature_names = [] else: num_feature_names = [ name for name in num_feature_names if is_numeric_dtype(reference_data[name]) ] cat_feature_names = column_mapping.get('categorical_features') if cat_feature_names is None: cat_feature_names = [] else: cat_feature_names = [ name for name in cat_feature_names if is_numeric_dtype(reference_data[name]) ] else: date_column = 'datetime' if 'datetime' in reference_data.columns else None id_column = None target_column = 'target' if 'target' in reference_data.columns else None prediction_column = 'prediction' if 'prediction' in reference_data.columns else None utility_columns = [ date_column, id_column, target_column, prediction_column ] num_feature_names = list( set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns)) cat_feature_names = list( set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns)) if production_data is not None: if target_column is not None and prediction_column is not None: production_data.replace([np.inf, -np.inf], np.nan, inplace=True) production_data.dropna(axis=0, how='any', inplace=True) #plot output correlations pred_actual_time = go.Figure() error_trace = go.Scatter( x=production_data[date_column] if date_column else production_data.index, y=production_data[prediction_column] - production_data[target_column], mode='lines', name='Predicted - Actual', marker=dict(size=6, color=red)) zero_trace = go.Scatter( x=production_data[date_column] if date_column else production_data.index, y=[0] * production_data.shape[0], mode='lines', opacity=0.5, marker=dict( size=6, color='green', ), showlegend=False, ) pred_actual_time.add_trace(error_trace) pred_actual_time.add_trace(zero_trace) pred_actual_time.update_layout( xaxis_title="Timestamp" if date_column else "Index", yaxis_title="Error", legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)) pred_actual_time_json = json.loads(pred_actual_time.to_json()) self.wi = BaseWidgetInfo( title=self.title, type="big_graph", details="", alertStats=AlertStats(), alerts=[], alertsPosition="row", insights=[], size=1, params={ "data": pred_actual_time_json['data'], "layout": pred_actual_time_json['layout'] }, additionalGraphs=[], ) else: self.wi = None else: self.wi = None
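A compact sketch of the error-over-time trace built above, on a synthetic frame; the zero baseline is drawn the same way, as a flat second trace:

# Sketch of the error-over-time plot, on synthetic data.
import numpy as np
import pandas as pd
import plotly.graph_objs as go

red = "#ed0400"
frame = pd.DataFrame({
    'datetime': pd.date_range('2021-01-01', periods=100, freq='H'),
    'target': np.random.normal(10, 2, 100),
})
frame['prediction'] = frame['target'] + np.random.normal(0, 1, 100)
fig = go.Figure()
fig.add_trace(go.Scatter(x=frame['datetime'],
                         y=frame['prediction'] - frame['target'],
                         mode='lines', name='Predicted - Actual',
                         marker=dict(size=6, color=red)))
fig.add_trace(go.Scatter(x=frame['datetime'], y=[0] * frame.shape[0],
                         mode='lines', opacity=0.5, showlegend=False))
fig.update_layout(xaxis_title="Timestamp", yaxis_title="Error")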
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') target_column = column_mapping.get('target') prediction_column = column_mapping.get('prediction') num_feature_names = column_mapping.get('numerical_features') #target_names = column_mapping.get('target_names') if num_feature_names is None: num_feature_names = [] else: num_feature_names = [ name for name in num_feature_names if is_numeric_dtype(reference_data[name]) ] cat_feature_names = column_mapping.get('categorical_features') if cat_feature_names is None: cat_feature_names = [] else: cat_feature_names = [ name for name in cat_feature_names if is_numeric_dtype(reference_data[name]) ] else: date_column = 'datetime' if 'datetime' in reference_data.columns else None id_column = None target_column = 'target' if 'target' in reference_data.columns else None prediction_column = 'prediction' if 'prediction' in reference_data.columns else None utility_columns = [ date_column, id_column, target_column, prediction_column ] num_feature_names = list( set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns)) cat_feature_names = list( set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns)) #target_names = None if production_data is not None and target_column is not None and prediction_column is not None: production_data.replace([np.inf, -np.inf], np.nan, inplace=True) production_data.dropna(axis=0, how='any', inplace=True) #array_prediction = reference_data[prediction_column].to_numpy() #prediction_ids = np.argmax(array_prediction, axis=-1) #prediction_labels = [prediction_column[x] for x in prediction_ids] if len(prediction_column) <= 2: binaraizer = preprocessing.LabelBinarizer() binaraizer.fit(reference_data[target_column]) binaraized_target = pd.DataFrame( binaraizer.transform(production_data[target_column])) binaraized_target.columns = ['target'] params_data = [] step_size = 0.05 binded = list( zip(binaraized_target['target'].tolist(), production_data[prediction_column[0]].tolist())) binded.sort(key=lambda item: item[1], reverse=True) data_size = len(binded) target_class_size = sum([x[0] for x in binded]) #result = pd.DataFrame(columns = ['Top(%)', 'Count', 'TP', 'FP', 'precision', 'recall']) offset = max(round(data_size * step_size), 1) for step in np.arange(offset, data_size + offset, offset): count = min(step, data_size) prob = round(binded[min(step, data_size - 1)][1], 2) top = round(100.0 * min(step, data_size) / data_size, 1) tp = sum([x[0] for x in binded[:min(step, data_size)]]) fp = count - tp precision = round(100.0 * tp / count, 1) recall = round(100.0 * tp / target_class_size, 1) params_data.append({ 'f1': float(top), 'f2': int(count), 'f3': float(prob), 'f4': int(tp), 'f5': int(fp), 'f6': float(precision), 'f7': float(recall) }) self.wi = BaseWidgetInfo(title=self.title, type="big_table", details="", alertStats=AlertStats(), alerts=[], alertsPosition="row", insights=[], size=1, params={ "rowsPerPage": 21, "columns": [{ "title": "Top(%)", "field": "f1", "sort": "asc" }, { "title": "Count", "field": "f2", }, { "title": "Prob", "field": "f3", }, { "title": "TP", "field": "f4" }, { "title": "FP", "field": "f5" }, { "title": "Precision", "field": "f6" }, { "title": "Recall", "field": "f7" }], "data": params_data }, additionalGraphs=[]) else: binaraizer = preprocessing.LabelBinarizer() binaraizer.fit(reference_data[target_column]) binaraized_target = 
pd.DataFrame( binaraizer.transform(production_data[target_column])) binaraized_target.columns = prediction_column #create tables tabs = [] for label in prediction_column: params_data = [] step_size = 0.05 binded = list( zip(binaraized_target[label].tolist(), production_data[label].tolist())) binded.sort(key=lambda item: item[1], reverse=True) data_size = len(binded) target_class_size = sum([x[0] for x in binded]) #result = pd.DataFrame(columns = ['Top(%)', 'Count', 'TP', 'FP', 'precision', 'recall']) offset = max(round(data_size * step_size), 1) for step in np.arange(offset, data_size + offset, offset): count = min(step, data_size) prob = round(binded[min(step, data_size - 1)][1], 2) top = round(100.0 * min(step, data_size) / data_size, 1) tp = sum([x[0] for x in binded[:min(step, data_size)]]) fp = count - tp precision = round(100.0 * tp / count, 1) recall = round(100.0 * tp / target_class_size, 1) params_data.append({ 'f1': float(top), 'f2': int(count), 'f3': float(prob), 'f4': int(tp), 'f5': int(fp), 'f6': float(precision), 'f7': float(recall) }) tabs.append( TabInfo(id=label, title=label, widget=BaseWidgetInfo(title="", type="big_table", details="", alertStats=AlertStats(), alerts=[], alertsPosition="row", insights=[], size=2, params={ "rowsPerPage": 21, "columns": [{ "title": "Top(%)", "field": "f1", "sort": "asc" }, { "title": "Count", "field": "f2", }, { "title": "Prob", "field": "f3", }, { "title": "TP", "field": "f4" }, { "title": "FP", "field": "f5" }, { "title": "Precision", "field": "f6" }, { "title": "Recall", "field": "f7" }], "data": params_data }, additionalGraphs=[]))) self.wi = BaseWidgetInfo(type="tabs", title=self.title, size=1, details="", tabs=tabs) else: self.wi = None
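The tables above walk a probability-sorted ranking in 5% steps to build top-share precision and recall. A self-contained sketch of that computation; top_share_table is a hypothetical helper name:

# Sketch of the top-share precision/recall table: sort by predicted
# probability, then walk down the ranking in fixed-size steps.
import numpy as np


def top_share_table(y_true, y_prob, step_size=0.05):
    binded = sorted(zip(y_true, y_prob), key=lambda item: item[1], reverse=True)
    data_size = len(binded)
    target_class_size = sum(x[0] for x in binded)
    offset = max(round(data_size * step_size), 1)
    rows = []
    for step in np.arange(offset, data_size + offset, offset):
        count = min(step, data_size)
        tp = sum(x[0] for x in binded[:count])           # true positives in the top slice
        rows.append({
            'top_pct': round(100.0 * count / data_size, 1),
            'count': int(count),
            'prob': round(binded[min(step, data_size - 1)][1], 2),
            'tp': int(tp),
            'fp': int(count - tp),
            'precision': round(100.0 * tp / count, 1),
            'recall': round(100.0 * tp / target_class_size, 1),
        })
    return rows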
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') target_column = column_mapping.get('target') prediction_column = column_mapping.get('prediction') num_feature_names = column_mapping.get('numerical_features') target_names = column_mapping.get('target_names') if num_feature_names is None: num_feature_names = [] else: num_feature_names = [ name for name in num_feature_names if is_numeric_dtype(reference_data[name]) ] cat_feature_names = column_mapping.get('categorical_features') if cat_feature_names is None: cat_feature_names = [] else: cat_feature_names = [ name for name in cat_feature_names if is_numeric_dtype(reference_data[name]) ] else: date_column = 'datetime' if 'datetime' in reference_data.columns else None id_column = None target_column = 'target' if 'target' in reference_data.columns else None prediction_column = 'prediction' if 'prediction' in reference_data.columns else None utility_columns = [ date_column, id_column, target_column, prediction_column ] num_feature_names = list( set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns)) cat_feature_names = list( set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns)) target_names = None if production_data is not None and target_column is not None and prediction_column is not None: production_data.replace([np.inf, -np.inf], np.nan, inplace=True) production_data.dropna(axis=0, how='any', inplace=True) #plot support bar metrics_matrix = metrics.classification_report( production_data[target_column], production_data[prediction_column], output_dict=True) metrics_frame = pd.DataFrame(metrics_matrix) z = metrics_frame.iloc[:-1, :-3].values x = target_names if target_names else metrics_frame.columns.tolist( )[:-3] y = ['precision', 'recall', 'f1-score'] # change each element of z to type string for annotations z_text = [[str(round(y, 3)) for y in x] for x in z] # set up figure fig = ff.create_annotated_heatmap(z, x=x, y=y, annotation_text=z_text, colorscale='bluered', showscale=True) fig.update_layout(xaxis_title="Class", yaxis_title="Metric") metrics_matrix_json = json.loads(fig.to_json()) self.wi = BaseWidgetInfo( title=self.title, type="big_graph", details="", alertStats=AlertStats(), alerts=[], alertsPosition="row", insights=[], size=1, params={ "data": metrics_matrix_json['data'], "layout": metrics_matrix_json['layout'] }, additionalGraphs=[], ) else: self.wi = None
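A minimal, runnable sketch of the classification-report heatmap above, using toy labels; the slicing drops the support row and the accuracy/avg columns, as in the widget:

# Sketch of the classification-report heatmap on toy label data.
import pandas as pd
import plotly.figure_factory as ff
from sklearn import metrics

y_true = ['cat', 'dog', 'dog', 'cat', 'dog']
y_pred = ['cat', 'dog', 'cat', 'cat', 'dog']
report = metrics.classification_report(y_true, y_pred, output_dict=True)
frame = pd.DataFrame(report)
z = frame.iloc[:-1, :-3].values   # drop support row, accuracy/macro/weighted columns
x = frame.columns.tolist()[:-3]   # class labels
y = ['precision', 'recall', 'f1-score']
z_text = [[str(round(value, 3)) for value in row] for row in z]
fig = ff.create_annotated_heatmap(z, x=x, y=y, annotation_text=z_text,
                                  colorscale='bluered', showscale=True)
fig.update_layout(xaxis_title="Class", yaxis_title="Metric")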
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') target_column = column_mapping.get('target') prediction_column = column_mapping.get('prediction') num_feature_names = column_mapping.get('numerical_features') if num_feature_names is None: num_feature_names = [] else: num_feature_names = [ name for name in num_feature_names if is_numeric_dtype(reference_data[name]) ] cat_feature_names = column_mapping.get('categorical_features') if cat_feature_names is None: cat_feature_names = [] else: cat_feature_names = [ name for name in cat_feature_names if is_numeric_dtype(reference_data[name]) ] else: date_column = 'datetime' if 'datetime' in reference_data.columns else None id_column = None target_column = 'target' if 'target' in reference_data.columns else None prediction_column = 'prediction' if 'prediction' in reference_data.columns else None utility_columns = [ date_column, id_column, target_column, prediction_column ] num_feature_names = list( set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns)) cat_feature_names = list( set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns)) if prediction_column is not None or target_column is not None: additional_graphs_data = [] params_data = [] for feature_name in num_feature_names + cat_feature_names: #add data for table in params params_data.append({ "details": { "parts": [{ "title": "Feature values", "id": feature_name + "_values" }], "insights": [] }, "f1": feature_name }) #create plot fig = make_subplots(rows=1, cols=2, subplot_titles=("Reference", "Production")) if prediction_column is not None: fig.add_trace(go.Scatter( x=reference_data[feature_name], y=reference_data[prediction_column], mode='markers', name='Prediction (ref)', marker=dict(size=6, color=grey)), row=1, col=1) if target_column is not None: fig.add_trace(go.Scatter(x=reference_data[feature_name], y=reference_data[target_column], mode='markers', name='Target (ref)', marker=dict(size=6, color=red)), row=1, col=1) if prediction_column is not None: fig.add_trace(go.Scatter( x=production_data[feature_name], y=production_data[prediction_column], mode='markers', name='Prediction (prod)', marker=dict(size=6, color=grey)), row=1, col=2) if target_column is not None: fig.add_trace(go.Scatter(x=production_data[feature_name], y=production_data[target_column], mode='markers', name='Target (prod)', marker=dict(size=6, color=red)), row=1, col=2) # Update xaxis properties fig.update_xaxes(title_text=feature_name, showgrid=True, row=1, col=1) fig.update_xaxes(title_text=feature_name, showgrid=True, row=1, col=2) # Update yaxis properties fig.update_yaxes(title_text="Value", showgrid=True, row=1, col=1) fig.update_yaxes(title_text="Value", showgrid=True, row=1, col=2) fig_json = json.loads(fig.to_json()) #write plot data in table as additional data additional_graphs_data.append( AdditionalGraphInfo(feature_name + '_values', { "data": fig_json['data'], "layout": fig_json['layout'] })) self.wi = BaseWidgetInfo( title=self.title, type="big_table", details="", alertStats=AlertStats(), alerts=[], alertsPosition="row", insights=[], size=2, params={ "rowsPerPage": min(len(num_feature_names) + len(cat_feature_names), 10), "columns": [{ "title": "Feature", "field": "f1" }], "data": params_data }, additionalGraphs=additional_graphs_data) else: self.wi = None
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') target_column = column_mapping.get('target') prediction_column = column_mapping.get('prediction') num_feature_names = column_mapping.get('numerical_features') if num_feature_names is None: num_feature_names = [] else: num_feature_names = [name for name in num_feature_names if is_numeric_dtype(reference_data[name])] cat_feature_names = column_mapping.get('categorical_features') if cat_feature_names is None: cat_feature_names = [] else: cat_feature_names = [name for name in cat_feature_names if is_numeric_dtype(reference_data[name])] else: date_column = 'datetime' if 'datetime' in reference_data.columns else None id_column = None target_column = 'target' if 'target' in reference_data.columns else None prediction_column = 'prediction' if 'prediction' in reference_data.columns else None utility_columns = [date_column, id_column, target_column, prediction_column] num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns)) cat_feature_names = list(set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns)) if target_column is not None and prediction_column is not None: reference_data.replace([np.inf, -np.inf], np.nan, inplace=True) reference_data.dropna(axis=0, how='any', inplace=True) binaraizer = preprocessing.LabelBinarizer() binaraizer.fit(reference_data[target_column]) binaraized_target = binaraizer.transform(reference_data[target_column]) array_prediction = reference_data[prediction_column].to_numpy() prediction_ids = np.argmax(array_prediction, axis=-1) prediction_labels = [prediction_column[x] for x in prediction_ids] labels = sorted(set(reference_data[target_column])) #calculate quality metrics if len(prediction_column) > 2: roc_auc = metrics.roc_auc_score(binaraized_target, array_prediction, average='macro') log_loss = metrics.log_loss(binaraized_target, array_prediction) else: roc_auc = metrics.roc_auc_score(binaraized_target, reference_data[prediction_column[0]], #problem!!! average='macro') log_loss = metrics.log_loss(binaraized_target, reference_data[prediction_column[0]]) #problem!!! accuracy_score = metrics.accuracy_score(reference_data[target_column], prediction_labels) avg_precision = metrics.precision_score(reference_data[target_column], prediction_labels, average='macro') avg_recall = metrics.recall_score(reference_data[target_column], prediction_labels, average='macro') avg_f1 = metrics.f1_score(reference_data[target_column], prediction_labels, average='macro') self.wi = BaseWidgetInfo( title=self.title, type="counter", details="", alertStats=AlertStats(), alerts=[], alertsPosition="row", insights=[], size=2, params={ "counters": [ { "value": str(round(accuracy_score, 3)), "label": "Accuracy" }, { "value": str(round(avg_precision, 3)), "label": "Precision" }, { "value": str(round(avg_recall, 3)), "label": "Recall" }, { "value": str(round(avg_f1, 3)), "label": "F1" }, { "value": str(round(roc_auc, 3)), "label": "ROC AUC" }, { "value": str(round(log_loss, 3)), "label": "LogLoss" } ] }, additionalGraphs=[], ) else: self.wi = None
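A sketch of the probabilistic quality metrics computed above, assuming, as elsewhere in this module, that prediction_column lists the per-class probability columns; in the binary branch it assumes prediction_column[0] holds the positive-class probability, which is the point the #problem!!! comments flag:

# Sketch of ROC AUC and log loss over binarized targets; the helper
# name and the positive-class assumption are ours, not the module's.
from sklearn import metrics, preprocessing


def prob_classification_metrics(frame, target_column, prediction_column):
    binarizer = preprocessing.LabelBinarizer().fit(frame[target_column])
    binarized_target = binarizer.transform(frame[target_column])
    array_prediction = frame[prediction_column].to_numpy()
    if len(prediction_column) > 2:
        # multiclass: macro-averaged one-vs-rest AUC over the label matrix
        roc_auc = metrics.roc_auc_score(binarized_target, array_prediction, average='macro')
        log_loss = metrics.log_loss(binarized_target, array_prediction)
    else:
        # binary: LabelBinarizer yields a single 0/1 column; score it against
        # the positive-class probability (assumed to be prediction_column[0])
        roc_auc = metrics.roc_auc_score(binarized_target.ravel(), frame[prediction_column[0]])
        log_loss = metrics.log_loss(binarized_target.ravel(), frame[prediction_column[0]])
    return roc_auc, log_loss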
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') target_column = column_mapping.get('target') prediction_column = column_mapping.get('prediction') num_feature_names = column_mapping.get('numerical_features') if num_feature_names is None: num_feature_names = [] else: num_feature_names = [name for name in num_feature_names if is_numeric_dtype(reference_data[name])] cat_feature_names = column_mapping.get('categorical_features') if cat_feature_names is None: cat_feature_names = [] else: cat_feature_names = [name for name in cat_feature_names if is_numeric_dtype(reference_data[name])] else: date_column = 'datetime' if 'datetime' in reference_data.columns else None id_column = None target_column = 'target' if 'target' in reference_data.columns else None prediction_column = 'prediction' if 'prediction' in reference_data.columns else None utility_columns = [date_column, id_column, target_column, prediction_column] num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns)) cat_feature_names = list(set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns)) if production_data is not None: if target_column is not None and prediction_column is not None: production_data.replace([np.inf, -np.inf], np.nan, inplace=True) production_data.dropna(axis=0, how='any', inplace=True) #calculate quality metrics accuracy_score = metrics.accuracy_score(production_data[target_column], production_data[prediction_column]) avg_precision = metrics.precision_score(production_data[target_column], production_data[prediction_column], average='macro') avg_recall = metrics.recall_score(production_data[target_column], production_data[prediction_column], average='macro') avg_f1 = metrics.f1_score(production_data[target_column], production_data[prediction_column], average='macro') self.wi = BaseWidgetInfo( title=self.title, type="counter", details="", alertStats=AlertStats(), alerts=[], alertsPosition="row", insights=[], size=2, params={ "counters": [ { "value": str(round(accuracy_score, 3)), "label": "Accuracy" }, { "value": str(round(avg_precision, 3)), "label": "Precision" }, { "value": str(round(avg_recall, 3)), "label": "Recall" }, { "value": str(round(avg_f1, 3)), "label": "F1" } ] }, additionalGraphs=[], ) else: self.wi = None else: self.wi = None
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [
                name for name in num_feature_names
                if is_numeric_dtype(reference_data[name])
            ]
        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            cat_feature_names = [
                name for name in cat_feature_names
                if is_numeric_dtype(reference_data[name])
            ]
    else:
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None
        utility_columns = [date_column, id_column, target_column, prediction_column]
        num_feature_names = list(
            set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns))
        # np.object is a deprecated alias; plain `object` selects string columns
        cat_feature_names = list(
            set(reference_data.select_dtypes([object]).columns) - set(utility_columns))

    if target_column is not None and prediction_column is not None:
        self.wi = BaseWidgetInfo(
            title=self.title,
            type="counter",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=2,
            params={
                "counters": [{
                    "value": "",
                    "label": "Regression Model Performance Report. Target: '" + target_column + "'"
                }]
            },
            additionalGraphs=[],
        )
    else:
        self.wi = None
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') target_column = column_mapping.get('target') prediction_column = column_mapping.get('prediction') num_feature_names = column_mapping.get('numerical_features') if num_feature_names is None: num_feature_names = [] else: num_feature_names = [name for name in num_feature_names if is_numeric_dtype(reference_data[name])] cat_feature_names = column_mapping.get('categorical_features') if cat_feature_names is None: cat_feature_names = [] else: cat_feature_names = [name for name in cat_feature_names if is_numeric_dtype(reference_data[name])] else: date_column = 'datetime' if 'datetime' in reference_data.columns else None id_column = None target_column = 'target' if 'target' in reference_data.columns else None prediction_column = 'prediction' if 'prediction' in reference_data.columns else None utility_columns = [date_column, id_column, target_column, prediction_column] num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns)) cat_feature_names = list(set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns)) if target_column is not None and prediction_column is not None: reference_data.replace([np.inf, -np.inf], np.nan, inplace=True) reference_data.dropna(axis=0, how='any', inplace=True) #calculate quality metrics me = np.mean(reference_data[prediction_column] - reference_data[target_column]) sde = np.std(reference_data[prediction_column] - reference_data[target_column], ddof = 1) abs_err = list(map(lambda x : abs(x[0] - x[1]), zip(reference_data[target_column], reference_data[prediction_column]))) mae = np.mean(abs_err) sdae = np.std(abs_err, ddof = 1) abs_perc_err = list(map(lambda x : 100*abs(x[0] - x[1])/x[0], zip(reference_data[target_column], reference_data[prediction_column]))) mape = np.mean(abs_perc_err) sdape = np.std(abs_perc_err, ddof = 1) #sqrt_err = list(map(lambda x : (x[0] - x[1])**2, # zip(reference_data[target_column], reference_data[prediction_column]))) #mse = np.mean(sqrt_err) #sdse = np.std(sqrt_err, ddof = 1) #error_norm_json = json.loads(error_norm.to_json()) self.wi = BaseWidgetInfo( title="Reference: Model Quality (+/- std)", type="counter", details="", alertStats=AlertStats(), alerts=[], alertsPosition="row", insights=[], size=2, params={ "counters": [ { "value": str(round(me, 2)) + " (" + str(round(sde,2)) + ")", "label": "ME" }, { "value": str(round(mae, 2)) + " (" + str(round(sdae,2)) + ")", "label": "MAE" }, { "value": str(round(mape, 2)) + " (" + str(round(sdape, 2)) + ")", "label": "MAPE" }#, #{ # "value": str(round(mse, 2)) + " (" + str(round(sdse, 2)) + ")", # "label": "MSE" #} ] }, additionalGraphs=[], ) else: self.wi = None
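A vectorized sketch of the error statistics above. MAPE divides by the target value, so this variant excludes zero-valued targets rather than dividing by zero; the helper name is hypothetical:

# Sketch of ME/MAE/MAPE with their standard deviations, vectorized.
import numpy as np
import pandas as pd


def regression_error_stats(frame: pd.DataFrame, target_column: str, prediction_column: str):
    error = frame[prediction_column] - frame[target_column]
    me, sde = np.mean(error), np.std(error, ddof=1)
    abs_err = np.abs(error)
    mae, sdae = np.mean(abs_err), np.std(abs_err, ddof=1)
    # guard the MAPE denominator against zero-valued targets
    nonzero = frame[target_column] != 0
    abs_perc_err = 100 * abs_err[nonzero] / np.abs(frame.loc[nonzero, target_column])
    mape, sdape = np.mean(abs_perc_err), np.std(abs_perc_err, ddof=1)
    return {'ME': (me, sde), 'MAE': (mae, sdae), 'MAPE': (mape, sdape)}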
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [name for name in num_feature_names
                                 if is_numeric_dtype(reference_data[name])]
        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            cat_feature_names = [name for name in cat_feature_names
                                 if is_numeric_dtype(reference_data[name])]
    else:
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

        utility_columns = [date_column, id_column, target_column, prediction_column]

        num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) -
                                 set(utility_columns))
        cat_feature_names = list(set(reference_data.select_dtypes([object]).columns) -
                                 set(utility_columns))

    if production_data is not None:
        if target_column is not None and prediction_column is not None:
            production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
            production_data.dropna(axis=0, how='any', inplace=True)

            # QQ plot: prediction error quantiles against a fitted normal
            error = production_data[prediction_column] - production_data[target_column]

            qq_lines = probplot(error, dist="norm", plot=None)
            theoretical_q_x = np.linspace(qq_lines[0][0][0], qq_lines[0][0][-1], 100)

            sample_quantile_trace = go.Scatter(
                x=qq_lines[0][0],
                y=qq_lines[0][1],
                mode='markers',
                name='Dataset Quantiles',
                marker=dict(size=6, color=red))

            theoretical_quantile_trace = go.Scatter(
                x=theoretical_q_x,
                y=qq_lines[1][0] * theoretical_q_x + qq_lines[1][1],
                mode='lines',
                name='Theoretical Quantiles',
                marker=dict(size=6, color=grey))

            error_norm = go.Figure()
            error_norm.add_trace(sample_quantile_trace)
            error_norm.add_trace(theoretical_quantile_trace)
            error_norm.update_layout(
                xaxis_title="Theoretical Quantiles",
                yaxis_title="Dataset Quantiles",
                legend=dict(orientation="h", yanchor="bottom", y=1.02,
                            xanchor="right", x=1))

            error_norm_json = json.loads(error_norm.to_json())

            self.wi = BaseWidgetInfo(
                title=self.title,
                type="big_graph",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=1,
                params={
                    "data": error_norm_json['data'],
                    "layout": error_norm_json['layout']
                },
                additionalGraphs=[],
            )
        else:
            self.wi = None
    else:
        self.wi = None
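# A minimal sketch of the QQ-plot inputs used above, assuming only numpy
# and scipy; the synthetic residuals are hypothetical. probplot returns
# the ordered (theoretical, sample) quantile pairs plus a least-squares
# fit (slope, intercept, r) that defines the reference line.
import numpy as np
from scipy.stats import probplot

rng = np.random.default_rng(0)
error = rng.normal(loc=0.0, scale=2.0, size=200)

(theoretical_q, sample_q), (slope, intercept, r) = probplot(error, dist="norm")
print(f"reference line: sample ~ {slope:.2f} * theoretical + {intercept:.2f} (r={r:.3f})")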
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [name for name in num_feature_names
                                 if is_numeric_dtype(reference_data[name])]
        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            cat_feature_names = [name for name in cat_feature_names
                                 if is_numeric_dtype(reference_data[name])]
    else:
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

        utility_columns = [date_column, id_column, target_column, prediction_column]

        num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) -
                                 set(utility_columns))
        cat_feature_names = list(set(reference_data.select_dtypes([object]).columns) -
                                 set(utility_columns))

    if target_column is not None and prediction_column is not None:
        reference_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        reference_data.dropna(axis=0, how='any', inplace=True)

        if len(prediction_column) <= 2:
            # binary case: a single precision-recall curve
            binarizer = preprocessing.LabelBinarizer()
            binarizer.fit(reference_data[target_column])
            binarized_target = pd.DataFrame(binarizer.transform(reference_data[target_column]))
            binarized_target.columns = ['target']

            # note: this assumes prediction_column[0] holds the score of the
            # positive class as encoded by LabelBinarizer (classes are
            # binarized in sorted order)
            p, r, thrs = metrics.precision_recall_curve(
                binarized_target['target'],
                reference_data[prediction_column[0]])

            fig = go.Figure()
            fig.add_trace(
                go.Scatter(x=r, y=p, mode='lines', name='PR',
                           marker=dict(size=6, color=red)))
            fig.update_layout(xaxis_title="Recall",
                              yaxis_title="Precision",
                              showlegend=True)

            fig_json = json.loads(fig.to_json())

            self.wi = BaseWidgetInfo(
                title=self.title,
                type="big_graph",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=1 if production_data is not None else 2,
                params={
                    "data": fig_json['data'],
                    "layout": fig_json['layout']
                },
                additionalGraphs=[],
            )
        else:
            # multiclass case: one tab with a PR curve per class
            binarizer = preprocessing.LabelBinarizer()
            binarizer.fit(reference_data[target_column])
            binarized_target = pd.DataFrame(binarizer.transform(reference_data[target_column]))
            binarized_target.columns = prediction_column

            graphs = []

            for label in prediction_column:
                p, r, thrs = metrics.precision_recall_curve(
                    binarized_target[label], reference_data[label])

                fig = go.Figure()
                fig.add_trace(
                    go.Scatter(x=r, y=p, mode='lines', name='PR',
                               marker=dict(size=6, color=red)))
                fig.update_layout(xaxis_title="Recall",
                                  yaxis_title="Precision",
                                  showlegend=True)

                fig_json = json.loads(fig.to_json())

                graphs.append({
                    "id": "tab_" + str(label),
                    "title": str(label),
                    "graph": {
                        "data": fig_json["data"],
                        "layout": fig_json["layout"],
                    }
                })

            self.wi = BaseWidgetInfo(
                title=self.title,
                type="tabbed_graph",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=1 if production_data is not None else 2,
                params={"graphs": graphs},
                additionalGraphs=[],
            )
    else:
        self.wi = None
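# A standalone sketch of the binary precision-recall branch above; the
# labels and scores are hypothetical toy data. LabelBinarizer assigns 1 to
# the lexicographically larger class, so the score column must correspond
# to that class.
import numpy as np
from sklearn import preprocessing, metrics

y_true = np.array(['cat', 'dog', 'dog', 'cat', 'dog', 'cat'])
scores = np.array([0.2, 0.8, 0.7, 0.4, 0.9, 0.1])  # P(class == 'dog')

binarized = preprocessing.LabelBinarizer().fit_transform(y_true).ravel()

p, r, thrs = metrics.precision_recall_curve(binarized, scores)
for precision, recall in zip(p, r):
    print(f"recall={recall:.2f}  precision={precision:.2f}")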
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        target_names = column_mapping.get('target_names')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [name for name in num_feature_names
                                 if is_numeric_dtype(reference_data[name])]
        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            cat_feature_names = [name for name in cat_feature_names
                                 if is_numeric_dtype(reference_data[name])]
    else:
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

        utility_columns = [date_column, id_column, target_column, prediction_column]

        num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) -
                                 set(utility_columns))
        cat_feature_names = list(set(reference_data.select_dtypes([object]).columns) -
                                 set(utility_columns))

        target_names = None

    if target_column is not None and prediction_column is not None:
        reference_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        reference_data.dropna(axis=0, how='any', inplace=True)

        # recover predicted labels from the per-class probability columns
        array_prediction = reference_data[prediction_column].to_numpy()
        prediction_ids = np.argmax(array_prediction, axis=-1)
        prediction_labels = [prediction_column[x] for x in prediction_ids]

        # plot confusion matrix
        conf_matrix = metrics.confusion_matrix(reference_data[target_column],
                                               prediction_labels)

        z = conf_matrix.astype(int)
        labels = sorted(set(reference_data[target_column]))

        # convert counts to strings for the heatmap annotations
        z_text = [[str(y) for y in x] for x in z]

        fig = ff.create_annotated_heatmap(z, x=labels, y=labels,
                                          annotation_text=z_text,
                                          colorscale='bluered',
                                          showscale=True)
        fig.update_layout(xaxis_title="Predicted value",
                          yaxis_title="Actual value")

        conf_matrix_json = json.loads(fig.to_json())

        self.wi = BaseWidgetInfo(
            title=self.title,
            type="big_graph",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=1 if production_data is not None else 2,
            params={
                "data": conf_matrix_json['data'],
                "layout": conf_matrix_json['layout']
            },
            additionalGraphs=[],
        )
    else:
        self.wi = None
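# A minimal reproduction of the annotated confusion-matrix heatmap above,
# using hypothetical labels; it assumes every class appears in y_true so
# the sorted label set matches the matrix dimensions.
from sklearn import metrics
import plotly.figure_factory as ff

y_true = ['a', 'b', 'a', 'c', 'b', 'b']
y_pred = ['a', 'b', 'c', 'c', 'b', 'a']

z = metrics.confusion_matrix(y_true, y_pred).astype(int)
labels = sorted(set(y_true))
z_text = [[str(v) for v in row] for row in z]

fig = ff.create_annotated_heatmap(z, x=labels, y=labels, annotation_text=z_text,
                                  colorscale='bluered', showscale=True)
fig.update_layout(xaxis_title="Predicted value", yaxis_title="Actual value")
fig.show()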