import numpy as np

# NOTE: DATA_TYPES and value_isnan are part of mindsdb itself; the import paths below are
# assumed from the library's layout and may need adjusting.
from mindsdb.libs.constants.mindsdb import DATA_TYPES
from mindsdb.libs.helpers.general_helpers import value_isnan


def get_important_missing_cols(lmd, prediction_row, pred_col):
    # Without usable column importances, fall back to treating every non-target,
    # non-internal column as important
    if lmd['column_importances'] is None or len(lmd['column_importances']) < 2:
        important_cols = [col for col in lmd['columns'] if col not in lmd['predict_columns'] and not col.startswith('model_')]
    else:
        # Keep the columns whose importance is in the top 30%
        top_30_val = np.percentile(list(lmd['column_importances'].values()), 70)
        important_cols = [col for col in lmd['column_importances'] if lmd['column_importances'][col] >= top_30_val]

    important_missing_cols = []
    for col in important_cols:
        if col not in prediction_row or prediction_row[col] is None or str(prediction_row[col]) == '' or value_isnan(prediction_row[col]):
            important_missing_cols.append(col)

    return important_missing_cols
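# Illustrative sketch (not part of the original module): the `lmd` layout and values below
# are made up, only to show the shape of the metadata this helper expects and how the
# importance threshold selects columns.
#
#   lmd = {
#       'columns': ['sqft', 'location', 'rental_price'],
#       'predict_columns': ['rental_price'],
#       'column_importances': {'sqft': 0.9, 'location': 0.4},
#   }
#   get_important_missing_cols(lmd, {'sqft': None, 'location': 'berlin'}, 'rental_price')
#   # -> ['sqft'], since the most important input column is missing from the row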
def explain_prediction(lmd, prediction_row, confidence, pred_col):
    '''
    Given a row containing a prediction, try to use the information in it, plus the metadata
    generated by the ModelAnalyzer and StatsGenerator, in order to explain "why" the
    prediction took said value.

    :param prediction_row: The row that was predicted by the model backend and processed by mindsdb

    :return: A, hopefully human readable, string containing the explanation
    '''
    # Split the input columns into "important" and "useless" based on column importances,
    # falling back to "everything but the target is important" when importances are missing
    if lmd['column_importances'] is None or len(lmd['column_importances']) < 2:
        important_cols = [col for col in lmd['columns'] if col not in lmd['predict_columns']]
        useless_cols = []
    else:
        top_20_val = np.percentile(list(lmd['column_importances'].values()), 80)
        bottom_20_val = np.percentile(list(lmd['column_importances'].values()), 20)
        important_cols = [col for col in lmd['column_importances'] if lmd['column_importances'][col] >= top_20_val]
        useless_cols = [col for col in lmd['column_importances'] if lmd['column_importances'][col] <= bottom_20_val]

    explain_predictions = {}

    predicted_val = prediction_row[pred_col]
    col_stats = lmd['column_stats'][pred_col]
    col_type = col_stats['data_type']

    if 'histogram' in col_stats and col_stats['histogram'] is not None:
        histogram_x = None
        histogram_keys = col_stats['histogram']['x']
        if col_type == DATA_TYPES.NUMERIC:
            # Find the histogram bucket the predicted value falls into
            for i in range(len(histogram_keys)):
                if i == 0 and histogram_keys[i] >= predicted_val:
                    histogram_x = histogram_keys[i]
                elif i == len(histogram_keys) - 1 and histogram_keys[i] <= predicted_val:
                    histogram_x = histogram_keys[i]
                elif histogram_keys[i] <= predicted_val < histogram_keys[i + 1]:
                    histogram_x = histogram_keys[i]
        else:
            histogram_x = predicted_val

        # How common values similar to the prediction are in the training data
        bucket_occurrences = col_stats['histogram']['y'][col_stats['histogram']['x'].index(histogram_x)]
        total_occurrences = sum(col_stats['histogram']['y'])
        bucket_percentage = round(100 * bucket_occurrences / total_occurrences, 2)

        column_confidence = confidence * 100

        # Human-readable label for the confidence level (not used in the explanation text below)
        confidence_str = 'very confident'
        if confidence < 0.80:
            confidence_str = 'confident'
        if confidence < 0.60:
            confidence_str = 'somewhat confident'
        if confidence < 0.40:
            confidence_str = 'not very confident'
        if confidence < 0.20:
            confidence_str = 'not confident'

        if bucket_percentage < 2:
            column_explanation = f'A similar value for the predicted column {pred_col} occurs rarely in your dataset'
            if column_confidence >= 70:
                column_explanation += ', in spite of this, due to the quality of the input data and the model, we are very confident this prediction is correct.'
            if 30 < column_confidence < 70:
                column_explanation += ', it is partially because of this reason that we are only somewhat confident this prediction is correct.'
            if column_confidence <= 30:
                column_explanation += ', it is partially because of this reason that we aren\'t confident this prediction is correct.'

        elif bucket_percentage < 12.5:
            column_explanation = f'A similar value for the predicted column {pred_col} occurs a moderate amount of times in your dataset'
            if column_confidence >= 70:
                column_explanation += ', we are very confident this prediction is correct.'
            if 30 < column_confidence < 70:
                column_explanation += ', we are only somewhat confident this prediction is correct.'
            if column_confidence <= 30:
                column_explanation += ', your input data might be of sub-par quality, since we aren\'t confident this prediction is correct.'
        else:
            column_explanation = f'A similar value for the predicted column {pred_col} occurs very often in your dataset'
            if column_confidence >= 70:
                column_explanation += ', it\'s partially because of this plethora of examples that we can be very confident this prediction is correct.'
            if 30 < column_confidence < 70:
                column_explanation += ', still, we can only be somewhat confident this prediction is correct.'
            if column_confidence <= 30:
                column_explanation += ', in spite of this, possibly due to poor input data quality, we aren\'t confident this prediction is correct.'

        explain_predictions[pred_col] = column_explanation
    else:
        explain_predictions[pred_col] = f'Column {pred_col} is of type {col_type} and thus it\'s impossible for us to make statistical inferences about it.'

    # Per-column notes about how the inputs likely affected the prediction
    explain_inputs = {}
    for icol in important_cols:
        if icol not in prediction_row or prediction_row[icol] is None or value_isnan(prediction_row[icol]):
            explain_inputs[icol] = f'The column {icol} is very important for this model to predict correctly. Since it\'s missing, it\'s quite likely that the quality of this prediction is lacking because of this.'
        else:
            explain_inputs[icol] = f'The value of the column {icol} played a large role in generating this prediction.'

    for icol in useless_cols:
        if icol not in prediction_row or prediction_row[icol] is None or value_isnan(prediction_row[icol]):
            explain_inputs[icol] = f'The fact that {icol} is missing is probably not very relevant for this prediction.'
        else:
            explain_inputs[icol] = f'The column {icol} is probably not very relevant for this prediction.'

    join_str = '\n\n*'
    explanation = join_str + join_str.join(explain_predictions.values())
    explanation += join_str + join_str.join(explain_inputs.values())

    return explanation
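if __name__ == '__main__':
    # Illustrative sketch only (not part of the original module): a hand-written stand-in
    # for the metadata that mindsdb's StatsGenerator / ModelAnalyzer would normally produce.
    # All column names and values below are made up for demonstration.
    demo_lmd = {
        'columns': ['sqft', 'rental_price'],
        'predict_columns': ['rental_price'],
        'column_importances': None,
        'column_stats': {
            'rental_price': {
                'data_type': DATA_TYPES.NUMERIC,
                'histogram': {'x': [1000, 2000, 3000], 'y': [5, 90, 5]},
            },
        },
    }
    demo_row = {'sqft': 700, 'rental_price': 2100}

    # No important columns are missing from the row, so this should print an empty list
    print(get_important_missing_cols(demo_lmd, demo_row, 'rental_price'))
    # The predicted value falls into the most populated histogram bucket, so the explanation
    # should note that similar values occur very often in the dataset
    print(explain_prediction(demo_lmd, demo_row, confidence=0.85, pred_col='rental_price'))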