def register_observation(self, features_existence, real_value, predicted_value):
    """
    # Register an observation in the validator's internal buffers

    :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
    :param real_value: The real value/label for this prediction
    :param predicted_value: The predicted value/label
    """
    # Guard the numeric cast: a model can emit a value that doesn't parse as
    # a float. Previously only `real_value` was guarded, so a bad prediction
    # crashed the whole registration.
    try:
        predicted_value = predicted_value if self.data_type != DATA_TYPES.NUMERIC else float(predicted_value)
    except (TypeError, ValueError):
        predicted_value = None

    try:
        # Accept comma decimal separators (e.g. "3,14") for numeric columns.
        real_value = real_value if self.data_type != DATA_TYPES.NUMERIC else float(str(real_value).replace(',', '.'))
    except (TypeError, ValueError):
        real_value = None

    if self.buckets is not None:
        # Bucketized mode: the input is a one-hot encoding of the predicted
        # bucket followed by the feature-existence vector; the target is the
        # real value's bucket.
        predicted_value_b = get_value_bucket(predicted_value, self.buckets, self.col_stats)
        real_value_b = get_value_bucket(real_value, self.buckets, self.col_stats)

        X = [False] * (len(self.buckets) + 1)
        X[predicted_value_b] = True
        X = X + features_existence

        self.X_buff.append(X)
        self.Y_buff.append(real_value_b)
    else:
        # Non-bucketized mode: train on whether the prediction was exact.
        predicted_value_b = predicted_value
        real_value_b = real_value

        self.X_buff.append(features_existence)
        self.Y_buff.append(real_value_b == predicted_value_b)
def register_observation(self, features_existence, real_value, predicted_value, hmd=None):
    """
    # Register an observation in the validator's internal buffers

    :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
    :param real_value: The real value/label for this prediction
    :param predicted_value: The predicted value/label
    :param hmd: Optional heavy metadata, forwarded to `get_value_bucket`
    """
    try:
        predicted_value = predicted_value if self.data_type != DATA_TYPES.NUMERIC else float(predicted_value)
    except (TypeError, ValueError):
        predicted_value = None

    try:
        # Accept comma decimal separators (e.g. "3,14") for numeric columns.
        real_value = real_value if self.data_type != DATA_TYPES.NUMERIC else float(str(real_value).replace(',', '.'))
    except (TypeError, ValueError):
        real_value = None

    if self.buckets is not None:
        predicted_value_b = get_value_bucket(predicted_value, self.buckets, self.col_stats, hmd)
        real_value_b = get_value_bucket(real_value, self.buckets, self.col_stats, hmd)

        X = [False] * (len(self.buckets) + 1)
        X[predicted_value_b] = True
        X = X + features_existence

        self._X_buff.append(X)
        self._Y_buff.append(real_value_b)
        # Append the bucket instead of aliasing the whole _Y_buff list
        # (the old `self._real_buckets_buff = self._Y_buff` made both
        # attributes the same object); the resulting contents are identical.
        self._real_buckets_buff.append(real_value_b)
        self._predicted_buckets_buff.append(predicted_value_b)

        # If no input feature was missing, record whether we hit the right
        # bucket so per-bucket accuracy can be computed later.
        # (`not x` covers both 0 and False; the old `x is 0` identity check
        # relied on CPython's small-int caching.)
        nr_missing_features = len([x for x in features_existence if not x])
        if nr_missing_features == 0:
            if real_value_b not in self.bucket_accuracy:
                self.bucket_accuracy[real_value_b] = []
            self.bucket_accuracy[real_value_b].append(int(real_value_b == predicted_value_b))
    else:
        predicted_value_b = predicted_value
        real_value_b = real_value

        self._X_buff.append(features_existence)
        self._Y_buff.append(real_value_b == predicted_value_b)
        self._real_buckets_buff.append(real_value_b)
        self._predicted_buckets_buff.append(predicted_value_b)
def evaluate_prediction_accuracy(self, features_existence, predicted_value):
    """
    # Evaluate the probability of a prediction being accurate

    :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
    :param predicted_value: The predicted value/label
    :return: The probability (from 0 to 1) of our prediction being accurate (within the same histogram bucket as the real value)
    """
    if self.buckets is not None:
        # One-hot encode the predicted bucket, then append the
        # feature-existence vector (same layout used when fitting).
        predicted_value_b = get_value_bucket(predicted_value, self.buckets, self.col_stats)
        X = [False] * (len(self.buckets) + 1)
        X[predicted_value_b] = True
        X = [X + features_existence]
    else:
        X = [features_existence]

    # Silence divide-by-zero warnings from predict_proba, then restore the
    # caller's numpy error settings.
    log_types = np.seterr()
    np.seterr(divide='ignore')
    distribution = self._probabilistic_model.predict_proba(np.array(X))
    np.seterr(divide=log_types['divide'])

    if self.buckets is not None:
        return ProbabilityEvaluation(self.buckets, distribution[0].tolist(), predicted_value).most_likely_probability
    else:
        # Binary case: probability of the "prediction correct" class.
        return distribution[0][1]
def evaluate_prediction_accuracy(self, features_existence, predicted_value):
    """
    # Evaluate the probability of a prediction being accurate

    :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
    :param predicted_value: The predicted value/label
    :return: The probability (from 0 to 1) of our prediction being accurate (within the same histogram bucket as the real value)
    """
    if self.buckets is not None:
        # One-hot encode the predicted bucket, then append the
        # feature-existence vector (same layout used when fitting).
        predicted_value_b = get_value_bucket(predicted_value, self.buckets, self.col_stats)
        X = [0] * (len(self.buckets) + 1)
        X[predicted_value_b] = 1
        X = [X + features_existence]
    else:
        X = [features_existence]

    # `classes_` holds the labels the model was fitted on; if it never saw a
    # `True` label we can't report a probability of being correct.
    # `list.index` raises ValueError on a missing element — catch only that
    # instead of the previous bare `except:`.
    try:
        true_index = self._probabilistic_model.classes_.tolist().index(True)
    except ValueError:
        print('Only got classes: ', str(self._probabilistic_model.classes_.tolist()), ' in the probabilistic model\'s Y vector !')
        true_index = None

    if true_index is None:
        probability_true_prediction = 0
    else:
        probability_true_prediction = self._probabilistic_model.predict_proba(np.array(X))[0][true_index]

    return probability_true_prediction
def compute_outlier_buckets(outlier_values, hist_x, hist_y, percentage_buckets, col_stats):
    """
    Identify histogram buckets that are dominated by outlier values.

    A bucket is flagged when either more than half of its values are outliers,
    or it sits in the top 5% of buckets by outlier count.

    :param outlier_values: Values that were flagged as outliers
    :param hist_x: Histogram bucket labels
    :param hist_y: Histogram bucket counts (parallel to `hist_x`)
    :param percentage_buckets: Bucketization boundaries for `get_value_bucket`
    :param col_stats: Column statistics passed through to `get_value_bucket`
    :return: List of bucket labels considered outlier buckets
    """
    outlier_buckets = []

    # Map each bucket to the list of outliers that fall into it.
    bucket_outliers = defaultdict(list)
    for value in outlier_values:
        vb_index = get_value_bucket(value, percentage_buckets, col_stats)
        bucket_outliers[percentage_buckets[vb_index]].append(value)

    # Buckets sorted by number of outliers, ascending. (A defaultdict only
    # holds buckets that received at least one outlier; the filter is kept
    # as a safety net.)
    buckets_with_outliers = sorted(
        filter(lambda kv: len(kv[1]) > 0, bucket_outliers.items()),
        key=lambda kv: len(kv[1]))

    # NOTE: the loop variable is renamed so it no longer shadows the
    # `outlier_values` parameter.
    for i, (bucket, bucket_outlier_values) in enumerate(buckets_with_outliers):
        bucket_index = hist_x.index(bucket)
        bucket_values_num = hist_y[bucket_index]
        bucket_outliers_num = len(bucket_outlier_values)

        # Is the bucket in the 95th percentile by number of outliers?
        percentile_outlier = ((i + 1) / len(buckets_with_outliers)) >= 0.95

        # Are more than half of the values in the bucket outliers?
        predominantly_outlier = False
        if bucket_values_num:
            predominantly_outlier = (bucket_outliers_num / bucket_values_num) > 0.5

        if predominantly_outlier or percentile_outlier:
            outlier_buckets.append(bucket)
    return outlier_buckets
def evaluate_prediction_accuracy(self, features_existence, predicted_value, always_use_model_prediction):
    """
    # Evaluate how likely a prediction is to be accurate

    :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
    :param predicted_value: The predicted value/label
    :param always_use_model_prediction: Forwarded to `ProbabilityEvaluation`
    :return: A `ProbabilityEvaluation` built from the (possibly sharpened) class distribution
    """
    if self.buckets is not None:
        # One-hot encode the predicted bucket, then append the
        # feature-existence vector (same layout used when fitting).
        predicted_value_b = get_value_bucket(predicted_value, self.buckets, self.col_stats)
        X = [False] * (len(self.buckets) + 1)
        X[predicted_value_b] = True
        X = [X + features_existence]
    else:
        X = [features_existence]

    distribution = self._probabilistic_model.predict_proba(np.array(X))[0]
    distribution = distribution.tolist()

    if len([x for x in distribution if x > 0.01]) > 4:
        # @HACK: when probability mass is spread over many buckets, sharpen
        # the distribution: zero everything below (mean - std), renormalize,
        # subtract the smallest surviving mass, renormalize again.
        mean = np.mean(distribution)
        std = np.std(distribution)

        distribution = [x if x > (mean - std) else 0 for x in distribution]

        sum_dist = sum(distribution)
        # Avoid division by zero in certain edge cases
        sum_dist = 0.00001 if sum_dist == 0 else sum_dist
        distribution = [x / sum_dist for x in distribution]

        min_val = min([x for x in distribution if x > 0.001])
        distribution = [x - min_val if x > min_val else 0 for x in distribution]

        sum_dist = sum(distribution)
        # Avoid division by zero in certain edge cases
        sum_dist = 0.00001 if sum_dist == 0 else sum_dist
        distribution = [x / sum_dist for x in distribution]
        # @HACK ends (dead `else: pass` branch removed)

    return ProbabilityEvaluation(self.buckets, distribution, predicted_value, always_use_model_prediction)
def get_column_importance(self, model, output_columns, input_columns, full_dataset, stats):
    """
    Estimate how important each input column is for each output column.

    For every ignorable input column the model is re-run twice on the
    validation set: once with only that column present and once with only
    that column missing; the two normalized accuracy deltas are averaged
    into an importance score. Output-column histograms are also collected
    to plot force vectors, and per-bucket stats are generated.

    :param model: The predictor; must expose `predict(what, ignore_columns)`
    :param output_columns: Columns being predicted
    :param input_columns: Columns used as model inputs
    :param full_dataset: The validation dataframe
    :param stats: Per-column statistics dict
    :return: (column_importance_dict, buckets_stats,
              columnless_prediction_distribution, all_columns_prediction_distribution)
    """
    columnless_prediction_distribution = {}
    all_columns_prediction_distribution = {}

    self.normal_predictions = model.predict('validate')
    normal_accuracy = evaluate_accuracy(self.normal_predictions, full_dataset, stats, output_columns)
    column_importance_dict = {}
    buckets_stats = {}

    # Histogram for when all columns are present, in order to plot the force vectors
    for output_column in output_columns:
        stats_generator = StatsGenerator(session=None, transaction=self.transaction)
        input_data = TransactionData()
        input_data.data_frame = self.normal_predictions[[output_column]]
        input_data.columns = [output_column]
        # @TODO: Running stats generator just to get the histogram is very inefficient, change this
        validation_set_output_stats = stats_generator.run(input_data=input_data, modify_light_metadata=False)

        if validation_set_output_stats is None:
            pass
        elif 'histogram' in validation_set_output_stats[output_column]:
            all_columns_prediction_distribution[output_column] = validation_set_output_stats[output_column]['histogram']

    # File-path columns can't be meaningfully dropped/isolated.
    ignorable_input_columns = []
    for input_column in input_columns:
        if stats[input_column]['data_type'] != DATA_TYPES.FILE_PATH:
            ignorable_input_columns.append(input_column)

    for input_column in ignorable_input_columns:
        # See what happens with the accuracy of the outputs if only this column is present
        ignore_columns = [col for col in ignorable_input_columns if col != input_column]
        col_only_predictions = model.predict('validate', ignore_columns)
        col_only_accuracy = evaluate_accuracy(col_only_predictions, full_dataset, stats, output_columns)
        col_only_normalized_accuracy = col_only_accuracy / normal_accuracy

        # See what happens with the accuracy if all columns but this one are present
        ignore_columns = [input_column]
        col_missing_predictions = model.predict('validate', ignore_columns)
        col_missing_accuracy = evaluate_accuracy(col_missing_predictions, full_dataset, stats, output_columns)
        col_missing_reverse_accuracy = (normal_accuracy - col_missing_accuracy) / normal_accuracy

        column_importance = (col_only_normalized_accuracy + col_missing_reverse_accuracy) / 2
        column_importance_dict[input_column] = column_importance

        # Histogram for when the column is missing, in order to plot the force vectors
        for output_column in output_columns:
            if output_column not in columnless_prediction_distribution:
                columnless_prediction_distribution[output_column] = {}
            stats_generator = StatsGenerator(session=None, transaction=self.transaction)
            input_data = TransactionData()
            input_data.data_frame = col_missing_predictions[[output_column]]
            input_data.columns = [output_column]
            # @TODO: Running stats generator just to get the histogram is very inefficient, change this
            col_missing_output_stats = stats_generator.run(input_data=input_data, modify_light_metadata=False)

            if col_missing_output_stats is None:
                pass
            elif 'histogram' in col_missing_output_stats[output_column]:
                columnless_prediction_distribution[output_column][input_column] = col_missing_output_stats[output_column]['histogram']

    # @TODO: should we go back to generating this information based on the buckets
    # of the input columns, or keep doing stats generation for the input columns
    # based on the indexes of the output column's buckets?
    for output_column in output_columns:
        buckets_stats[output_column] = {}

        # Group the validation rows by the output value's bucket.
        bucket_indexes = {}
        for index, row in full_dataset.iterrows():
            value = row[output_column]
            if 'percentage_buckets' in stats[output_column]:
                percentage_buckets = stats[output_column]['percentage_buckets']
            else:
                percentage_buckets = None

            value_bucket = get_value_bucket(value, percentage_buckets, stats[output_column])
            if value_bucket not in bucket_indexes:
                bucket_indexes[value_bucket] = []
            bucket_indexes[value_bucket].append(index)

        for bucket in bucket_indexes:
            buckets_stats[output_column][bucket] = {}
            input_data = TransactionData()
            input_data.data_frame = full_dataset.loc[bucket_indexes[bucket]]
            input_data.columns = input_data.data_frame.columns

            stats_generator = StatsGenerator(session=None, transaction=self.transaction)
            try:
                col_buckets_stats = stats_generator.run(input_data=input_data, modify_light_metadata=False)
                buckets_stats[output_column][bucket].update(col_buckets_stats)
            except Exception:
                print('Could not generate bucket stats for sub-bucket: {}'.format(bucket))

    return column_importance_dict, buckets_stats, columnless_prediction_distribution, all_columns_prediction_distribution
def get_column_importance(self, model, output_columns, input_columns, full_dataset, stats):
    """
    Estimate how important each input column is for each output column.

    For every ignorable input column the model is re-run twice on the
    validation set: once with only that column present and once with only
    that column missing; the two normalized accuracy deltas are averaged
    into an importance score. Output-column histograms are collected to
    plot force vectors, and for columns with extreme importance, stats are
    generated per value bucket.

    :param model: The predictor; must expose `predict(what, ignore_columns)`
    :param output_columns: Columns being predicted
    :param input_columns: Columns used as model inputs
    :param full_dataset: The validation dataframe
    :param stats: Per-column statistics dict
    :return: (column_importance_dict, buckets_stats,
              columnless_prediction_distribution, all_columns_prediction_distribution)
    """
    columnless_prediction_distribution = {}
    all_columns_prediction_distribution = {}

    self.normal_predictions = model.predict('validate')
    normal_accuracy = evaluate_accuracy(self.normal_predictions, full_dataset, stats, output_columns)
    column_importance_dict = {}
    buckets_stats = {}

    # Histogram for when all columns are present, in order to plot the force vectors
    for output_column in output_columns:
        stats_generator = StatsGenerator(session=None, transaction=self.transaction)
        input_data = TransactionData()
        input_data.data_array = list(map(lambda x: [x], list(self.normal_predictions[output_column])))
        input_data.columns = [output_column]
        validation_set_output_stats = stats_generator.run(input_data=input_data, modify_light_metadata=False)

        if validation_set_output_stats is None:
            pass
        elif 'histogram' in validation_set_output_stats[output_column]:
            all_columns_prediction_distribution[output_column] = validation_set_output_stats[output_column]['histogram']

    # File-path columns can't be meaningfully dropped/isolated.
    ignorable_input_columns = []
    for input_column in input_columns:
        if stats[input_column]['data_type'] != DATA_TYPES.FILE_PATH:
            ignorable_input_columns.append(input_column)

    for input_column in ignorable_input_columns:
        # See what happens with the accuracy of the outputs if only this column is present
        ignore_columns = [col for col in ignorable_input_columns if col != input_column]
        col_only_predictions = model.predict('validate', ignore_columns)
        col_only_accuracy = evaluate_accuracy(col_only_predictions, full_dataset, stats, output_columns)
        col_only_normalized_accuracy = col_only_accuracy / normal_accuracy

        # See what happens with the accuracy if all columns but this one are present
        ignore_columns = [input_column]
        col_missing_predictions = model.predict('validate', ignore_columns)
        col_missing_accuracy = evaluate_accuracy(col_missing_predictions, full_dataset, stats, output_columns)
        col_missing_reverse_accuracy = (normal_accuracy - col_missing_accuracy) / normal_accuracy

        column_importance = (col_only_normalized_accuracy + col_missing_reverse_accuracy) / 2
        column_importance_dict[input_column] = column_importance

        # Histogram for when the column is missing, in order to plot the force vectors
        for output_column in output_columns:
            if output_column not in columnless_prediction_distribution:
                columnless_prediction_distribution[output_column] = {}
            stats_generator = StatsGenerator(session=None, transaction=self.transaction)
            input_data = TransactionData()
            input_data.data_array = list(map(lambda x: [x], list(col_missing_predictions[output_column])))
            input_data.columns = [output_column]
            col_missing_output_stats = stats_generator.run(input_data=input_data, modify_light_metadata=False)

            if col_missing_output_stats is None:
                pass
            elif 'histogram' in col_missing_output_stats[output_column]:
                columnless_prediction_distribution[output_column][input_column] = col_missing_output_stats[output_column]['histogram']

        # If this column is either very important or not important at all,
        # compute stats for each of its value buckets (in the validation data)
        if column_importance > 0.8 or column_importance < 0.2:
            # Group the column's values by bucket.
            split_data = {}
            for value in full_dataset[input_column]:
                if 'percentage_buckets' in stats[input_column]:
                    bucket = stats[input_column]['percentage_buckets']
                else:
                    bucket = None

                vb = get_value_bucket(value, bucket, stats[input_column])
                if f'{input_column}_bucket_{vb}' not in split_data:
                    split_data[f'{input_column}_bucket_{vb}'] = []
                split_data[f'{input_column}_bucket_{vb}'].append(value)

            # If a sub-bucket has fewer than 6 values it's not relevant.
            # BUG FIX: the column list used to be built inside the row loop,
            # appending every qualified name once per row (duplicated headers
            # mismatched with the row width). Build it once up front and emit
            # row values only for the qualified sub-buckets.
            columns = [k for k in split_data.keys() if len(split_data[k]) > 6]

            max_length = max(list(map(len, split_data.values())))
            row_wise_data = []
            for i in range(max_length):
                row_wise_data.append([])
                for k in columns:
                    if len(split_data[k]) > i:
                        row_wise_data[-1].append(split_data[k][i])
                    else:
                        row_wise_data[-1].append(None)

            input_data = TransactionData()
            input_data.data_array = row_wise_data
            input_data.columns = columns
            stats_generator = StatsGenerator(session=None, transaction=self.transaction)
            col_buckets_stats = stats_generator.run(input_data=input_data, modify_light_metadata=False)

            buckets_stats.update(col_buckets_stats)

    return column_importance_dict, buckets_stats, columnless_prediction_distribution, all_columns_prediction_distribution
def fit(self, real_df, predictions_arr, missing_col_arr, hmd=None):
    """
    # Fit the probabilistic validator

    :param real_df: A dataframe with the real inputs and outputs for every row
    :param predictions_arr: An array containing arrays of predictions, one containing the "normal" predictions and the rest containing predictions with various missing column
    :param missing_col_arr: The missing columns for each of the prediction arrays, same order as the arrays in `predictions_arr`, starting from the second element of `predictions_arr` (The first is assumed to have no missing columns)
    :param hmd: Optional heavy metadata, forwarded to `get_value_bucket`
    """
    self.real_values_bucketized = []
    self.normal_predictions_bucketized = []

    column_indexes = {}
    for i, col in enumerate(self.input_columns):
        column_indexes[col] = i

    # For every row, a 0/1 vector marking which input columns are present.
    real_present_inputs_arr = []
    for _, row in real_df.iterrows():
        present_inputs = [1] * len(self.input_columns)
        for i, col in enumerate(self.input_columns):
            if str(row[col]) in ('None', 'nan', '', 'Nan', 'NAN', 'NaN'):
                present_inputs[i] = 0
        real_present_inputs_arr.append(present_inputs)

    X = []
    Y = []
    for n in range(len(predictions_arr)):
        for m in range(len(real_df)):
            row = real_df.iloc[m]

            predicted_value = predictions_arr[n][self.col_name][m]
            real_value = row[self.col_name]

            try:
                predicted_value = predicted_value if self.col_stats['data_type'] != DATA_TYPES.NUMERIC else float(predicted_value)
            except (TypeError, ValueError):
                predicted_value = None

            try:
                real_value = real_value if self.col_stats['data_type'] != DATA_TYPES.NUMERIC else float(str(real_value).replace(',', '.'))
            except (TypeError, ValueError):
                real_value = None

            if self.buckets is not None:
                predicted_value_b = get_value_bucket(predicted_value, self.buckets, self.col_stats, hmd)
                real_value_b = get_value_bucket(real_value, self.buckets, self.col_stats, hmd)
                X.append([0] * (len(self.buckets) + 1))
                X[-1][predicted_value_b] = 1
            else:
                predicted_value_b = predicted_value
                # BUG FIX: was `real_value_b = real_value_b` (self-assignment
                # that raised NameError / carried a stale value).
                real_value_b = real_value
                X.append([])

            Y.append(real_value_b == predicted_value_b)

            if n == 0:
                self.real_values_bucketized.append(real_value_b)
                self.normal_predictions_bucketized.append(predicted_value_b)

            # BUG FIX: copy the per-row vector before zeroing missing columns.
            # The original mutated `real_present_inputs_arr[m]` in place, so
            # zeros accumulated across successive values of `n`.
            feature_existence = list(real_present_inputs_arr[m])
            if n > 0:
                for missing_col in missing_col_arr[n - 1]:
                    feature_existence[column_indexes[missing_col]] = 0
            X[-1] += feature_existence

    # Silence divide-by-zero warnings during fitting, then restore the
    # caller's numpy error settings.
    log_types = np.seterr()
    np.seterr(divide='ignore')
    self._probabilistic_model.fit(X, Y)
    np.seterr(divide=log_types['divide'])
def get_column_importance(self, model, output_columns, input_columns, full_dataset, stats):
    """
    Estimate how important each input column is for each output column.

    For every ignorable input column the model is re-run twice on the
    validation set: once with only that column present and once with only
    that column missing; the combined accuracy delta is scaled to an
    importance score in [1, 10]. Output-column histograms are collected to
    plot force vectors, and per-output-bucket stats are generated.

    :param model: The predictor; must expose `predict(what, ignore_columns)`
    :param output_columns: Columns being predicted
    :param input_columns: Columns used as model inputs
    :param full_dataset: The validation dataframe
    :param stats: Per-column statistics dict
    :return: (column_importance_dict, buckets_stats,
              columnless_prediction_distribution, all_columns_prediction_distribution)
    """
    columnless_prediction_distribution = {}
    all_columns_prediction_distribution = {}

    with disable_console_output(True):
        normal_predictions = model.predict('validate')
    normal_accuracy = evaluate_accuracy(normal_predictions, full_dataset, stats, output_columns)
    column_importance_dict = {}
    buckets_stats = {}

    # Histogram for when all columns are present, in order to plot the force vectors
    for output_column in output_columns:
        validation_set_output_column_histogram, _ = StatsGenerator.get_histogram(
            normal_predictions[output_column],
            data_type=stats[output_column]['data_type'],
            data_subtype=stats[output_column]['data_subtype'])

        if validation_set_output_column_histogram is not None:
            all_columns_prediction_distribution[output_column] = validation_set_output_column_histogram

    # Skip file-path columns and the order-by columns (those can't be dropped).
    ignorable_input_columns = []
    for input_column in input_columns:
        if stats[input_column]['data_type'] != DATA_TYPES.FILE_PATH and input_column not in [x[0] for x in self.transaction.lmd['model_order_by']]:
            ignorable_input_columns.append(input_column)

    for input_column in ignorable_input_columns:
        # See what happens with the accuracy of the outputs if only this column is present
        ignore_columns = [col for col in ignorable_input_columns if col != input_column]
        with disable_console_output(True):
            col_only_predictions = model.predict('validate', ignore_columns)
        col_only_accuracy = evaluate_accuracy(col_only_predictions, full_dataset, stats, output_columns)

        # See what happens with the accuracy if all columns but this one are present
        ignore_columns = [input_column]
        with disable_console_output(True):
            col_missing_predictions = model.predict('validate', ignore_columns)
        col_missing_accuracy = evaluate_accuracy(col_missing_predictions, full_dataset, stats, output_columns)

        combined_column_accuracy = ((normal_accuracy - col_missing_accuracy) + col_only_accuracy) / 2
        if combined_column_accuracy < 0:
            combined_column_accuracy = 0

        # Scale the importance to the range [1, 10].
        column_importance = 10 * (1 - (normal_accuracy - combined_column_accuracy) / normal_accuracy)
        if column_importance < 1:
            column_importance = 1
        column_importance_dict[input_column] = column_importance

        # Histogram for when the column is missing, in order to plot the force vectors
        for output_column in output_columns:
            if output_column not in columnless_prediction_distribution:
                columnless_prediction_distribution[output_column] = {}
            col_missing_output_histogram, _ = StatsGenerator.get_histogram(
                col_missing_predictions[output_column],
                data_type=stats[output_column]['data_type'],
                data_subtype=stats[output_column]['data_subtype'])

            if col_missing_output_histogram is not None:
                columnless_prediction_distribution[output_column][input_column] = col_missing_output_histogram

    # @TODO: should we go back to generating this information based on the buckets
    # of the input columns, or keep doing stats generation for the input columns
    # based on the indexes of the output column's buckets?
    for output_column in output_columns:
        buckets_stats[output_column] = {}

        # Group the validation rows by the output value's bucket.
        bucket_indexes = {}
        for index, row in full_dataset.iterrows():
            value = row[output_column]
            if 'percentage_buckets' in stats[output_column]:
                percentage_buckets = stats[output_column]['percentage_buckets']
            else:
                percentage_buckets = None

            value_bucket = get_value_bucket(value, percentage_buckets, stats[output_column], self.transaction.hmd)
            if value_bucket not in bucket_indexes:
                bucket_indexes[value_bucket] = []
            bucket_indexes[value_bucket].append(index)

        for bucket in bucket_indexes:
            buckets_stats[output_column][bucket] = {}
            input_data = TransactionData()
            input_data.data_frame = full_dataset.loc[bucket_indexes[bucket]]
            # BUG FIX: was `input_data.columns = input_data.columns`, a
            # no-op self-assignment; take the columns from the sub-frame.
            input_data.columns = input_data.data_frame.columns

            stats_generator = StatsGenerator(session=None, transaction=self.transaction)
            try:
                with disable_console_output():
                    col_buckets_stats = stats_generator.run(input_data=input_data, modify_light_metadata=False, print_logs=False)
                buckets_stats[output_column][bucket].update(col_buckets_stats)
            except Exception:
                # Best-effort: per-bucket stats are informational only.
                pass

    return column_importance_dict, buckets_stats, columnless_prediction_distribution, all_columns_prediction_distribution