Esempio n. 1
0
    def get_column_importance(self, model, output_columns, input_columns,
                              full_dataset, stats):
        """Score each input column by its effect on validation accuracy.

        A baseline accuracy is computed from ``model.predict('validate')``.
        Then, for every input column whose ``data_type`` in ``stats`` is not
        ``DATA_TYPES.FILE_PATH``, the model predicts twice more: once while
        ignoring all the other such columns and once while ignoring only
        this column. The importance of the column is the mean of
        ``col_only_accuracy / normal_accuracy`` and
        ``(normal_accuracy - col_missing_accuracy) / normal_accuracy``.

        Columns whose importance is above 0.8 or below 0.2 additionally get
        statistics generated for each of their value buckets, as long as the
        bucket holds more than 6 values.

        Side effect: the baseline predictions are stored on
        ``self.normal_predictions``.

        NOTE(review): nothing guards against ``normal_accuracy`` being zero;
        the normalisation divides by it.

        :param model: object providing ``predict('validate')`` and
            ``predict('validate', ignore_columns)``; the result is indexed
            by output column name.
        :param output_columns: iterable of output column names.
        :param input_columns: iterable of input column names; each must be
            a key of ``stats``.
        :param full_dataset: container indexed by column name; the values of
            ``full_dataset[input_column]`` are iterated. Also forwarded to
            ``evaluate_accuracy``.
        :param stats: mapping of column name to a stats mapping holding
            ``'data_type'`` and, optionally, ``'percentage_buckets'``.
        :return: tuple ``(column_importance_dict, buckets_stats,
            columnless_prediction_distribution,
            all_columns_prediction_distribution)``.
        """

        def run_output_stats(predictions, output_column):
            # Run the stats generator on a single predicted output column;
            # the result may be None.
            stats_generator = StatsGenerator(session=None,
                                             transaction=self.transaction)
            input_data = TransactionData()
            input_data.data_array = [[x] for x in predictions[output_column]]
            input_data.columns = [output_column]
            return stats_generator.run(input_data=input_data,
                                       modify_light_metadata=False)

        columnless_prediction_distribution = {}
        all_columns_prediction_distribution = {}

        self.normal_predictions = model.predict('validate')
        normal_accuracy = evaluate_accuracy(self.normal_predictions,
                                            full_dataset, stats,
                                            output_columns)
        column_importance_dict = {}
        buckets_stats = {}

        # Histogram for when all columns are present, in order to plot the force vectors
        for output_column in output_columns:
            validation_set_output_stats = run_output_stats(
                self.normal_predictions, output_column)
            if (validation_set_output_stats is not None and
                    'histogram' in validation_set_output_stats[output_column]):
                all_columns_prediction_distribution[output_column] = (
                    validation_set_output_stats[output_column]['histogram'])

        ignorable_input_columns = [
            input_column for input_column in input_columns
            if stats[input_column]['data_type'] != DATA_TYPES.FILE_PATH
        ]

        for input_column in ignorable_input_columns:
            # See what happens with the accuracy of the outputs if only this column is present
            ignore_columns = [
                col for col in ignorable_input_columns if col != input_column
            ]
            col_only_predictions = model.predict('validate', ignore_columns)
            col_only_accuracy = evaluate_accuracy(col_only_predictions,
                                                  full_dataset, stats,
                                                  output_columns)
            col_only_normalized_accuracy = col_only_accuracy / normal_accuracy

            # See what happens with the accuracy if all columns but this one are present
            col_missing_predictions = model.predict('validate', [input_column])
            col_missing_accuracy = evaluate_accuracy(col_missing_predictions,
                                                     full_dataset, stats,
                                                     output_columns)
            col_missing_reverse_accuracy = (
                normal_accuracy - col_missing_accuracy) / normal_accuracy

            column_importance = (col_only_normalized_accuracy +
                                 col_missing_reverse_accuracy) / 2
            column_importance_dict[input_column] = column_importance

            # Histogram for when the column is missing, in order to plot the force vectors
            for output_column in output_columns:
                per_input_histograms = (
                    columnless_prediction_distribution.setdefault(
                        output_column, {}))
                col_missing_output_stats = run_output_stats(
                    col_missing_predictions, output_column)
                if (col_missing_output_stats is not None and
                        'histogram' in col_missing_output_stats[output_column]):
                    per_input_histograms[input_column] = (
                        col_missing_output_stats[output_column]['histogram'])

            # Only columns that are either very important or not important
            # at all get stats computed for each of their buckets.
            if not (column_importance > 0.8 or column_importance < 0.2):
                continue

            # The bucket definition does not depend on the value, so look it
            # up once instead of once per value.
            column_stats = stats[input_column]
            if 'percentage_buckets' in column_stats:
                percentage_buckets = column_stats['percentage_buckets']
            else:
                percentage_buckets = None

            split_data = {}
            for value in full_dataset[input_column]:
                vb = get_value_bucket(value, percentage_buckets, column_stats)
                split_data.setdefault(f'{input_column}_bucket_{vb}',
                                      []).append(value)

            # A sub-bucket with 6 or fewer values is not relevant. The column
            # list is built once so that it lines up with the width of every
            # row (it used to be re-appended for each row).
            columns = [
                name for name, values in split_data.items() if len(values) > 6
            ]
            if not columns:
                # Nothing to analyse: no values at all or only tiny buckets.
                continue

            # Shorter buckets are padded with None so all rows are equally wide.
            max_length = max(len(split_data[name]) for name in columns)
            row_wise_data = [[
                split_data[name][i] if i < len(split_data[name]) else None
                for name in columns
            ] for i in range(max_length)]

            input_data = TransactionData()
            input_data.data_array = row_wise_data
            input_data.columns = columns

            stats_generator = StatsGenerator(session=None,
                                             transaction=self.transaction)
            col_buckets_stats = stats_generator.run(
                input_data=input_data, modify_light_metadata=False)

            # The generator result is treated as possibly None elsewhere in
            # this method; do the same here instead of failing in update().
            if col_buckets_stats is not None:
                buckets_stats.update(col_buckets_stats)

        return column_importance_dict, buckets_stats, columnless_prediction_distribution, all_columns_prediction_distribution
Esempio n. 2
0
    def get_column_importance(self, model, output_columns, input_columns,
                              full_dataset, stats):
        """Score each input column by its effect on validation accuracy.

        A baseline accuracy is computed from ``model.predict('validate')``.
        Then, for every input column whose ``data_type`` in ``stats`` is not
        ``DATA_TYPES.FILE_PATH``, the model predicts twice more: once while
        ignoring all the other such columns and once while ignoring only
        this column. The importance of the column is the mean of
        ``col_only_accuracy / normal_accuracy`` and
        ``(normal_accuracy - col_missing_accuracy) / normal_accuracy``.

        Afterwards the rows of ``full_dataset`` are grouped by the bucket of
        their value in each output column, and statistics are generated for
        every group of rows.

        Side effect: the baseline predictions are stored on
        ``self.normal_predictions``.

        NOTE(review): nothing guards against ``normal_accuracy`` being zero;
        the normalisation divides by it.

        :param model: object providing ``predict('validate')`` and
            ``predict('validate', ignore_columns)``; the result is indexed
            with ``[[output_column]]``.
        :param output_columns: iterable of output column names; each must be
            a key of ``stats``.
        :param input_columns: iterable of input column names; each must be
            a key of ``stats``.
        :param full_dataset: object providing ``iterrows()`` and ``.loc``
            (presumably a pandas DataFrame -- confirm against the caller).
        :param stats: mapping of column name to a stats mapping holding
            ``'data_type'`` and, optionally, ``'percentage_buckets'``.
        :return: tuple ``(column_importance_dict, buckets_stats,
            columnless_prediction_distribution,
            all_columns_prediction_distribution)`` where ``buckets_stats``
            is keyed by output column and then by bucket.
        """

        def run_output_stats(predictions, output_column):
            # Run the stats generator on a single predicted output column;
            # the result may be None.
            # @TODO: Running stats generator just to get the histogram is very inefficient, change this
            stats_generator = StatsGenerator(session=None,
                                             transaction=self.transaction)
            input_data = TransactionData()
            input_data.data_frame = predictions[[output_column]]
            input_data.columns = [output_column]
            return stats_generator.run(input_data=input_data,
                                       modify_light_metadata=False)

        columnless_prediction_distribution = {}
        all_columns_prediction_distribution = {}

        self.normal_predictions = model.predict('validate')
        normal_accuracy = evaluate_accuracy(self.normal_predictions,
                                            full_dataset, stats,
                                            output_columns)
        column_importance_dict = {}
        buckets_stats = {}

        # Histogram for when all columns are present, in order to plot the force vectors
        for output_column in output_columns:
            validation_set_output_stats = run_output_stats(
                self.normal_predictions, output_column)
            if (validation_set_output_stats is not None and
                    'histogram' in validation_set_output_stats[output_column]):
                all_columns_prediction_distribution[output_column] = (
                    validation_set_output_stats[output_column]['histogram'])

        ignorable_input_columns = [
            input_column for input_column in input_columns
            if stats[input_column]['data_type'] != DATA_TYPES.FILE_PATH
        ]

        for input_column in ignorable_input_columns:
            # See what happens with the accuracy of the outputs if only this column is present
            ignore_columns = [
                col for col in ignorable_input_columns if col != input_column
            ]
            col_only_predictions = model.predict('validate', ignore_columns)
            col_only_accuracy = evaluate_accuracy(col_only_predictions,
                                                  full_dataset, stats,
                                                  output_columns)
            col_only_normalized_accuracy = col_only_accuracy / normal_accuracy

            # See what happens with the accuracy if all columns but this one are present
            col_missing_predictions = model.predict('validate', [input_column])
            col_missing_accuracy = evaluate_accuracy(col_missing_predictions,
                                                     full_dataset, stats,
                                                     output_columns)
            col_missing_reverse_accuracy = (
                normal_accuracy - col_missing_accuracy) / normal_accuracy

            column_importance = (col_only_normalized_accuracy +
                                 col_missing_reverse_accuracy) / 2
            column_importance_dict[input_column] = column_importance

            # Histogram for when the column is missing, in order to plot the force vectors
            for output_column in output_columns:
                per_input_histograms = (
                    columnless_prediction_distribution.setdefault(
                        output_column, {}))
                col_missing_output_stats = run_output_stats(
                    col_missing_predictions, output_column)
                if (col_missing_output_stats is not None and
                        'histogram' in col_missing_output_stats[output_column]):
                    per_input_histograms[input_column] = (
                        col_missing_output_stats[output_column]['histogram'])

        # @TODO should we go back to generating this information based on the buckets of the input columns ? Or just keep doing the stats generation for the input columns based on the indexes of the buckets for the output column
        for output_column in output_columns:
            buckets_stats[output_column] = {}

            # The bucket definition does not depend on the row, so look it up
            # once instead of once per row.
            output_column_stats = stats[output_column]
            if 'percentage_buckets' in output_column_stats:
                percentage_buckets = output_column_stats['percentage_buckets']
            else:
                percentage_buckets = None

            # Group the row indexes by the bucket of the output value.
            bucket_indexes = {}
            for index, row in full_dataset.iterrows():
                value_bucket = get_value_bucket(row[output_column],
                                                percentage_buckets,
                                                output_column_stats)
                bucket_indexes.setdefault(value_bucket, []).append(index)

            for bucket, indexes in bucket_indexes.items():
                buckets_stats[output_column][bucket] = {}
                input_data = TransactionData()
                input_data.data_frame = full_dataset.loc[indexes]
                input_data.columns = input_data.data_frame.columns

                stats_generator = StatsGenerator(session=None,
                                                 transaction=self.transaction)
                # Best effort: a bucket whose stats cannot be generated keeps
                # its empty entry. Only Exception is caught so that
                # KeyboardInterrupt / SystemExit still propagate.
                try:
                    col_buckets_stats = stats_generator.run(
                        input_data=input_data, modify_light_metadata=False)
                    buckets_stats[output_column][bucket].update(
                        col_buckets_stats)
                except Exception:
                    print('Could not generate bucket stats for sub-bucket: {}'.
                          format(bucket))

        return column_importance_dict, buckets_stats, columnless_prediction_distribution, all_columns_prediction_distribution
Esempio n. 3
0
    def get_column_importance(self, model, output_columns, input_columns, full_dataset, stats):
        """Score each input column by its effect on validation accuracy.

        A baseline accuracy is computed from ``model.predict('validate')``.
        Input columns are then analysed one by one, skipping those whose
        ``data_type`` is ``DATA_TYPES.FILE_PATH`` and those named as the
        first element of an entry in ``self.transaction.lmd['model_order_by']``.
        For each remaining column the model predicts twice more: once while
        ignoring all the other analysed columns and once while ignoring only
        this column. The score is computed as::

            combined = max(0, ((normal - col_missing) + col_only) / 2)
            importance = max(1, 10 * (1 - (normal - combined) / normal))

        Afterwards the rows of ``full_dataset`` are grouped by the bucket of
        their value in each output column, and statistics are generated for
        every group of rows. A group whose statistics cannot be generated
        keeps an empty entry.

        NOTE(review): nothing guards against ``normal_accuracy`` being zero;
        the normalisation divides by it.

        :param model: object providing ``predict('validate')`` and
            ``predict('validate', ignore_columns)``; the result is indexed
            by output column name.
        :param output_columns: iterable of output column names; each must be
            a key of ``stats``.
        :param input_columns: iterable of input column names; each must be
            a key of ``stats``.
        :param full_dataset: object providing ``iterrows()`` and ``.loc``
            (presumably a pandas DataFrame -- confirm against the caller).
        :param stats: mapping of column name to a stats mapping holding
            ``'data_type'``, ``'data_subtype'`` (for output columns) and,
            optionally, ``'percentage_buckets'``.
        :return: tuple ``(column_importance_dict, buckets_stats,
            columnless_prediction_distribution,
            all_columns_prediction_distribution)`` where ``buckets_stats``
            is keyed by output column and then by bucket.
        """
        import logging

        columnless_prediction_distribution = {}
        all_columns_prediction_distribution = {}

        with disable_console_output(True):
            normal_predictions = model.predict('validate')
        normal_accuracy = evaluate_accuracy(normal_predictions, full_dataset, stats, output_columns)
        column_importance_dict = {}
        buckets_stats = {}

        # Histogram for when all columns are present, in order to plot the force vectors
        for output_column in output_columns:
            validation_set_output_column_histogram, _ = StatsGenerator.get_histogram(
                normal_predictions[output_column],
                data_type=stats[output_column]['data_type'],
                data_subtype=stats[output_column]['data_subtype'])

            if validation_set_output_column_histogram is not None:
                all_columns_prediction_distribution[output_column] = validation_set_output_column_histogram

        # Build the list of order-by column names once rather than once per input column.
        order_by_columns = [x[0] for x in self.transaction.lmd['model_order_by']]
        ignorable_input_columns = [
            input_column for input_column in input_columns
            if stats[input_column]['data_type'] != DATA_TYPES.FILE_PATH
            and input_column not in order_by_columns
        ]

        for input_column in ignorable_input_columns:
            # See what happens with the accuracy of the outputs if only this column is present
            ignore_columns = [col for col in ignorable_input_columns if col != input_column]
            with disable_console_output(True):
                col_only_predictions = model.predict('validate', ignore_columns)
            col_only_accuracy = evaluate_accuracy(col_only_predictions, full_dataset, stats, output_columns)

            # See what happens with the accuracy if all columns but this one are present
            with disable_console_output(True):
                col_missing_predictions = model.predict('validate', [input_column])
            col_missing_accuracy = evaluate_accuracy(col_missing_predictions, full_dataset, stats, output_columns)

            combined_column_accuracy = ((normal_accuracy - col_missing_accuracy) + col_only_accuracy)/2
            if combined_column_accuracy < 0:
                combined_column_accuracy = 0
            column_importance = 10*(1 - (normal_accuracy - combined_column_accuracy)/normal_accuracy)
            if column_importance < 1:
                column_importance = 1
            column_importance_dict[input_column] = column_importance

            # Histogram for when the column is missing, in order to plot the force vectors
            for output_column in output_columns:
                per_input_histograms = columnless_prediction_distribution.setdefault(output_column, {})

                col_missing_output_histogram, _ = StatsGenerator.get_histogram(
                    col_missing_predictions[output_column],
                    data_type=stats[output_column]['data_type'],
                    data_subtype=stats[output_column]['data_subtype'])

                if col_missing_output_histogram is not None:
                    per_input_histograms[input_column] = col_missing_output_histogram

        # @TODO should we go back to generating this information based on the buckets of the input columns ? Or just keep doing the stats generation for the input columns based on the indexes of the buckets for the output column
        for output_column in output_columns:
            buckets_stats[output_column] = {}

            # The bucket definition does not depend on the row, so look it up
            # once instead of once per row.
            output_column_stats = stats[output_column]
            if 'percentage_buckets' in output_column_stats:
                percentage_buckets = output_column_stats['percentage_buckets']
            else:
                percentage_buckets = None

            # Group the row indexes by the bucket of the output value.
            bucket_indexes = {}
            for index, row in full_dataset.iterrows():
                value_bucket = get_value_bucket(row[output_column], percentage_buckets, output_column_stats, self.transaction.hmd)
                bucket_indexes.setdefault(value_bucket, []).append(index)

            for bucket, indexes in bucket_indexes.items():
                buckets_stats[output_column][bucket] = {}
                input_data = TransactionData()
                input_data.data_frame = full_dataset.loc[indexes]
                # Take the column names from the selected rows; this used to
                # be a self-assignment that left `columns` untouched.
                input_data.columns = list(input_data.data_frame.columns)

                stats_generator = StatsGenerator(session=None, transaction=self.transaction)
                # Best effort: a failing bucket keeps its empty entry. The
                # failure is recorded at debug level rather than discarded,
                # so the console stays quiet unless logging is configured.
                try:
                    with disable_console_output():
                        col_buckets_stats = stats_generator.run(input_data=input_data, modify_light_metadata=False, print_logs=False)
                    buckets_stats[output_column][bucket].update(col_buckets_stats)
                except Exception:
                    logging.getLogger(__name__).debug(
                        'Could not generate bucket stats for sub-bucket: %s', bucket, exc_info=True)

        return column_importance_dict, buckets_stats, columnless_prediction_distribution, all_columns_prediction_distribution