Example #1
    def run(self):
        """
        # Runs the model on the validation set in order to fit a probabilistic model that will evaluate the accuracy of future predictions
        """

        output_columns = self.transaction.lmd['predict_columns']
        input_columns = [
            col for col in self.transaction.lmd['columns']
            if col not in output_columns
            and col not in self.transaction.lmd['malformed_columns']['names']
        ]
        validation_dataset = {}

        for row_ind in self.transaction.input_data.validation_indexes[
                KEY_NO_GROUP_BY]:
            for col_ind, col in enumerate(self.transaction.lmd['columns']):
                if col in self.transaction.lmd['malformed_columns']['names']:
                    continue
                if col not in validation_dataset:
                    validation_dataset[col] = []
                validation_dataset[col].append(
                    self.transaction.input_data.data_array[row_ind][col_ind])

        # Test some hypotheses about our columns
        column_evaluator = ColumnEvaluator(self.transaction)
        column_importances, buckets_stats = column_evaluator.get_column_importance(
            model=self.transaction.model_backend,
            output_columns=output_columns,
            input_columns=input_columns,
            full_dataset=validation_dataset,
            stats=self.transaction.lmd['column_stats'])
        self.transaction.lmd['column_importances'] = column_importances
        self.transaction.lmd[
            'unusual_columns_buckets_importances'] = buckets_stats

        # Create a probabilistic validator for each of the predict columns
        probabilistic_validators = {}
        for col in output_columns:
            probabilistic_validators[col] = ProbabilisticValidator(
                col_stats=self.transaction.lmd['column_stats'][col])

        # Run on the validation set multiple times, each time with one of the columns blanked out
        for column_name in input_columns:
            ignore_columns = [column_name]

            predictions = self.transaction.model_backend.predict(
                'validate', ignore_columns)

            # Create a vector that has True for each feature that was passed to the model tester and False if it was blanked
            features_existence = [
                np_col not in ignore_columns for np_col in input_columns
            ]

            # A separate probabilistic model is trained for each predicted column; we may want to change this in the future, @TODO
            for pcol in output_columns:
                for i in range(len(predictions[pcol])):
                    predicted_val = predictions[pcol][i]
                    real_val = validation_dataset[pcol][i]
                    probabilistic_validators[pcol].register_observation(
                        features_existence=features_existence,
                        real_value=real_val,
                        predicted_value=predicted_val)

        for pcol in output_columns:
            probabilistic_validators[pcol].partial_fit()

        # Pickle for later use
        self.transaction.hmd['probabilistic_validators'] = {}
        for col in probabilistic_validators:
            self.transaction.hmd['probabilistic_validators'][col] = pickle_obj(
                probabilistic_validators[col])
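
Below is a minimal, self-contained sketch of the masking pattern used in the example above: for each input column, predictions are re-run on the validation data with that column blanked out, and each observation is registered together with a boolean features_existence mask. ToyValidator and toy_predict are illustrative stand-ins (they only mirror the register_observation / partial_fit calls shown above), not part of the codebase.

class ToyValidator:
    """Stand-in for ProbabilisticValidator: it only collects observations."""
    def __init__(self):
        self.observations = []

    def register_observation(self, features_existence, real_value, predicted_value):
        self.observations.append((tuple(features_existence), real_value, predicted_value))

    def partial_fit(self):
        # A real validator would fit its probabilistic model here.
        pass

def toy_predict(dataset, output_column, ignore_columns):
    # Pretend model backend: always predicts the mean of the target column.
    values = dataset[output_column]
    return [sum(values) / len(values)] * len(values)

input_columns = ['a', 'b']
output_columns = ['y']
validation_dataset = {'a': [1, 2, 3], 'b': [4, 5, 6], 'y': [1.0, 2.0, 3.0]}
validators = {col: ToyValidator() for col in output_columns}

for column_name in input_columns:
    ignore_columns = [column_name]
    predictions = {pcol: toy_predict(validation_dataset, pcol, ignore_columns)
                   for pcol in output_columns}
    # True for every feature that was kept, False for the blanked-out one
    features_existence = [col not in ignore_columns for col in input_columns]
    for pcol in output_columns:
        for real_val, predicted_val in zip(validation_dataset[pcol], predictions[pcol]):
            validators[pcol].register_observation(features_existence, real_val, predicted_val)

for pcol in output_columns:
    validators[pcol].partial_fit()
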
Example #2
    def run(self):
        """
        # Runs the model on the validation set in order to fit a probabilistic model that will evaluate the accuracy of future predictions
        """

        output_columns = self.transaction.lmd['predict_columns']
        input_columns = [col for col in self.transaction.lmd['columns'] if col not in output_columns and col not in self.transaction.lmd['malformed_columns']['names']]

        # Test some hypotheses about our columns

        if self.transaction.lmd['disable_optional_analysis'] is False:
            column_evaluator = ColumnEvaluator(self.transaction)
            column_importances, buckets_stats, columnless_prediction_distribution, all_columns_prediction_distribution = column_evaluator.get_column_importance(model=self.transaction.model_backend, output_columns=output_columns, input_columns=input_columns, full_dataset=self.transaction.input_data.validation_df, stats=self.transaction.lmd['column_stats'])

            self.transaction.lmd['column_importances'] = column_importances
            self.transaction.lmd['columns_buckets_importances'] = buckets_stats
            self.transaction.lmd['columnless_prediction_distribution'] = columnless_prediction_distribution
            self.transaction.lmd['all_columns_prediction_distribution'] = all_columns_prediction_distribution

        # Create a probabilistic validator for each of the predict columns
        probabilistic_validators = {}
        for col in output_columns:
            probabilistic_validators[col] = ProbabilisticValidator(
                col_stats=self.transaction.lmd['column_stats'][col])

        # Exclude file-path columns from the set of columns that can be blanked out
        ignorable_input_columns = []
        for input_column in input_columns:
            if self.transaction.lmd['column_stats'][input_column]['data_type'] != DATA_TYPES.FILE_PATH:
                ignorable_input_columns.append(input_column)


        # Run once on the validation set with every input column present
        predictions = self.transaction.model_backend.predict('validate')
        for pcol in output_columns:
            for i, real_val in enumerate(self.transaction.input_data.validation_df[pcol]):
                predicted_val = predictions[pcol][i]
                probabilistic_validators[pcol].register_observation(features_existence=[True for col in input_columns], real_value=real_val, predicted_value=predicted_val)

        # Run on the validation set multiple times, each time with one of the columns blanked out
        for column_name in ignorable_input_columns:
            ignore_columns = [column_name]

            predictions = self.transaction.model_backend.predict('validate', ignore_columns)

            # Create a vector that has True for each feature that was passed to the model tester and False if it was blanked
            features_existence = [np_col not in ignore_columns for np_col in input_columns]

            # A separate probabilistic model is trained for each predicted column; we may want to change this in the future, @TODO
            for pcol in output_columns:
                # Iterating the DataFrame itself would yield column names, so read the real values from the predicted column
                for i, real_val in enumerate(self.transaction.input_data.validation_df[pcol]):
                    predicted_val = predictions[pcol][i]
                    probabilistic_validators[pcol].register_observation(features_existence=features_existence, real_value=real_val, predicted_value=predicted_val)

        self.transaction.lmd['accuracy_histogram'] = {}

        total_accuracy = 0
        for pcol in output_columns:
            probabilistic_validators[pcol].partial_fit()
            accuracy_histogram, validation_set_accuracy = probabilistic_validators[pcol].get_accuracy_histogram()
            self.transaction.lmd['accuracy_histogram'][pcol] = accuracy_histogram
            total_accuracy += validation_set_accuracy
        self.transaction.lmd['validation_set_accuracy'] = total_accuracy/len(output_columns)

        # Pickle for later use
        self.transaction.hmd['probabilistic_validators'] = {}
        for col in probabilistic_validators:
            self.transaction.hmd['probabilistic_validators'][col] = pickle_obj(probabilistic_validators[col])
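
One pandas detail behind the masked loop in the second example: iterating a DataFrame directly yields its column labels rather than its cell values, which is why the real values are read from validation_df[pcol]. A quick self-contained demonstration (assuming validation_df is a pandas DataFrame, as the name suggests):

import pandas as pd

df = pd.DataFrame({'y': [1.0, 2.0, 3.0], 'a': [10, 20, 30]})

print(list(df))        # ['y', 'a'] -- iterating the frame gives column names
print(list(df['y']))   # [1.0, 2.0, 3.0] -- iterating a column gives the values the validator needs
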