Example #1
    def _execute_predict(self):
        """
        :return:
        """
        old_lmd = {}
        for k in self.lmd:
            old_lmd[k] = self.lmd[k]

        old_hmd = {}
        for k in self.hmd:
            old_hmd[k] = self.hmd[k]
        self.load_metadata()

        for k in old_lmd:
            if old_lmd[k] is not None:
                self.lmd[k] = old_lmd[k]
            else:
                if k not in self.lmd:
                    self.lmd[k] = None

        for k in old_hmd:
            if old_hmd[k] is not None:
                self.hmd[k] = old_hmd[k]
            else:
                if k not in self.hmd:
                    self.hmd[k] = None

        if self.lmd is None:
            self.log.error('No metadata found for this model')
            return

        self._call_phase_module(clean_exit=True, module_name='DataExtractor')

        if self.input_data.data_frame.shape[0] <= 0:
            self.log.error('No input data provided!')
            return
        if self.lmd['model_is_time_series']:
            self._call_phase_module(clean_exit=True,
                                    module_name='DataSplitter')

        # @TODO Maybe move this to a separate "PredictionAnalysis" phase?
        if self.lmd['run_confidence_variation_analysis']:
            nulled_out_data = []
            nulled_out_columns = []
            for column in self.input_data.columns:
                # Only adapted for a single `when` (only the first input row is considered)
                if self.input_data.data_frame.iloc[0][column] is not None:
                    nulled_out_data.append(
                        self.input_data.data_frame.iloc[0].copy())
                    nulled_out_data[-1][column] = None
                    nulled_out_columns.append(column)

            nulled_out_data = pd.DataFrame(nulled_out_data)

        for mode in ['predict', 'analyze_confidence']:
            if mode == 'analyze_confidence':
                if not self.lmd['run_confidence_variation_analysis']:
                    continue
                else:
                    self.input_data.data_frame = nulled_out_data

            self._call_phase_module(clean_exit=True,
                                    module_name='DataTransformer',
                                    input_data=self.input_data)

            self._call_phase_module(clean_exit=True,
                                    module_name='ModelInterface',
                                    mode='predict')

            output_data = {col: [] for col in self.lmd['columns']}

            for column in self.input_data.columns:
                output_data[column] = list(self.input_data.data_frame[column])

            for predicted_col in self.lmd['predict_columns']:
                output_data[predicted_col] = list(
                    self.hmd['predictions'][predicted_col])
                for extra_column in [
                        f'{predicted_col}_model_confidence',
                        f'{predicted_col}_confidence_range'
                ]:
                    if extra_column in self.hmd['predictions']:
                        output_data[extra_column] = self.hmd['predictions'][
                            extra_column]

                probabilistic_validator = unpickle_obj(
                    self.hmd['probabilistic_validators'][predicted_col])
                output_data[f'{predicted_col}_confidence'] = [None] * len(
                    output_data[predicted_col])

                output_data[f'model_{predicted_col}'] = deepcopy(
                    output_data[predicted_col])
                for row_number, predicted_value in enumerate(
                        output_data[predicted_col]):

                    # Compute the feature existence vector
                    input_columns = [
                        col for col in self.input_data.columns
                        if col not in self.lmd['predict_columns']
                    ]
                    features_existance_vector = [
                        str(output_data[col][row_number]) not in
                        ('None', 'nan', '', 'Nan', 'NAN', 'NaN')
                        for col in input_columns
                        if col not in self.lmd['columns_to_ignore']
                    ]

                    # Create the probabilistic evaluation
                    probability_true_prediction = probabilistic_validator.evaluate_prediction_accuracy(
                        features_existence=features_existance_vector,
                        predicted_value=predicted_value)

                    output_data[f'{predicted_col}_confidence'][
                        row_number] = probability_true_prediction

            if mode == 'predict':
                self.output_data = PredictTransactionOutputData(
                    transaction=self, data=output_data)
            else:
                nulled_out_predictions = PredictTransactionOutputData(
                    transaction=self, data=output_data)

        if self.lmd['run_confidence_variation_analysis']:
            input_confidence = {}
            extra_insights = {}

            for predicted_col in self.lmd['predict_columns']:
                input_confidence[predicted_col] = {}
                extra_insights[predicted_col] = {'if_missing': []}

                actual_confidence = self.output_data[0].explanation[
                    predicted_col]['confidence']

                for i, nulled_col_name in enumerate(nulled_out_columns):
                    nulled_out_predicted_value = nulled_out_predictions[
                        i].explanation[predicted_col]['predicted_value']
                    nulled_confidence = nulled_out_predictions[i].explanation[
                        predicted_col]['confidence']
                    confidence_variation = actual_confidence - nulled_confidence

                    input_confidence[predicted_col][nulled_col_name] = round(
                        confidence_variation, 3)
                    extra_insights[predicted_col]['if_missing'].append(
                        {nulled_col_name: nulled_out_predicted_value})

            self.output_data.input_confidence = input_confidence
            self.output_data.extra_insights = extra_insights
        return
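
A note on the technique: the confidence-variation pass above clones the first input row once per populated column, nulls out that column, re-runs prediction on the cloned rows, and reports the per-column drop in confidence. The following is a minimal standalone sketch of that idea, not the pipeline itself; `predict_confidence` is a hypothetical stand-in for the full predict-plus-probabilistic-validation pass.

import pandas as pd


def confidence_variation(row, predict_confidence):
    """For each populated column, null it out and measure the confidence drop.

    `predict_confidence` is a hypothetical callable mapping a row to a
    confidence in [0, 1]; in the real pipeline this is a full predict run.
    """
    baseline = predict_confidence(row)
    variation = {}
    for column in row.index:
        if row[column] is None:
            continue  # nothing to null out
        nulled = row.copy()
        nulled[column] = None
        variation[column] = round(baseline - predict_confidence(nulled), 3)
    return variation


if __name__ == '__main__':
    # Toy confidence: the fraction of fields that are present.
    demo_row = pd.Series({'sqft': 120, 'rooms': 3, 'location': 'center'})

    def toy_conf(r):
        return sum(v is not None for v in r) / len(r)

    print(confidence_variation(demo_row, toy_conf))
    # {'sqft': 0.333, 'rooms': 0.333, 'location': 0.333}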
Example #2
    def _execute_predict(self):
        """
        :return:
        """
        old_lmd = {}
        for k in self.lmd:
            old_lmd[k] = self.lmd[k]

        old_hmd = {}
        for k in self.hmd:
            old_hmd[k] = self.hmd[k]
        self.load_metadata()

        for k in old_lmd:
            if old_lmd[k] is not None:
                self.lmd[k] = old_lmd[k]
            else:
                if k not in self.lmd:
                    self.lmd[k] = None

        for k in old_hmd:
            if old_hmd[k] is not None:
                self.hmd[k] = old_hmd[k]
            else:
                if k not in self.hmd:
                    self.hmd[k] = None

        if self.lmd is None:
            self.log.error('No metadata found for this model')
            return

        self._call_phase_module(clean_exit=True, module_name='DataExtractor')

        if self.input_data.data_frame.shape[0] <= 0:
            self.log.error('No input data provided!')
            return

        self._call_phase_module(clean_exit=True,
                                module_name='DataTransformer',
                                input_data=self.input_data)

        self._call_phase_module(clean_exit=True,
                                module_name='ModelInterface',
                                mode='predict')

        output_data = {col: [] for col in self.lmd['columns']}
        evaluations = {}

        for column in self.input_data.data_frame.columns:
            output_data[column] = list(self.input_data.data_frame[column])

        for predicted_col in self.lmd['predict_columns']:
            output_data[predicted_col] = list(
                self.hmd['predictions'][predicted_col])

            probabilistic_validator = unpickle_obj(
                self.hmd['probabilistic_validators'][predicted_col])
            confidence_column_name = f'{predicted_col}_confidence'
            output_data[confidence_column_name] = [None] * len(
                output_data[predicted_col])
            evaluations[predicted_col] = [None] * len(
                output_data[predicted_col])

            for row_number, predicted_value in enumerate(
                    output_data[predicted_col]):

                # Compute the feature existence vector
                input_columns = [
                    col for col in self.input_data.columns
                    if col not in self.lmd['predict_columns']
                ]
                features_existance_vector = [
                    output_data[col][row_number] is not None
                    for col in input_columns
                    if col not in self.lmd['malformed_columns']['names']
                ]

                # Create the probabilistic evaluation
                prediction_evaluation = probabilistic_validator.evaluate_prediction_accuracy(
                    features_existence=features_existance_vector,
                    predicted_value=predicted_value)
                if type(prediction_evaluation) == float:
                    output_data[confidence_column_name][
                        row_number] = prediction_evaluation
                    evaluations[predicted_col][row_number] = None
                else:
                    output_data[confidence_column_name][
                        row_number] = prediction_evaluation.most_likely_probability
                    evaluations[predicted_col][
                        row_number] = prediction_evaluation

        self.output_data = PredictTransactionOutputData(
            transaction=self, data=output_data, evaluations=evaluations)

        return
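
A recurring building block in these variants is the feature-existence vector: for each row, one boolean per input column (prediction targets and malformed/ignored columns excluded) saying whether a value was actually present, which is then handed to the probabilistic validator. A minimal sketch of that construction, assuming (as in this variant) that a plain `None` marks a missing value:

from typing import Dict, List, Sequence


def build_existence_vectors(output_data: Dict[str, List],
                            input_columns: Sequence[str],
                            skip_columns: Sequence[str]) -> List[List[bool]]:
    """One boolean vector per row: True where the input column had a value."""
    columns = [col for col in input_columns if col not in skip_columns]
    n_rows = len(next(iter(output_data.values()), []))
    return [
        [output_data[col][row] is not None for col in columns]
        for row in range(n_rows)
    ]


# Example: two rows; the prediction target is assumed to be excluded upstream.
data = {'sqft': [120, None], 'rooms': [3, 2]}
print(build_existence_vectors(data, ['sqft', 'rooms'], skip_columns=[]))
# [[True, True], [False, True]]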
Example #3
    def _execute_predict(self):
        """
        :return:
        """
        old_lmd = {}
        for k in self.lmd:
            old_lmd[k] = self.lmd[k]

        old_hmd = {}
        for k in self.hmd:
            old_hmd[k] = self.hmd[k]
        self.load_metadata()

        for k in old_lmd:
            if old_lmd[k] is not None:
                self.lmd[k] = old_lmd[k]
            else:
                if k not in self.lmd:
                    self.lmd[k] = None

        for k in old_hmd:
            if old_hmd[k] is not None:
                self.hmd[k] = old_hmd[k]
            else:
                if k not in self.hmd:
                    self.hmd[k] = None

        if self.lmd is None:
            self.log.error('No metadata found for this model')
            return

        self._call_phase_module(clean_exit=True, module_name='DataExtractor')

        if self.input_data.data_frame.shape[0] <= 0:
            self.output_data = self.input_data
            return

        self.output_data = PredictTransactionOutputData(transaction=self)

        self._call_phase_module(clean_exit=True, module_name='ModelInterface', mode='predict')

        self.output_data.data = {col: [] for col in self.input_data.columns}
        input_columns = [col for col in self.input_data.columns if col not in self.lmd['predict_columns']]

        for i, row in self.input_data.data_frame.iterrows():
            for index, cell in enumerate(row):
                col = self.input_data.columns[index]
                self.output_data.data[col].append(cell)

        for predicted_col in self.lmd['predict_columns']:
            probabilistic_validator = unpickle_obj(self.hmd['probabilistic_validators'][predicted_col])

            predicted_values = self.hmd['predictions'][predicted_col]
            self.output_data.data[predicted_col] = predicted_values
            confidence_column_name = "{col}_confidence".format(col=predicted_col)
            self.output_data.data[confidence_column_name] = [None] * len(predicted_values)
            self.output_data.evaluations[predicted_col] = [None] * len(predicted_values)

            for row_number, predicted_value in enumerate(predicted_values):
                features_existance_vector = [
                    self.output_data.data[col][row_number] is not None
                    for col in input_columns
                    if col not in self.lmd['malformed_columns']['names']
                ]
                prediction_evaluation = probabilistic_validator.evaluate_prediction_accuracy(features_existence=features_existance_vector, predicted_value=predicted_value)
                self.output_data.data[confidence_column_name][row_number] = prediction_evaluation
                # output_data[col][row_number] = prediction_evaluation.most_likely_value
                # @TODO Is it correct to replace the predicted value with the most likely one? Seems wrong.
                self.output_data.evaluations[predicted_col][row_number] = prediction_evaluation

        #self.save_metadata()

        return
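
All five variants open with the same metadata overlay: snapshot the in-memory `lmd`/`hmd` dicts, reload the metadata, then put back every snapshot value that was not None, keeping a None placeholder for keys the loaded copy lacks. A compact sketch of that merge rule, written as a plain function for illustration:

def overlay_metadata(loaded, snapshot):
    """Merge a pre-load snapshot back over freshly loaded metadata.

    Non-None snapshot values win; keys only present in the snapshot are kept
    with a None placeholder, mirroring the loops in the variants above.
    """
    merged = dict(loaded)
    for key, value in snapshot.items():
        if value is not None:
            merged[key] = value
        elif key not in merged:
            merged[key] = None
    return merged


loaded = {'name': 'house_model', 'predict_columns': ['price']}
snapshot = {'predict_columns': None, 'run_confidence_variation_analysis': None}
print(overlay_metadata(loaded, snapshot))
# {'name': 'house_model', 'predict_columns': ['price'],
#  'run_confidence_variation_analysis': None}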
Example #4
    def _execute_predict(self):
        """
        :return:
        """
        old_lmd = {}
        for k in self.lmd:
            old_lmd[k] = self.lmd[k]

        old_hmd = {}
        for k in self.hmd:
            old_hmd[k] = self.hmd[k]
        self.load_metadata()

        for k in old_lmd:
            if old_lmd[k] is not None:
                self.lmd[k] = old_lmd[k]
            else:
                if k not in self.lmd:
                    self.lmd[k] = None

        for k in old_hmd:
            if old_hmd[k] is not None:
                self.hmd[k] = old_hmd[k]
            else:
                if k not in self.hmd:
                    self.hmd[k] = None

        if self.lmd is None:
            self.log.error('No metadata found for this model')
            return

        self._call_phase_module(clean_exit=True, module_name='DataExtractor')

        if self.input_data.data_frame.shape[0] <= 0:
            self.log.error('No input data provided!')
            return
        if self.lmd['model_is_time_series']:
            self._call_phase_module(clean_exit=True,
                                    module_name='DataSplitter')

        # @TODO Maybe move this to a separate "PredictionAnalysis" phase?
        if self.lmd['run_confidence_variation_analysis']:
            nulled_out_data = []
            nulled_out_columns = []
            for column in self.input_data.columns:
                # Only adapted for a single `when` (only the first input row is considered)
                if self.input_data.data_frame.iloc[0][column] is not None:
                    nulled_out_data.append(
                        self.input_data.data_frame.iloc[0].copy())
                    nulled_out_data[-1][column] = None
                    nulled_out_columns.append(column)

            nulled_out_data = pd.DataFrame(nulled_out_data)
            nulled_out_predictions = []

        for mode in ['predict', 'analyze_confidence']:
            if mode == 'analyze_confidence':
                if not self.lmd['run_confidence_variation_analysis']:
                    continue
                else:
                    self.input_data.data_frame = nulled_out_data

            self._call_phase_module(clean_exit=True,
                                    module_name='DataTransformer',
                                    input_data=self.input_data)

            self._call_phase_module(clean_exit=True,
                                    module_name='ModelInterface',
                                    mode='predict')

            output_data = {col: [] for col in self.lmd['columns']}
            evaluations = {}

            for column in self.input_data.columns:
                output_data[column] = list(self.input_data.data_frame[column])

            for predicted_col in self.lmd['predict_columns']:
                output_data[predicted_col] = list(
                    self.hmd['predictions'][predicted_col])
                if f'{predicted_col}_confidences' in self.hmd['predictions']:
                    output_data[
                        f'{predicted_col}_model_confidence'] = self.hmd[
                            'predictions'][f'{predicted_col}_confidences']

                probabilistic_validator = unpickle_obj(
                    self.hmd['probabilistic_validators'][predicted_col])
                confidence_column_name = f'{predicted_col}_confidence'
                output_data[confidence_column_name] = [None] * len(
                    output_data[predicted_col])
                evaluations[predicted_col] = [None] * len(
                    output_data[predicted_col])

                output_data[f'model_{predicted_col}'] = deepcopy(
                    output_data[predicted_col])
                for row_number, predicted_value in enumerate(
                        output_data[predicted_col]):

                    # Compute the feature existence vector
                    input_columns = [
                        col for col in self.input_data.columns
                        if col not in self.lmd['predict_columns']
                    ]
                    features_existance_vector = [
                        output_data[col][row_number] is not None
                        for col in input_columns
                        if col not in self.lmd['columns_to_ignore']
                    ]

                    # Create the probabilistic evaluation
                    prediction_evaluation = probabilistic_validator.evaluate_prediction_accuracy(
                        features_existence=features_existance_vector,
                        predicted_value=predicted_value)

                    output_data[predicted_col][
                        row_number] = prediction_evaluation.final_value
                    output_data[confidence_column_name][
                        row_number] = prediction_evaluation.most_likely_probability
                    evaluations[predicted_col][
                        row_number] = prediction_evaluation

                if f'{predicted_col}_model_confidence' in output_data:
                    # Rescale the model confidence into the range of the probabilistic validator's confidences
                    mc_arr = np.array(
                        output_data[f'{predicted_col}_model_confidence'])

                    normalized_model_confidences = (
                        (mc_arr - np.min(mc_arr)) /
                        (np.max(mc_arr) - np.min(mc_arr))
                    ) * (np.max(output_data[confidence_column_name]) -
                         np.min(output_data[confidence_column_name])) + np.min(
                             output_data[confidence_column_name])

                    # If the raw model confidence is smaller than the rescaled value,
                    # use the raw confidence directly; negative values are clamped to zero first.
                    for i in range(
                            len(output_data[
                                f'{predicted_col}_model_confidence'])):
                        if output_data[f'{predicted_col}_model_confidence'][
                                i] < 0:
                            output_data[f'{predicted_col}_model_confidence'][
                                i] = 0
                        output_data[f'{predicted_col}_model_confidence'][
                            i] = min(
                                output_data[
                                    f'{predicted_col}_model_confidence'][i],
                                normalized_model_confidences[i])

            if mode == 'predict':
                self.output_data = PredictTransactionOutputData(
                    transaction=self,
                    data=output_data,
                    evaluations=evaluations)
            else:
                nulled_out_predictions.append(
                    PredictTransactionOutputData(transaction=self,
                                                 data=output_data,
                                                 evaluations=evaluations))

        if self.lmd['run_confidence_variation_analysis']:
            input_confidence_arr = [{}]

            for predicted_col in self.lmd['predict_columns']:
                input_confidence_arr[0][predicted_col] = {
                    'column_names': [],
                    'confidence_variation': []
                }
                actual_confidence = self.output_data[0].explain(
                )[predicted_col][0]['confidence']
                for i in range(len(nulled_out_columns)):
                    nulled_confidence = nulled_out_predictions[0][i].explain(
                    )[predicted_col][0]['confidence']
                    nulled_col_name = nulled_out_columns[i]
                    confidence_variation = actual_confidence - nulled_confidence

                    input_confidence_arr[0][predicted_col][
                        'column_names'].append(nulled_col_name)
                    input_confidence_arr[0][predicted_col][
                        'confidence_variation'].append(confidence_variation)

                input_confidence_arr[0][predicted_col][
                    'confidence_variation_score'] = list(
                        np.interp(
                            input_confidence_arr[0][predicted_col]
                            ['confidence_variation'],
                            (np.min(input_confidence_arr[0][predicted_col]
                                    ['confidence_variation']),
                             np.max(input_confidence_arr[0][predicted_col]
                                    ['confidence_variation'])), (-100, 100)))

            self.output_data.input_confidence_arr = input_confidence_arr

        return
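
Two post-processing tricks are specific to this variant. First, the raw model confidences are min-max rescaled into the range spanned by the probabilistic validator's confidences, and each value is then capped at the smaller of its zero-clamped raw confidence and its rescaled counterpart. A vectorised sketch of that combination (handling the all-equal edge case, which the code above would hit as a division by zero):

import numpy as np


def blend_model_confidence(model_conf, validator_conf):
    """Rescale raw model confidences into the validator's confidence range,
    then take the elementwise minimum with the zero-clamped raw values."""
    mc = np.asarray(model_conf, dtype=float)
    vc = np.asarray(validator_conf, dtype=float)

    mc_span = mc.max() - mc.min()
    if mc_span == 0:
        # All raw confidences equal: pin the rescaled values to the
        # validator minimum instead of dividing by zero.
        rescaled = np.full_like(mc, vc.min())
    else:
        rescaled = (mc - mc.min()) / mc_span * (vc.max() - vc.min()) + vc.min()

    return np.minimum(np.clip(mc, 0, None), rescaled)


print(blend_model_confidence([-0.2, 0.5, 0.9], [0.6, 0.7, 0.95]))
# [0.  0.5 0.9]

Second, the `np.interp(...)` call near the end plays the analogous role for the per-column confidence variations, linearly mapping them from their observed min/max onto a [-100, 100] score.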
Example #5
    def _execute_predict(self):
        """
        :return:
        """
        old_lmd = {}
        for k in self.lmd:
            old_lmd[k] = self.lmd[k]

        old_hmd = {}
        for k in self.hmd:
            old_hmd[k] = self.hmd[k]
        with open(os.path.join(CONFIG.MINDSDB_STORAGE_PATH, self.lmd['name'] + '_light_model_metadata.pickle'), 'rb') as fp:
            self.lmd = pickle.load(fp)

        with open(os.path.join(CONFIG.MINDSDB_STORAGE_PATH, self.hmd['name'] + '_heavy_model_metadata.pickle'), 'rb') as fp:
            self.hmd = pickle.load(fp)

        for k in old_lmd:
            if old_lmd[k] is not None:
                self.lmd[k] = old_lmd[k]
            else:
                if k not in self.lmd:
                    self.lmd[k] = None

        for k in old_hmd:
            if old_hmd[k] is not None:
                self.hmd[k] = old_hmd[k]
            else:
                if k not in self.hmd:
                    self.hmd[k] = None

        if self.lmd is None:
            self.log.error('No metadata found for this model')
            return

        self._call_phase_module('DataExtractor')

        if len(self.input_data.data_array[0]) <= 0:
            self.output_data = self.input_data
            return

        self.output_data = PredictTransactionOutputData(transaction=self)

        # Only the Ludwig backend is handled here; with any other backend
        # `predictions` would be left undefined below.
        if self.lmd['model_backend'] == 'ludwig':
            self.model_backend = LudwigBackend(self)
            predictions = self.model_backend.predict()

        self.output_data.data = {col: [] for col in self.input_data.columns}
        input_columns = [col for col in self.input_data.columns if col not in self.lmd['predict_columns']]

        for row in self.input_data.data_array:
            for index, cell in enumerate(row):
                col = self.input_data.columns[index]
                self.output_data.data[col].append(cell)

        for predicted_col in self.lmd['predict_columns']:
            probabilistic_validator = unpickle_obj(self.hmd['probabilistic_validators'][predicted_col])

            predicted_values = predictions[predicted_col]
            self.output_data.data[predicted_col] = predicted_values
            confidence_column_name = "{col}_confidence".format(col=predicted_col)
            self.output_data.data[confidence_column_name] = [None] * len(predicted_values)
            self.output_data.evaluations[predicted_col] = [None] * len(predicted_values)

            for row_number, predicted_value in enumerate(predicted_values):
                features_existance_vector = [
                    self.output_data.data[col][row_number] is not None
                    for col in input_columns
                ]
                prediction_evaluation = probabilistic_validator.evaluate_prediction_accuracy(features_existence=features_existance_vector, predicted_value=predicted_value)
                self.output_data.data[confidence_column_name][row_number] = prediction_evaluation
                # output_data[col][row_number] = prediction_evaluation.most_likely_value
                # @TODO Is it correct to replace the predicted value with the most likely one? Seems wrong.
                self.output_data.evaluations[predicted_col][row_number] = prediction_evaluation

        with open(os.path.join(CONFIG.MINDSDB_STORAGE_PATH, self.lmd['name'] + '_light_model_metadata.pickle'), 'wb') as fp:
            self.lmd['updated_at'] = str(datetime.datetime.now())
            pickle.dump(self.lmd, fp)

        with open(os.path.join(CONFIG.MINDSDB_STORAGE_PATH, self.hmd['name'] + '_heavy_model_metadata.pickle'), 'wb') as fp:
            # Don't save data for now
            self.hmd['from_data'] = None
            self.hmd['test_from_data'] = None
            pickle.dump(self.hmd, fp)

        return
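
Unlike the other variants, this one (de)serialises the metadata inline rather than going through `load_metadata()`/`save_metadata()`: each dict is pickled to `<name>_light_model_metadata.pickle` / `<name>_heavy_model_metadata.pickle` under the storage path, with the heavy copy stripped of its raw data frames first. A small self-contained sketch of the same round-trip for the light metadata, with `storage_path` standing in for `CONFIG.MINDSDB_STORAGE_PATH`:

import datetime
import os
import pickle


def save_light_metadata(lmd, storage_path):
    """Pickle the light metadata, stamping it with the save time."""
    lmd['updated_at'] = str(datetime.datetime.now())
    path = os.path.join(storage_path, lmd['name'] + '_light_model_metadata.pickle')
    with open(path, 'wb') as fp:
        pickle.dump(lmd, fp)
    return path


def load_light_metadata(name, storage_path):
    path = os.path.join(storage_path, name + '_light_model_metadata.pickle')
    with open(path, 'rb') as fp:
        return pickle.load(fp)


if __name__ == '__main__':
    import tempfile
    with tempfile.TemporaryDirectory() as tmp:
        save_light_metadata({'name': 'house_model', 'predict_columns': ['price']}, tmp)
        print(load_light_metadata('house_model', tmp)['predict_columns'])  # ['price']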