def _execute_predict(self):
    """Run the prediction pipeline for this transaction.

    Reloads persisted model metadata (preferring any values already set on
    this transaction), extracts and transforms the input data, runs the
    model, and scores each prediction with the per-column probabilistic
    validator. When ``run_confidence_variation_analysis`` is enabled, the
    first input row is re-predicted once per non-null column with that
    column nulled out, and the per-column confidence deltas are attached
    to ``self.output_data``.

    Side effects: sets ``self.output_data``; may replace
    ``self.input_data.data_frame`` during the confidence-variation pass.

    :return: None
    """
    # Snapshot current metadata so transaction-local overrides survive the reload.
    old_lmd = {}
    for k in self.lmd:
        old_lmd[k] = self.lmd[k]

    old_hmd = {}
    for k in self.hmd:
        old_hmd[k] = self.hmd[k]

    self.load_metadata()

    # Re-apply non-None transaction-local values on top of the loaded metadata;
    # keys missing from both sides are explicitly set to None.
    for k in old_lmd:
        if old_lmd[k] is not None:
            self.lmd[k] = old_lmd[k]
        else:
            if k not in self.lmd:
                self.lmd[k] = None

    for k in old_hmd:
        if old_hmd[k] is not None:
            self.hmd[k] = old_hmd[k]
        else:
            if k not in self.hmd:
                self.hmd[k] = None

    if self.lmd is None:
        self.log.error('No metadata found for this model')
        return

    self._call_phase_module(clean_exit=True, module_name='DataExtractor')

    if self.input_data.data_frame.shape[0] <= 0:
        self.log.error('No input data provided !')
        return

    if self.lmd['model_is_time_series']:
        self._call_phase_module(clean_exit=True, module_name='DataSplitter')

    # @TODO Maybe move to a separate "PredictionAnalysis" phase ?
    if self.lmd['run_confidence_variation_analysis']:
        # Build one copy of the first input row per non-null column, with that
        # column nulled out, to measure each column's contribution to confidence.
        nulled_out_data = []
        nulled_out_columns = []
        for column in self.input_data.columns:
            # Only adapted for a single `when`
            if self.input_data.data_frame.iloc[0][column] is not None:
                nulled_out_data.append(self.input_data.data_frame.iloc[0].copy())
                nulled_out_data[-1][column] = None
                nulled_out_columns.append(column)
        nulled_out_data = pd.DataFrame(nulled_out_data)

    for mode in ['predict', 'analyze_confidence']:
        if mode == 'analyze_confidence':
            if not self.lmd['run_confidence_variation_analysis']:
                continue
            else:
                # Second pass predicts on the nulled-out rows instead.
                self.input_data.data_frame = nulled_out_data

        self._call_phase_module(clean_exit=True, module_name='DataTransformer',
                                input_data=self.input_data)
        self._call_phase_module(clean_exit=True, module_name='ModelInterface',
                                mode='predict')

        output_data = {col: [] for col in self.lmd['columns']}

        for column in self.input_data.columns:
            output_data[column] = list(self.input_data.data_frame[column])

        for predicted_col in self.lmd['predict_columns']:
            output_data[predicted_col] = list(
                self.hmd['predictions'][predicted_col])

            # Pass through optional per-prediction extras produced by the model.
            for extra_column in [f'{predicted_col}_model_confidence',
                                 f'{predicted_col}_confidence_range']:
                if extra_column in self.hmd['predictions']:
                    output_data[extra_column] = self.hmd['predictions'][extra_column]

            probabilistic_validator = unpickle_obj(
                self.hmd['probabilistic_validators'][predicted_col])

            output_data[f'{predicted_col}_confidence'] = [None] * len(
                output_data[predicted_col])
            # Keep the raw model output alongside the (possibly post-processed) value.
            output_data[f'model_{predicted_col}'] = deepcopy(
                output_data[predicted_col])

            for row_number, predicted_value in enumerate(output_data[predicted_col]):
                # Compute the feature existence vector: which non-ignored input
                # columns have a usable (non-null-looking) value in this row.
                input_columns = [
                    col for col in self.input_data.columns
                    if col not in self.lmd['predict_columns']
                ]
                features_existance_vector = [
                    False if str(output_data[col][row_number]) in
                    ('None', 'nan', '', 'Nan', 'NAN', 'NaN') else True
                    for col in input_columns
                    if col not in self.lmd['columns_to_ignore']
                ]

                # Create the probabilistic evaluation.
                probability_true_prediction = probabilistic_validator.evaluate_prediction_accuracy(
                    features_existence=features_existance_vector,
                    predicted_value=predicted_value)

                output_data[f'{predicted_col}_confidence'][row_number] = \
                    probability_true_prediction

        if mode == 'predict':
            self.output_data = PredictTransactionOutputData(
                transaction=self, data=output_data)
        else:
            nulled_out_predictions = PredictTransactionOutputData(
                transaction=self, data=output_data)

    if self.lmd['run_confidence_variation_analysis']:
        input_confidence = {}
        extra_insights = {}

        for predicted_col in self.lmd['predict_columns']:
            input_confidence[predicted_col] = {}
            extra_insights[predicted_col] = {'if_missing': []}

            actual_confidence = self.output_data[0].explanation[
                predicted_col]['confidence']

            for i, nulled_col_name in enumerate(nulled_out_columns):
                nulled_out_predicted_value = nulled_out_predictions[
                    i].explanation[predicted_col]['predicted_value']
                nulled_confidence = nulled_out_predictions[i].explanation[
                    predicted_col]['confidence']
                # NOTE: a leftover debug print() of these values was removed here.
                confidence_variation = actual_confidence - nulled_confidence

                input_confidence[predicted_col][nulled_col_name] = round(
                    confidence_variation, 3)
                extra_insights[predicted_col]['if_missing'].append(
                    {nulled_col_name: nulled_out_predicted_value})

        self.output_data.input_confidence = input_confidence
        self.output_data.extra_insights = extra_insights

    return
def _execute_predict(self):
    """Run the prediction pipeline for this transaction.

    Reloads persisted model metadata (preferring any values already set on
    this transaction), extracts and transforms the input data, runs the
    model, and scores every prediction with the per-column probabilistic
    validator. Builds ``self.output_data`` with one confidence column per
    predicted column plus the per-row evaluation objects.

    :return: None
    """
    # Snapshot current metadata so transaction-local overrides survive the reload.
    old_lmd = {}
    for k in self.lmd:
        old_lmd[k] = self.lmd[k]

    old_hmd = {}
    for k in self.hmd:
        old_hmd[k] = self.hmd[k]

    self.load_metadata()

    # Re-apply non-None transaction-local values on top of the loaded metadata;
    # keys missing from both sides are explicitly set to None.
    for k in old_lmd:
        if old_lmd[k] is not None:
            self.lmd[k] = old_lmd[k]
        else:
            if k not in self.lmd:
                self.lmd[k] = None

    for k in old_hmd:
        if old_hmd[k] is not None:
            self.hmd[k] = old_hmd[k]
        else:
            if k not in self.hmd:
                self.hmd[k] = None

    if self.lmd is None:
        self.log.error('No metadata found for this model')
        return

    self._call_phase_module(clean_exit=True, module_name='DataExtractor')

    if self.input_data.data_frame.shape[0] <= 0:
        self.log.error('No input data provided !')
        return

    self._call_phase_module(clean_exit=True, module_name='DataTransformer',
                            input_data=self.input_data)
    self._call_phase_module(clean_exit=True, module_name='ModelInterface',
                            mode='predict')

    output_data = {col: [] for col in self.lmd['columns']}
    evaluations = {}

    for column in self.input_data.data_frame.columns:
        output_data[column] = list(self.input_data.data_frame[column])

    # Loop-invariant: input columns are the same for every predicted column/row.
    input_columns = [
        col for col in self.input_data.columns
        if col not in self.lmd['predict_columns']
    ]

    for predicted_col in self.lmd['predict_columns']:
        output_data[predicted_col] = list(
            self.hmd['predictions'][predicted_col])

        probabilistic_validator = unpickle_obj(
            self.hmd['probabilistic_validators'][predicted_col])

        confidence_column_name = f'{predicted_col}_confidence'
        output_data[confidence_column_name] = [None] * len(
            output_data[predicted_col])
        evaluations[predicted_col] = [None] * len(
            output_data[predicted_col])

        for row_number, predicted_value in enumerate(output_data[predicted_col]):
            # Feature existence vector: which non-malformed input columns
            # have a non-None value in this row.
            features_existance_vector = [
                False if output_data[col][row_number] is None else True
                for col in input_columns
                if col not in self.lmd['malformed_columns']['names']
            ]

            # Create the probabilistic evaluation. The validator may return
            # either a bare float confidence or a richer evaluation object.
            prediction_evaluation = probabilistic_validator.evaluate_prediction_accuracy(
                features_existence=features_existance_vector,
                predicted_value=predicted_value)

            # isinstance (not type ==) so float subclasses / numpy floats also match.
            if isinstance(prediction_evaluation, float):
                output_data[confidence_column_name][row_number] = prediction_evaluation
                evaluations[predicted_col][row_number] = None
            else:
                output_data[confidence_column_name][row_number] = \
                    prediction_evaluation.most_likely_probability
                evaluations[predicted_col][row_number] = prediction_evaluation

    self.output_data = PredictTransactionOutputData(
        transaction=self, data=output_data, evaluations=evaluations)

    return
def _execute_predict(self):
    """Run the prediction pipeline for this transaction.

    Reloads persisted model metadata (preferring any values already set on
    this transaction), extracts the input data, runs the model, and scores
    every prediction with the per-column probabilistic validator. Fills
    ``self.output_data.data`` and ``self.output_data.evaluations``.

    :return: None
    """
    # Snapshot current metadata so transaction-local overrides survive the reload.
    old_lmd = {}
    for k in self.lmd:
        old_lmd[k] = self.lmd[k]

    old_hmd = {}
    for k in self.hmd:
        old_hmd[k] = self.hmd[k]

    self.load_metadata()

    # Re-apply non-None transaction-local values on top of the loaded metadata;
    # keys missing from both sides are explicitly set to None.
    for k in old_lmd:
        if old_lmd[k] is not None:
            self.lmd[k] = old_lmd[k]
        else:
            if k not in self.lmd:
                self.lmd[k] = None

    for k in old_hmd:
        if old_hmd[k] is not None:
            self.hmd[k] = old_hmd[k]
        else:
            if k not in self.hmd:
                self.hmd[k] = None

    if self.lmd is None:
        self.log.error('No metadata found for this model')
        return

    self._call_phase_module(clean_exit=True, module_name='DataExtractor')

    if self.input_data.data_frame.shape[0] <= 0:
        # No rows to predict on: hand the (empty) input back as the output.
        self.output_data = self.input_data
        return

    self.output_data = PredictTransactionOutputData(transaction=self)

    self._call_phase_module(clean_exit=True, module_name='ModelInterface', mode='predict')

    self.output_data.data = {col: [] for i, col in enumerate(self.input_data.columns)}
    input_columns = [col for col in self.input_data.columns if col not in self.lmd['predict_columns']]

    # Copy the input frame cell-by-cell into column-wise lists.
    for i, row in self.input_data.data_frame.iterrows():
        for index, cell in enumerate(row):
            col = self.input_data.columns[index]
            self.output_data.data[col].append(cell)

    for predicted_col in self.lmd['predict_columns']:
        probabilistic_validator = unpickle_obj(self.hmd['probabilistic_validators'][predicted_col])

        predicted_values = self.hmd['predictions'][predicted_col]
        self.output_data.data[predicted_col] = predicted_values
        confidence_column_name = "{col}_confidence".format(col=predicted_col)
        self.output_data.data[confidence_column_name] = [None] * len(predicted_values)
        self.output_data.evaluations[predicted_col] = [None] * len(predicted_values)

        for row_number, predicted_value in enumerate(predicted_values):
            # Feature existence vector: which non-malformed input columns have
            # a non-None value in this row.
            features_existance_vector = [False if self.output_data.data[col][row_number] is None else True for col in input_columns if col not in self.lmd['malformed_columns']['names']]
            prediction_evaluation = probabilistic_validator.evaluate_prediction_accuracy(features_existence=features_existance_vector,
                                                                                         predicted_value=predicted_value)
            # NOTE(review): the whole evaluation object (not a float probability)
            # is stored in the "<col>_confidence" column here — confirm downstream
            # consumers expect the object rather than e.g. most_likely_probability.
            self.output_data.data[confidence_column_name][row_number] = prediction_evaluation
            self.output_data.evaluations[predicted_col][row_number] = prediction_evaluation

    return
def _execute_predict(self):
    """Run the prediction pipeline for this transaction.

    Reloads persisted model metadata (preferring any values already set on
    this transaction), extracts and transforms the input data, runs the
    model, scores each prediction with the per-column probabilistic
    validator, and scales any raw model confidences into the validator's
    confidence range. When ``run_confidence_variation_analysis`` is
    enabled, the first input row is re-predicted once per non-null column
    with that column nulled out, and per-column confidence-variation
    scores (interpolated to [-100, 100]) are attached to
    ``self.output_data.input_confidence_arr``.

    :return: None
    """
    # Snapshot current metadata so transaction-local overrides survive the reload.
    old_lmd = {}
    for k in self.lmd:
        old_lmd[k] = self.lmd[k]

    old_hmd = {}
    for k in self.hmd:
        old_hmd[k] = self.hmd[k]

    self.load_metadata()

    # Re-apply non-None transaction-local values on top of the loaded metadata;
    # keys missing from both sides are explicitly set to None.
    for k in old_lmd:
        if old_lmd[k] is not None:
            self.lmd[k] = old_lmd[k]
        else:
            if k not in self.lmd:
                self.lmd[k] = None

    for k in old_hmd:
        if old_hmd[k] is not None:
            self.hmd[k] = old_hmd[k]
        else:
            if k not in self.hmd:
                self.hmd[k] = None

    if self.lmd is None:
        self.log.error('No metadata found for this model')
        return

    self._call_phase_module(clean_exit=True, module_name='DataExtractor')

    if self.input_data.data_frame.shape[0] <= 0:
        self.log.error('No input data provided !')
        return

    if self.lmd['model_is_time_series']:
        self._call_phase_module(clean_exit=True, module_name='DataSplitter')

    # @TODO Maybe move to a separate "PredictionAnalysis" phase ?
    if self.lmd['run_confidence_variation_analysis']:
        # Build one copy of the first input row per non-null column, with that
        # column nulled out, to measure each column's contribution to confidence.
        nulled_out_data = []
        nulled_out_columns = []
        for column in self.input_data.columns:
            # Only adapted for a single `when`
            if self.input_data.data_frame.iloc[0][column] is not None:
                nulled_out_data.append(self.input_data.data_frame.iloc[0].copy())
                nulled_out_data[-1][column] = None
                nulled_out_columns.append(column)
        nulled_out_data = pd.DataFrame(nulled_out_data)

        nulled_out_predictions = []

    for mode in ['predict', 'analyze_confidence']:
        if mode == 'analyze_confidence':
            if not self.lmd['run_confidence_variation_analysis']:
                continue
            else:
                # Second pass predicts on the nulled-out rows instead.
                self.input_data.data_frame = nulled_out_data

        self._call_phase_module(clean_exit=True, module_name='DataTransformer',
                                input_data=self.input_data)
        self._call_phase_module(clean_exit=True, module_name='ModelInterface',
                                mode='predict')

        output_data = {col: [] for col in self.lmd['columns']}
        evaluations = {}

        for column in self.input_data.columns:
            output_data[column] = list(self.input_data.data_frame[column])

        for predicted_col in self.lmd['predict_columns']:
            output_data[predicted_col] = list(
                self.hmd['predictions'][predicted_col])

            # Raw model confidences, when the backend provides them.
            if f'{predicted_col}_confidences' in self.hmd['predictions']:
                output_data[f'{predicted_col}_model_confidence'] = \
                    self.hmd['predictions'][f'{predicted_col}_confidences']

            probabilistic_validator = unpickle_obj(
                self.hmd['probabilistic_validators'][predicted_col])

            confidence_column_name = f'{predicted_col}_confidence'
            output_data[confidence_column_name] = [None] * len(
                output_data[predicted_col])
            evaluations[predicted_col] = [None] * len(
                output_data[predicted_col])
            # Keep the raw model output alongside the validator's final value.
            output_data[f'model_{predicted_col}'] = deepcopy(
                output_data[predicted_col])

            for row_number, predicted_value in enumerate(output_data[predicted_col]):
                # Compute the feature existence vector: which non-ignored input
                # columns have a non-None value in this row.
                input_columns = [
                    col for col in self.input_data.columns
                    if col not in self.lmd['predict_columns']
                ]
                features_existance_vector = [
                    False if output_data[col][row_number] is None else True
                    for col in input_columns
                    if col not in self.lmd['columns_to_ignore']
                ]

                # Create the probabilistic evaluation.
                prediction_evaluation = probabilistic_validator.evaluate_prediction_accuracy(
                    features_existence=features_existance_vector,
                    predicted_value=predicted_value)

                output_data[predicted_col][row_number] = \
                    prediction_evaluation.final_value
                output_data[confidence_column_name][row_number] = \
                    prediction_evaluation.most_likely_probability
                evaluations[predicted_col][row_number] = prediction_evaluation

            if f'{predicted_col}_model_confidence' in output_data:
                # Scale model confidence between the confidences of the
                # probabilistic validator.
                mc_arr = np.array(
                    output_data[f'{predicted_col}_model_confidence'])
                pv_min = np.min(output_data[confidence_column_name])
                pv_max = np.max(output_data[confidence_column_name])
                mc_range = np.max(mc_arr) - np.min(mc_arr)
                if mc_range == 0:
                    # All model confidences are identical: min-max scaling is
                    # undefined (0/0 would yield NaN). Map them all onto the
                    # validator's lowest confidence instead.
                    normalized_model_confidences = np.full(mc_arr.shape, pv_min)
                else:
                    normalized_model_confidences = (
                        (mc_arr - np.min(mc_arr)) / mc_range
                    ) * (pv_max - pv_min) + pv_min

                # In case the model confidence is smaller than that yielded after
                # scaling, use the model confidence directly; negative numbers are
                # replaced with zero confidence.
                for i in range(len(output_data[f'{predicted_col}_model_confidence'])):
                    if output_data[f'{predicted_col}_model_confidence'][i] < 0:
                        output_data[f'{predicted_col}_model_confidence'][i] = 0
                    output_data[f'{predicted_col}_model_confidence'][i] = min(
                        output_data[f'{predicted_col}_model_confidence'][i],
                        normalized_model_confidences[i])

        if mode == 'predict':
            self.output_data = PredictTransactionOutputData(
                transaction=self, data=output_data, evaluations=evaluations)
        else:
            nulled_out_predictions.append(
                PredictTransactionOutputData(transaction=self,
                                             data=output_data,
                                             evaluations=evaluations))

    if self.lmd['run_confidence_variation_analysis']:
        input_confidence_arr = [{}]

        for predicted_col in self.lmd['predict_columns']:
            input_confidence_arr[0][predicted_col] = {
                'column_names': [],
                'confidence_variation': []
            }

            actual_confidence = self.output_data[0].explain()[
                predicted_col][0]['confidence']

            for i in range(len(nulled_out_columns)):
                nulled_confidence = nulled_out_predictions[0][i].explain()[
                    predicted_col][0]['confidence']
                nulled_col_name = nulled_out_columns[i]
                confidence_variation = actual_confidence - nulled_confidence

                input_confidence_arr[0][predicted_col][
                    'column_names'].append(nulled_col_name)
                input_confidence_arr[0][predicted_col][
                    'confidence_variation'].append(confidence_variation)

            # Rescale the raw variations onto [-100, 100] for presentation.
            input_confidence_arr[0][predicted_col]['confidence_variation_score'] = list(
                np.interp(
                    input_confidence_arr[0][predicted_col]['confidence_variation'],
                    (np.min(input_confidence_arr[0][predicted_col]['confidence_variation']),
                     np.max(input_confidence_arr[0][predicted_col]['confidence_variation'])),
                    (-100, 100)))

        self.output_data.input_confidence_arr = input_confidence_arr

    return
def _execute_predict(self):
    """Run the prediction pipeline for this transaction.

    Reloads the light/heavy model metadata pickles from disk (preferring any
    values already set on this transaction), extracts the input data, runs
    the configured model backend, scores every prediction with the
    per-column probabilistic validator, then re-persists the updated
    metadata pickles.

    Side effects: sets ``self.output_data`` and ``self.model_backend``;
    reads and rewrites the metadata pickle files under
    ``CONFIG.MINDSDB_STORAGE_PATH``.

    :return: None
    """
    # Snapshot current metadata so transaction-local overrides survive the reload.
    old_lmd = {}
    for k in self.lmd:
        old_lmd[k] = self.lmd[k]

    old_hmd = {}
    for k in self.hmd:
        old_hmd[k] = self.hmd[k]

    # Load the persisted metadata directly from the pickle files.
    with open(os.path.join(CONFIG.MINDSDB_STORAGE_PATH, self.lmd['name'] + '_light_model_metadata.pickle'), 'rb') as fp:
        self.lmd = pickle.load(fp)

    with open(os.path.join(CONFIG.MINDSDB_STORAGE_PATH, self.hmd['name'] + '_heavy_model_metadata.pickle'), 'rb') as fp:
        self.hmd = pickle.load(fp)

    # Re-apply non-None transaction-local values on top of the loaded metadata;
    # keys missing from both sides are explicitly set to None.
    for k in old_lmd:
        if old_lmd[k] is not None:
            self.lmd[k] = old_lmd[k]
        else:
            if k not in self.lmd:
                self.lmd[k] = None

    for k in old_hmd:
        if old_hmd[k] is not None:
            self.hmd[k] = old_hmd[k]
        else:
            if k not in self.hmd:
                self.hmd[k] = None

    if self.lmd is None:
        self.log.error('No metadata found for this model')
        return

    self._call_phase_module('DataExtractor')

    # NOTE(review): this checks the width of the first row, not the row count —
    # presumably meant as an "empty input" guard; confirm against DataExtractor's
    # data_array shape.
    if len(self.input_data.data_array[0]) <= 0:
        self.output_data = self.input_data
        return

    self.output_data = PredictTransactionOutputData(transaction=self)

    # Dispatch to the configured model backend. NOTE(review): `predictions` is
    # only bound on the 'ludwig' branch — other backends would raise NameError
    # below; confirm whether other backends are expected at this revision.
    if self.lmd['model_backend'] == 'ludwig':
        self.model_backend = LudwigBackend(self)
        predictions = self.model_backend.predict()

    # self.transaction.lmd['predict_columns']
    self.output_data.data = {col: [] for i, col in enumerate(self.input_data.columns)}
    input_columns = [col for col in self.input_data.columns if col not in self.lmd['predict_columns']]

    # Copy the input rows cell-by-cell into column-wise lists.
    for row in self.input_data.data_array:
        for index, cell in enumerate(row):
            col = self.input_data.columns[index]
            self.output_data.data[col].append(cell)

    for predicted_col in self.lmd['predict_columns']:
        probabilistic_validator = unpickle_obj(self.hmd['probabilistic_validators'][predicted_col])

        predicted_values = predictions[predicted_col]
        self.output_data.data[predicted_col] = predicted_values
        confidence_column_name = "{col}_confidence".format(col=predicted_col)
        self.output_data.data[confidence_column_name] = [None] * len(predicted_values)
        self.output_data.evaluations[predicted_col] = [None] * len(predicted_values)

        for row_number, predicted_value in enumerate(predicted_values):
            # Feature existence vector: which input columns have a non-None
            # value in this row.
            features_existance_vector = [False if self.output_data.data[col][row_number] is None else True for col in input_columns]
            prediction_evaluation = probabilistic_validator.evaluate_prediction_accuracy(features_existence=features_existance_vector,
                                                                                         predicted_value=predicted_value)
            # NOTE(review): the whole evaluation object (not a float probability)
            # is stored in the "<col>_confidence" column here — confirm downstream
            # consumers expect the object rather than e.g. a probability value.
            self.output_data.data[confidence_column_name][row_number] = prediction_evaluation
            self.output_data.evaluations[predicted_col][row_number] = prediction_evaluation

    # Persist the updated metadata back to disk.
    with open(os.path.join(CONFIG.MINDSDB_STORAGE_PATH, self.lmd['name'] + '_light_model_metadata.pickle'), 'wb') as fp:
        self.lmd['updated_at'] = str(datetime.datetime.now())
        pickle.dump(self.lmd, fp)

    with open(os.path.join(CONFIG.MINDSDB_STORAGE_PATH, self.hmd['name'] + '_heavy_model_metadata.pickle'), 'wb') as fp:
        # Don't save data for now
        self.hmd['from_data'] = None
        self.hmd['test_from_data'] = None
        # Don't save data for now
        pickle.dump(self.hmd, fp)

    return