def _predict(
        self,
        data_df=None,
        data_csv=None,
        data_dict=None,
        return_type=pd.DataFrame,
        batch_size=128,
        evaluate_performance=False,
        skip_save_unprocessed_output=False,
        gpus=None,
        gpu_fraction=1,
):
    """Run the trained model over new data and postprocess the results.

    Exactly one of data_df / data_csv / data_dict supplies the data; when
    data_df is None the other two are read via self._read_data.

    :param return_type: dict-like ('dict', 'dictionary', dict) or
           dataframe-like ('dataframe', 'df', pd.DataFrame) selector for
           the postprocessed output format.
    :param batch_size: prediction batch size.
    :param evaluate_performance: when True, output features are also
           loaded and overall stats are computed against ground truth.
    :param skip_save_unprocessed_output: forwarded to postprocessing.
    :param gpus: GPUs to use for inference.
    :param gpu_fraction: per-GPU memory fraction.
    :returns: (postprocessed_predictions, predict_results) tuple.
    :raises ValueError: if no trained model has been loaded.
    """
    if (self.model is None or self.model_definition is None
            or self.train_set_metadata is None):
        raise ValueError('Model has not been trained or loaded')

    if data_df is None:
        data_df = self._read_data(data_csv, data_dict)

    logger.debug('Preprocessing {} datapoints'.format(len(data_df)))

    # Copy the list: appending output features to the original would
    # mutate self.model_definition['input_features'] in place.
    features_to_load = list(self.model_definition['input_features'])
    output_features = (self.model_definition['output_features']
                       if evaluate_performance else [])
    features_to_load.extend(output_features)

    # in_memory = False is not supported through this API; force it on
    # and tell the user if anything had to be overridden.
    num_overrides = override_in_memory_flag(
        self.model_definition['input_features'], True)
    if num_overrides > 0:
        logger.warning(
            'Using in_memory = False is not supported for Ludwig API.')

    preprocessed_data = build_data(
        data_df,
        features_to_load,
        self.train_set_metadata,
        self.model_definition['preprocessing'])
    replace_text_feature_level(features_to_load, [preprocessed_data])
    dataset = Dataset(
        preprocessed_data,
        self.model_definition['input_features'],
        output_features,
        None)

    logger.debug('Predicting')
    predict_results = self.model.predict(
        dataset,
        batch_size,
        evaluate_performance=evaluate_performance,
        gpus=gpus,
        gpu_fraction=gpu_fraction,
        session=getattr(self.model, 'session', None))

    if evaluate_performance:
        calculate_overall_stats(
            predict_results,
            self.model_definition['output_features'],
            dataset,
            self.train_set_metadata)

    logger.debug('Postprocessing')
    # Pick the postprocessing function once, then make a single call.
    if return_type in ('dict', 'dictionary', dict):
        postprocess_fn = postprocess
    elif return_type in ('dataframe', 'df', pd.DataFrame):
        postprocess_fn = postprocess_df
    else:
        logger.warning('Unrecognized return_type: {}. '
                       'Returning DataFrame.'.format(return_type))
        # NOTE(review): the fallback uses postprocess (dict-style) even
        # though the message mentions DataFrame — confirm intended.
        postprocess_fn = postprocess

    postprocessed_predictions = postprocess_fn(
        predict_results,
        self.model_definition['output_features'],
        self.train_set_metadata,
        experiment_dir_name=self.exp_dir_name,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
    )

    return postprocessed_predictions, predict_results
def _predict(
        self,
        data_df=None,
        data_csv=None,
        data_dict=None,
        return_type=pd.DataFrame,
        batch_size=128,
        gpus=None,
        gpu_fraction=1,
        only_predictions=True,
        logging_level=logging.ERROR,
):
    """Run the trained model over new data and postprocess the results.

    Exactly one of data_df / data_csv / data_dict supplies the data; when
    data_df is None the other two are read via self._read_data.

    :param return_type: dict-like ('dict', 'dictionary', dict) or
           dataframe-like ('dataframe', 'df', pd.DataFrame) selector for
           the postprocessed output format.
    :param batch_size: prediction batch size.
    :param gpus: GPUs to use for inference.
    :param gpu_fraction: per-GPU memory fraction.
    :param only_predictions: when False, output features are also loaded
           and overall stats are computed against ground truth.
    :param logging_level: level applied for the duration of the call;
           WARNING or above also disables progress bars.
    :returns: (postprocessed_predictions, predict_results) tuple.
    :raises ValueError: if no trained model has been loaded.
    """
    # NOTE(review): this sets the level on the *root* logger, which
    # affects the whole host application — consider scoping it to
    # logging.getLogger('ludwig') instead.
    logging.getLogger().setLevel(logging_level)

    if logging_level in {logging.WARNING, logging.ERROR, logging.CRITICAL}:
        set_disable_progressbar(True)

    if (self.model is None or self.model_definition is None
            or self.train_set_metadata is None):
        raise ValueError('Model has not been trained or loaded')

    if data_df is None:
        data_df = self._read_data(data_csv, data_dict)

    logging.debug('Preprocessing {} datapoints'.format(len(data_df)))

    # BUGFIX: copy the list ([:]) before extending it. The previous code
    # aliased self.model_definition['input_features'] and then mutated it
    # with +=, permanently appending the output features to the model
    # definition whenever only_predictions was False.
    features_to_load = self.model_definition['input_features'][:]
    output_features = ([] if only_predictions
                       else self.model_definition['output_features'])
    features_to_load += output_features

    preprocessed_data = build_data(
        data_df,
        features_to_load,
        self.train_set_metadata,
        self.model_definition['preprocessing'])
    replace_text_feature_level(features_to_load, [preprocessed_data])
    dataset = Dataset(
        preprocessed_data,
        self.model_definition['input_features'],
        output_features,
        None)

    logging.debug('Predicting')
    predict_results = self.model.predict(
        dataset,
        batch_size,
        only_predictions=only_predictions,
        gpus=gpus,
        gpu_fraction=gpu_fraction,
        session=getattr(self.model, 'session', None))

    if not only_predictions:
        calculate_overall_stats(
            predict_results,
            self.model_definition['output_features'],
            dataset,
            self.train_set_metadata)

    logging.debug('Postprocessing')
    if (return_type == 'dict' or return_type == 'dictionary'
            or return_type == dict):
        postprocessed_predictions = postprocess(
            predict_results,
            self.model_definition['output_features'],
            self.train_set_metadata)
    elif (return_type == 'dataframe' or return_type == 'df'
          or return_type == pd.DataFrame):
        postprocessed_predictions = postprocess_df(
            predict_results,
            self.model_definition['output_features'],
            self.train_set_metadata)
    else:
        logging.warning('Unrecognized return_type: {}. '
                        'Returning DataFrame.'.format(return_type))
        # NOTE(review): the fallback uses postprocess (dict-style) even
        # though the message mentions DataFrame — confirm intended.
        postprocessed_predictions = postprocess(
            predict_results,
            self.model_definition['output_features'],
            self.train_set_metadata)

    return postprocessed_predictions, predict_results
def _predict(
        self,
        data_df=None,
        data_csv=None,
        data_dict=None,
        return_type=pd.DataFrame,
        batch_size=128,
        gpus=None,
        gpu_fraction=1,
        evaluate_performance=False,
        logging_level=logging.ERROR,
):
    """Run the trained model over new data and postprocess the results.

    Exactly one of data_df / data_csv / data_dict supplies the data; when
    data_df is None the other two are read via self._read_data.

    :param return_type: dict-like ('dict', 'dictionary', dict) or
           dataframe-like ('dataframe', 'df', pd.DataFrame) selector for
           the postprocessed output format.
    :param batch_size: prediction batch size.
    :param gpus: GPUs to use for inference.
    :param gpu_fraction: per-GPU memory fraction.
    :param evaluate_performance: when True, output features are also
           loaded and overall stats are computed against ground truth.
    :param logging_level: level applied to the 'ludwig' logger; WARNING
           or above also disables progress bars.
    :returns: (postprocessed_predictions, predict_results) tuple.
    :raises ValueError: if no trained model has been loaded.
    """
    logging.getLogger('ludwig').setLevel(logging_level)
    if logging_level in {logging.WARNING, logging.ERROR, logging.CRITICAL}:
        set_disable_progressbar(True)

    if (self.model is None or self.model_definition is None
            or self.train_set_metadata is None):
        raise ValueError('Model has not been trained or loaded')

    if data_df is None:
        data_df = self._read_data(data_csv, data_dict)

    logger.debug('Preprocessing {} datapoints'.format(len(data_df)))

    # Work on a copy: appending output features to the original would
    # mutate self.model_definition['input_features'] in place.
    features_to_load = list(self.model_definition['input_features'])
    output_features = (self.model_definition['output_features']
                       if evaluate_performance else [])
    features_to_load.extend(output_features)

    preprocessed_data = build_data(
        data_df,
        features_to_load,
        self.train_set_metadata,
        self.model_definition['preprocessing'])
    replace_text_feature_level(features_to_load, [preprocessed_data])
    dataset = Dataset(
        preprocessed_data,
        self.model_definition['input_features'],
        output_features,
        None)

    logger.debug('Predicting')
    predict_results = self.model.predict(
        dataset,
        batch_size,
        evaluate_performance=evaluate_performance,
        gpus=gpus,
        gpu_fraction=gpu_fraction,
        session=getattr(self.model, 'session', None))

    if evaluate_performance:
        calculate_overall_stats(
            predict_results,
            self.model_definition['output_features'],
            dataset,
            self.train_set_metadata)

    logger.debug('Postprocessing')
    # Choose the postprocessing function once, then make a single call.
    if return_type in ('dict', 'dictionary', dict):
        selected_postprocess = postprocess
    elif return_type in ('dataframe', 'df', pd.DataFrame):
        selected_postprocess = postprocess_df
    else:
        logger.warning('Unrecognized return_type: {}. '
                       'Returning DataFrame.'.format(return_type))
        # NOTE(review): the fallback uses postprocess (dict-style) even
        # though the message mentions DataFrame — confirm intended.
        selected_postprocess = postprocess

    postprocessed_predictions = selected_postprocess(
        predict_results,
        self.model_definition['output_features'],
        self.train_set_metadata)

    return postprocessed_predictions, predict_results