Example #1
    def _predict(
        self,
        data_df=None,
        data_csv=None,
        data_dict=None,
        return_type=pd.DataFrame,
        batch_size=128,
        evaluate_performance=False,
        skip_save_unprocessed_output=False,
        gpus=None,
        gpu_fraction=1,
    ):

        if (self.model is None or self.model_definition is None
                or self.train_set_metadata is None):
            raise ValueError('Model has not been trained or loaded')

        if data_df is None:
            data_df = self._read_data(data_csv, data_dict)

        logger.debug('Preprocessing {} datapoints'.format(len(data_df)))
        # Copy the list with [:] rather than aliasing it; otherwise appending
        # the output features below would mutate the model definition's
        # input feature list in place.
        features_to_load = self.model_definition['input_features'][:]
        if evaluate_performance:
            output_features = self.model_definition['output_features']
        else:
            output_features = []
        features_to_load += output_features

        num_overrides = override_in_memory_flag(
            self.model_definition['input_features'], True)
        if num_overrides > 0:
            logger.warning(
                'Using in_memory = False is not supported for Ludwig API.')

        preprocessed_data = build_data(data_df, features_to_load,
                                       self.train_set_metadata,
                                       self.model_definition['preprocessing'])
        replace_text_feature_level(features_to_load, [preprocessed_data])
        dataset = Dataset(preprocessed_data,
                          self.model_definition['input_features'],
                          output_features, None)

        logger.debug('Predicting')
        predict_results = self.model.predict(
            dataset,
            batch_size,
            evaluate_performance=evaluate_performance,
            gpus=gpus,
            gpu_fraction=gpu_fraction,
            session=getattr(self.model, 'session', None))

        if evaluate_performance:
            calculate_overall_stats(predict_results,
                                    self.model_definition['output_features'],
                                    dataset, self.train_set_metadata)

        logger.debug('Postprocessing')
        if (return_type == 'dict' or return_type == 'dictionary'
                or return_type == dict):
            postprocessed_predictions = postprocess(
                predict_results,
                self.model_definition['output_features'],
                self.train_set_metadata,
                experiment_dir_name=self.exp_dir_name,
                skip_save_unprocessed_output=skip_save_unprocessed_output,
            )
        elif (return_type == 'dataframe' or return_type == 'df'
              or return_type == pd.DataFrame):
            postprocessed_predictions = postprocess_df(
                predict_results,
                self.model_definition['output_features'],
                self.train_set_metadata,
                experiment_dir_name=self.exp_dir_name,
                skip_save_unprocessed_output=skip_save_unprocessed_output,
            )
        else:
            logger.warning('Unrecognized return_type: {}. '
                           'Returning DataFrame.'.format(return_type))
            # Fall back to a DataFrame, as the warning above states.
            postprocessed_predictions = postprocess_df(
                predict_results,
                self.model_definition['output_features'],
                self.train_set_metadata,
                experiment_dir_name=self.exp_dir_name,
                skip_save_unprocessed_output=skip_save_unprocessed_output,
            )

        return postprocessed_predictions, predict_results
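
For context, this private helper is normally reached through the public prediction API of the surrounding LudwigModel class rather than being called directly. The snippet below is a minimal usage sketch, not part of the example above: it assumes a Ludwig 0.2-era API in which LudwigModel.load() restores a trained model and predict() forwards its keyword arguments to _predict() and returns just the postprocessed predictions; the model path and feature name are placeholders.

import pandas as pd
from ludwig.api import LudwigModel

# Placeholder path and feature name, for illustration only.
model = LudwigModel.load('results/experiment_run/model')
input_df = pd.DataFrame({'text': ['first document', 'second document']})

# Public entry point; internally it delegates to _predict() above.
predictions = model.predict(data_df=input_df, return_type=pd.DataFrame)
print(predictions.head())

model.close()  # releases the underlying TensorFlow session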
Example #2
    def _predict(
        self,
        data_df=None,
        data_csv=None,
        data_dict=None,
        return_type=pd.DataFrame,
        batch_size=128,
        gpus=None,
        gpu_fraction=1,
        only_predictions=True,
        logging_level=logging.ERROR,
    ):
        logging.getLogger().setLevel(logging_level)
        if logging_level in {logging.WARNING, logging.ERROR, logging.CRITICAL}:
            set_disable_progressbar(True)

        if (self.model is None or self.model_definition is None
                or self.train_set_metadata is None):
            raise ValueError('Model has not been trained or loaded')

        if data_df is None:
            data_df = self._read_data(data_csv, data_dict)

        logging.debug('Preprocessing {} datapoints'.format(len(data_df)))
        # Copy the list with [:] so that appending the output features below
        # does not mutate the model definition's input feature list in place.
        features_to_load = self.model_definition['input_features'][:]
        if not only_predictions:
            features_to_load += self.model_definition['output_features']
        preprocessed_data = build_data(data_df, features_to_load,
                                       self.train_set_metadata,
                                       self.model_definition['preprocessing'])
        replace_text_feature_level(
            self.model_definition['input_features'] +
            ([] if only_predictions else
             self.model_definition['output_features']), [preprocessed_data])
        dataset = Dataset(preprocessed_data,
                          self.model_definition['input_features'],
                          [] if only_predictions else
                          self.model_definition['output_features'], None)

        logging.debug('Predicting')
        predict_results = self.model.predict(dataset,
                                             batch_size,
                                             only_predictions=only_predictions,
                                             gpus=gpus,
                                             gpu_fraction=gpu_fraction,
                                             session=getattr(
                                                 self.model, 'session', None))

        if not only_predictions:
            calculate_overall_stats(predict_results,
                                    self.model_definition['output_features'],
                                    dataset, self.train_set_metadata)

        logging.debug('Postprocessing')
        if (return_type == 'dict' or return_type == 'dictionary'
                or return_type == dict):
            postprocessed_predictions = postprocess(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)
        elif (return_type == 'dataframe' or return_type == 'df'
              or return_type == pd.DataFrame):
            postprocessed_predictions = postprocess_df(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)
        else:
            logging.warning('Unrecognized return_type: {}. '
                            'Returning DataFrame.'.format(return_type))
            # Fall back to a DataFrame, as the warning above states.
            postprocessed_predictions = postprocess_df(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)

        return postprocessed_predictions, predict_results
Example #3
    def _predict(
        self,
        data_df=None,
        data_csv=None,
        data_dict=None,
        return_type=pd.DataFrame,
        batch_size=128,
        gpus=None,
        gpu_fraction=1,
        evaluate_performance=False,
        logging_level=logging.ERROR,
    ):
        logging.getLogger('ludwig').setLevel(logging_level)
        if logging_level in {logging.WARNING, logging.ERROR, logging.CRITICAL}:
            set_disable_progressbar(True)

        if (self.model is None or self.model_definition is None
                or self.train_set_metadata is None):
            raise ValueError('Model has not been trained or loaded')

        if data_df is None:
            data_df = self._read_data(data_csv, data_dict)

        logger.debug('Preprocessing {} datapoints'.format(len(data_df)))
        # Copy the list with [:] rather than aliasing it; otherwise appending
        # the output features below would mutate the model definition's
        # input feature list in place.
        features_to_load = self.model_definition['input_features'][:]
        if evaluate_performance:
            output_features = self.model_definition['output_features']
        else:
            output_features = []
        features_to_load += output_features

        preprocessed_data = build_data(data_df, features_to_load,
                                       self.train_set_metadata,
                                       self.model_definition['preprocessing'])
        replace_text_feature_level(features_to_load, [preprocessed_data])
        dataset = Dataset(preprocessed_data,
                          self.model_definition['input_features'],
                          output_features, None)

        logger.debug('Predicting')
        predict_results = self.model.predict(
            dataset,
            batch_size,
            evaluate_performance=evaluate_performance,
            gpus=gpus,
            gpu_fraction=gpu_fraction,
            session=getattr(self.model, 'session', None))

        if evaluate_performance:
            calculate_overall_stats(predict_results,
                                    self.model_definition['output_features'],
                                    dataset, self.train_set_metadata)

        logger.debug('Postprocessing')
        if (return_type == 'dict' or return_type == 'dictionary'
                or return_type == dict):
            postprocessed_predictions = postprocess(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)
        elif (return_type == 'dataframe' or return_type == 'df'
              or return_type == pd.DataFrame):
            postprocessed_predictions = postprocess_df(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)
        else:
            logger.warning('Unrecognized return_type: {}. '
                           'Returning DataFrame.'.format(return_type))
            # Fall back to a DataFrame, as the warning above states.
            postprocessed_predictions = postprocess_df(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)

        return postprocessed_predictions, predict_results
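
As a rough illustration of the return_type dispatch at the end of this variant, the call below requests dictionary output. The feature name and values are hypothetical, and calling the private method directly on an already trained or loaded model instance is done here purely for illustration.

import logging

# `model` stands in for an already trained/loaded LudwigModel instance.
preds_dict, raw_results = model._predict(
    data_dict={'text': ['one more document']},  # placeholder feature/values
    return_type=dict,             # routed through postprocess(), not postprocess_df()
    evaluate_performance=False,   # no ground-truth output columns required
    logging_level=logging.ERROR,  # also disables the progress bar (see check above)
)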