Example no. 1
    def _predict(
        self,
        data_df=None,
        data_csv=None,
        data_dict=None,
        return_type=pd.DataFrame,
        batch_size=128,
        gpus=None,
        gpu_fraction=1,
        only_predictions=True,
        logging_level=logging.ERROR,
    ):
        logging.getLogger().setLevel(logging_level)
        if logging_level in {logging.WARNING, logging.ERROR, logging.CRITICAL}:
            set_disable_progressbar(True)

        if (self.model is None or self.model_definition is None
                or self.train_set_metadata is None):
            raise ValueError('Model has not been trained or loaded')

        if data_df is None:
            data_df = self._read_data(data_csv, data_dict)

        logging.debug('Preprocessing {} datapoints'.format(len(data_df)))
        features_to_load = self.model_definition['input_features']
        if not only_predictions:
            # Note: += mutates the model definition's input feature list in
            # place; later revisions copy it with [:] first (see Example no. 4).
            features_to_load += self.model_definition['output_features']
        preprocessed_data = build_data(data_df, features_to_load,
                                       self.train_set_metadata,
                                       self.model_definition['preprocessing'])
        replace_text_feature_level(
            self.model_definition['input_features'] +
            ([] if only_predictions else
             self.model_definition['output_features']), [preprocessed_data])
        dataset = Dataset(preprocessed_data,
                          self.model_definition['input_features'],
                          [] if only_predictions else
                          self.model_definition['output_features'], None)

        logging.debug('Predicting')
        predict_results = self.model.predict(dataset,
                                             batch_size,
                                             only_predictions=only_predictions,
                                             gpus=gpus,
                                             gpu_fraction=gpu_fraction,
                                             session=getattr(
                                                 self.model, 'session', None))

        if not only_predictions:
            calculate_overall_stats(predict_results,
                                    self.model_definition['output_features'],
                                    dataset, self.train_set_metadata)

        logging.debug('Postprocessing')
        if (return_type == 'dict' or return_type == 'dictionary'
                or return_type == dict):
            postprocessed_predictions = postprocess(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)
        elif (return_type == 'dataframe' or return_type == 'df'
              or return_type == pd.DataFrame):
            postprocessed_predictions = postprocess_df(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)
        else:
            logging.warning('Unrecognized return_type: {}. '
                            'Returning DataFrame.'.format(return_type))
            postprocessed_predictions = postprocess(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)

        return postprocessed_predictions, predict_results
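
A minimal usage sketch (not part of the original source): it assumes `model` is an already-trained or loaded instance of the class defining the `_predict` method above (e.g. Ludwig's `LudwigModel`), so that `model.model`, `model.model_definition` and `model.train_set_metadata` are all set.

    import logging
    import pandas as pd

    # `model` is a placeholder for a trained/loaded instance (assumption).
    new_data = pd.DataFrame({
        'text_field_name': ['text of the first datapoint',
                            'text of the second datapoint'],
    })

    # Default return_type: predictions come back as a pandas DataFrame.
    predictions_df, raw_results = model._predict(
        data_df=new_data,
        batch_size=64,
        logging_level=logging.INFO,
    )

    # Same call, but asking for a dict of per-feature outputs instead.
    predictions_dict, _ = model._predict(data_df=new_data, return_type=dict)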
Example no. 2
    def train_online(
        self,
        data_df=None,
        data_csv=None,
        data_dict=None,
        batch_size=None,
        learning_rate=None,
        regularization_lambda=None,
        dropout_rate=None,
        bucketing_field=None,
        gpus=None,
        gpu_fraction=1,
        logging_level=logging.ERROR,
    ):
        """This function is used to perform one epoch of training of the model 
        on the specified dataset.

        # Inputs

        :param data_df: (DataFrame) dataframe containing data.
        :param data_csv: (string) input data CSV file.
        :param data_dict: (dict) input data dictionary. It is expected to
               contain one key for each field and the values have to be lists of
               the same length. Each index in the lists corresponds to one
               datapoint. For example a data set consisting of two datapoints
               with a text and a class may be provided as the following dict
               `{'text_field_name': ['text of the first datapoint', 'text of the
               second datapoint'], 'class_field_name': ['class_datapoints_1',
               'class_datapoints_2']}`.
        :param batch_size: (int) the batch size to use for training. By default 
               it's the one specified in the model definition.
        :param learning_rate: (float) the learning rate to use for training. By
               default the value is the one specified in the model definition.
        :param regularization_lambda: (float) the regularization lambda
               parameter to use for training. By default the value is the one
               specified in the model definition.
        :param dropout_rate: (float) the dropout rate to use for training. By
               default the value is the one specified in the model definition.
        :param bucketing_field: (string) the bucketing field to use for
               bucketing the data. By default the value is the one specified in
               the model definition.
        :param gpus: (string, default: `None`) list of GPUs to use (it uses the
               same syntax of CUDA_VISIBLE_DEVICES)
        :param gpu_fraction: (float, default `1.0`) fraction of GPU memory to
               initialize the process with
        :param logging_level: (int, default: `logging.ERROR`) logging level to
               use for logging. Use logging constants like `logging.DEBUG`,
               `logging.INFO` and `logging.ERROR`. By default only errors will
               be printed.

        There are three ways to provide data: by dataframes using the `data_df`
        parameter, by CSV using the `data_csv` parameter and by dictionary,
        using the `data_dict` parameter.

        The DataFrame approach uses data previously obtained and put in a
        dataframe, the CSV approach loads data from a CSV file, while the dict
        approach uses data organized by keys representing columns and values
        that are lists of the datapoints for each column. For example a data set
        consisting of two datapoints with a text and a class may be provided as
        the following dict `{'text_field_name': ['text of the first datapoint',
        'text of the second datapoint'], 'class_field_name':
        ['class_datapoints_1', 'class_datapoints_2']}`.
        """
        logging.getLogger().setLevel(logging_level)
        if logging_level in {logging.WARNING, logging.ERROR, logging.CRITICAL}:
            set_disable_progressbar(True)

        if (self.model is None or self.model_definition is None
                or self.train_set_metadata is None):
            raise ValueError('Model has not been initialized or loaded')

        if data_df is None:
            data_df = self._read_data(data_csv, data_dict)
            data_df.csv = data_csv

        if batch_size is None:
            batch_size = self.model_definition['training']['batch_size']
        if learning_rate is None:
            learning_rate = self.model_definition['training']['learning_rate']
        if regularization_lambda is None:
            regularization_lambda = self.model_definition['training'][
                'regularization_lambda']
        if dropout_rate is None:
            dropout_rate = self.model_definition['training']['dropout_rate']
        if bucketing_field is None:
            bucketing_field = self.model_definition['training'][
                'bucketing_field']

        logging.debug('Preprocessing {} datapoints'.format(len(data_df)))
        features_to_load = (self.model_definition['input_features'] +
                            self.model_definition['output_features'])
        preprocessed_data = build_data(data_df, features_to_load,
                                       self.train_set_metadata,
                                       self.model_definition['preprocessing'])
        replace_text_feature_level(
            self.model_definition['input_features'] +
            self.model_definition['output_features'], [preprocessed_data])
        dataset = Dataset(preprocessed_data,
                          self.model_definition['input_features'],
                          self.model_definition['output_features'], None)

        logging.debug('Training batch')
        self.model.train_online(dataset,
                                batch_size=batch_size,
                                learning_rate=learning_rate,
                                regularization_lambda=regularization_lambda,
                                dropout_rate=dropout_rate,
                                bucketing_field=bucketing_field,
                                gpus=gpus,
                                gpu_fraction=gpu_fraction)
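
An illustrative call of this `train_online` variant (assumption: `model` is an initialized or loaded instance of the class above). The `data_dict` layout follows the docstring: one key per field, with equal-length lists holding one entry per datapoint.

    import logging

    training_batch = {
        'text_field_name': ['text of the first datapoint',
                            'text of the second datapoint'],
        'class_field_name': ['class_datapoints_1', 'class_datapoints_2'],
    }

    model.train_online(
        data_dict=training_batch,
        learning_rate=0.001,   # overrides the value in the model definition
        logging_level=logging.INFO,
    )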
Example no. 3
    def train_online(self,
                     data_df=None,
                     data_csv=None,
                     data_dict=None,
                     batch_size=None,
                     learning_rate=None,
                     regularization_lambda=None,
                     bucketing_field=None):
        """This function is used to perform one epoch of training of the model
        on the specified dataset.

        # Inputs

        :param data_df: (DataFrame) dataframe containing data.
        :param data_csv: (string) input data CSV file.
        :param data_dict: (dict) input data dictionary. It is expected to
               contain one key for each field and the values have to be lists of
               the same length. Each index in the lists corresponds to one
               datapoint. For example a data set consisting of two datapoints
               with a text and a class may be provided as the following dict
               `{'text_field_name': ['text of the first datapoint', 'text of the
               second datapoint'], 'class_field_name': ['class_datapoints_1',
               'class_datapoints_2']}`.
        :param batch_size: (int) the batch size to use for training. By default
               it's the one specified in the model definition.
        :param learning_rate: (float) the learning rate to use for training. By
               default the value is the one specified in the model definition.
        :param regularization_lambda: (float) the regularization lambda
               parameter to use for training. By default the value is the one
               specified in the model definition.
        :param bucketing_field: (string) the bucketing field to use for
               bucketing the data. By default the value is the one specified in
               the model definition.

        There are three ways to provide data: by dataframes using the `data_df`
        parameter, by CSV using the `data_csv` parameter and by dictionary,
        using the `data_dict` parameter.

        The DataFrame approach uses data previously obtained and put in a
        dataframe, the CSV approach loads data from a CSV file, while the dict
        approach uses data organized by keys representing columns and values
        that are lists of the datapoints for each column. For example a data set
        consisting of two datapoints with a text and a class may be provided as
        the following dict `{'text_field_name': ['text of the first datapoint',
        'text of the second datapoint'], 'class_field_name':
        ['class_datapoints_1', 'class_datapoints_2']}`.
        """

        if (self.model is None or self.model_definition is None
                or self.train_set_metadata is None):
            raise ValueError('Model has not been initialized or loaded')

        if data_df is None:
            data_df = self._read_data(data_csv, data_dict)
            data_df.csv = data_csv

        if batch_size is None:
            batch_size = self.model_definition[TRAINING]['batch_size']
        if learning_rate is None:
            learning_rate = self.model_definition[TRAINING]['learning_rate']
        if regularization_lambda is None:
            regularization_lambda = self.model_definition[TRAINING][
                'regularization_lambda']
        if bucketing_field is None:
            bucketing_field = self.model_definition[TRAINING][
                'bucketing_field']

        logger.debug('Preprocessing {} datapoints'.format(len(data_df)))
        features_to_load = (self.model_definition['input_features'] +
                            self.model_definition['output_features'])
        preprocessed_data = build_data(data_df, features_to_load,
                                       self.train_set_metadata,
                                       self.model_definition['preprocessing'])
        replace_text_feature_level(
            self.model_definition['input_features'] +
            self.model_definition['output_features'], [preprocessed_data])
        dataset = Dataset(preprocessed_data,
                          self.model_definition['input_features'],
                          self.model_definition['output_features'], None)

        logger.debug('Training batch')
        self.model.train_online(dataset,
                                batch_size=batch_size,
                                learning_rate=learning_rate,
                                regularization_lambda=regularization_lambda,
                                bucketing_field=bucketing_field)
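
Both `train_online` revisions resolve hyperparameters left as `None` by falling back to the training section of the model definition. A self-contained sketch of that pattern (names here are illustrative, not the library's):

    TRAINING = 'training'  # assumed to match the key used in the examples above

    def resolve_training_params(model_definition, batch_size=None,
                                learning_rate=None):
        # Any parameter the caller leaves as None is read from the definition.
        training = model_definition[TRAINING]
        if batch_size is None:
            batch_size = training['batch_size']
        if learning_rate is None:
            learning_rate = training['learning_rate']
        return batch_size, learning_rate

    model_definition = {'training': {'batch_size': 128, 'learning_rate': 0.001}}
    print(resolve_training_params(model_definition, learning_rate=0.01))
    # (128, 0.01): batch_size falls back, learning_rate keeps the override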
Example no. 4
    def _predict(
        self,
        data_df=None,
        data_csv=None,
        data_dict=None,
        return_type=pd.DataFrame,
        batch_size=128,
        evaluate_performance=False,
        skip_save_unprocessed_output=False,
        gpus=None,
        gpu_fraction=1,
    ):

        if (self.model is None or self.model_definition is None
                or self.train_set_metadata is None):
            raise ValueError('Model has not been trained or loaded')

        if data_df is None:
            data_df = self._read_data(data_csv, data_dict)

        logger.debug('Preprocessing {} datapoints'.format(len(data_df)))
        # Copy the list with [:] instead of assigning it directly; otherwise
        # appending the output features below would mutate the input feature
        # list inside the model definition, which we definitely don't want.
        features_to_load = self.model_definition['input_features'][:]
        if evaluate_performance:
            output_features = self.model_definition['output_features']
        else:
            output_features = []
        features_to_load += output_features

        num_overrides = override_in_memory_flag(
            self.model_definition['input_features'], True)
        if num_overrides > 0:
            logger.warning(
                'Using in_memory = False is not supported for Ludwig API.')

        preprocessed_data = build_data(data_df, features_to_load,
                                       self.train_set_metadata,
                                       self.model_definition['preprocessing'])
        replace_text_feature_level(features_to_load, [preprocessed_data])
        dataset = Dataset(preprocessed_data,
                          self.model_definition['input_features'],
                          output_features, None)

        logger.debug('Predicting')
        predict_results = self.model.predict(
            dataset,
            batch_size,
            evaluate_performance=evaluate_performance,
            gpus=gpus,
            gpu_fraction=gpu_fraction,
            session=getattr(self.model, 'session', None))

        if evaluate_performance:
            calculate_overall_stats(predict_results,
                                    self.model_definition['output_features'],
                                    dataset, self.train_set_metadata)

        logger.debug('Postprocessing')
        if (return_type == 'dict' or return_type == 'dictionary'
                or return_type == dict):
            postprocessed_predictions = postprocess(
                predict_results,
                self.model_definition['output_features'],
                self.train_set_metadata,
                experiment_dir_name=self.exp_dir_name,
                skip_save_unprocessed_output=skip_save_unprocessed_output,
            )
        elif (return_type == 'dataframe' or return_type == 'df'
              or return_type == pd.DataFrame):
            postprocessed_predictions = postprocess_df(
                predict_results,
                self.model_definition['output_features'],
                self.train_set_metadata,
                experiment_dir_name=self.exp_dir_name,
                skip_save_unprocessed_output=skip_save_unprocessed_output,
            )
        else:
            logger.warning('Unrecognized return_type: {}. '
                           'Returning DataFrame.'.format(return_type))
            postprocessed_predictions = postprocess(
                predict_results,
                self.model_definition['output_features'],
                self.train_set_metadata,
                experiment_dir_name=self.exp_dir_name,
                skip_save_unprocessed_output=skip_save_unprocessed_output,
            )

        return postprocessed_predictions, predict_results
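
The comment about `[:]` in this example describes a real aliasing pitfall; here is a tiny standalone illustration of the difference between assigning the list and copying it:

    # With plain assignment, += extends the original list inside the definition.
    definition = {'input_features': ['text'], 'output_features': ['class']}
    features = definition['input_features']           # same list object
    features += definition['output_features']
    print(definition['input_features'])               # ['text', 'class'] - mutated

    # With a [:] copy, the model definition stays untouched.
    definition = {'input_features': ['text'], 'output_features': ['class']}
    features = definition['input_features'][:]        # shallow copy
    features += definition['output_features']
    print(definition['input_features'])               # ['text'] - unchanged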
Example no. 5
    def _predict(
        self,
        data_df=None,
        data_csv=None,
        data_dict=None,
        return_type=pd.DataFrame,
        batch_size=128,
        gpus=None,
        gpu_fraction=1,
        evaluate_performance=False,
        logging_level=logging.ERROR,
    ):
        logging.getLogger('ludwig').setLevel(logging_level)
        if logging_level in {logging.WARNING, logging.ERROR, logging.CRITICAL}:
            set_disable_progressbar(True)

        if (self.model is None or self.model_definition is None
                or self.train_set_metadata is None):
            raise ValueError('Model has not been trained or loaded')

        if data_df is None:
            data_df = self._read_data(data_csv, data_dict)

        logger.debug('Preprocessing {} datapoints'.format(len(data_df)))
        # Copy the list with [:] instead of assigning it directly; otherwise
        # appending the output features below would mutate the input feature
        # list inside the model definition, which we definitely don't want.
        features_to_load = self.model_definition['input_features'][:]
        if evaluate_performance:
            output_features = self.model_definition['output_features']
        else:
            output_features = []
        features_to_load += output_features

        preprocessed_data = build_data(data_df, features_to_load,
                                       self.train_set_metadata,
                                       self.model_definition['preprocessing'])
        replace_text_feature_level(features_to_load, [preprocessed_data])
        dataset = Dataset(preprocessed_data,
                          self.model_definition['input_features'],
                          output_features, None)

        logger.debug('Predicting')
        predict_results = self.model.predict(
            dataset,
            batch_size,
            evaluate_performance=evaluate_performance,
            gpus=gpus,
            gpu_fraction=gpu_fraction,
            session=getattr(self.model, 'session', None))

        if evaluate_performance:
            calculate_overall_stats(predict_results,
                                    self.model_definition['output_features'],
                                    dataset, self.train_set_metadata)

        logger.debug('Postprocessing')
        if (return_type == 'dict' or return_type == 'dictionary'
                or return_type == dict):
            postprocessed_predictions = postprocess(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)
        elif (return_type == 'dataframe' or return_type == 'df'
              or return_type == pd.DataFrame):
            postprocessed_predictions = postprocess_df(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)
        else:
            logger.warning('Unrecognized return_type: {}. '
                           'Returning DataFrame.'.format(return_type))
            postprocessed_predictions = postprocess(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)

        return postprocessed_predictions, predict_results
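
The `return_type` dispatch in these `_predict` revisions accepts both strings and Python types, defaulting to a DataFrame for anything unrecognized. A compact sketch of an equivalent normalization helper (illustrative only, not part of the library):

    import pandas as pd

    def normalize_return_type(return_type):
        """Map the accepted spellings of return_type onto dict or pd.DataFrame,
        falling back to DataFrame for unrecognized values, as the code above does."""
        if return_type in ('dict', 'dictionary', dict):
            return dict
        if return_type in ('dataframe', 'df', pd.DataFrame):
            return pd.DataFrame
        return pd.DataFrame

    assert normalize_return_type('dictionary') is dict
    assert normalize_return_type(pd.DataFrame) is pd.DataFrame
    assert normalize_return_type('something_else') is pd.DataFrame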