Example #1
    def set_logging_level(logging_level):
        """
        :param logging_level: Set/Update the logging level. Use logging
        constants like `logging.DEBUG`, `logging.INFO` and `logging.ERROR`.

        :return: None
        """
        logging.getLogger('ludwig').setLevel(logging_level)
        if logging_level in {logging.WARNING, logging.ERROR, logging.CRITICAL}:
            set_disable_progressbar(True)
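
A minimal usage sketch; whether this helper is exposed as a module-level function or as a static method on `LudwigModel` depends on the release, so the call form below is an assumption:

```python
import logging

from ludwig.api import LudwigModel

# Assumption: in this version the helper is reachable as a static method on
# LudwigModel; in other versions it may be a module-level function.
LudwigModel.set_logging_level(logging.INFO)

# WARNING and above also disable the progress bars, via
# set_disable_progressbar(True) as shown in the snippet above.
LudwigModel.set_logging_level(logging.WARNING)
```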
Example #2
    def load(model_dir, logging_level=logging.ERROR):
        """This function allows for loading pretrained models


        # Inputs

        :param model_dir: (string) path to the directory containing the model.
               If the model was trained by the `train` or `experiment` command,
               the model is in `results_dir/experiment_dir/model`.
        :param logging_level: (int, default: `logging.ERROR`) logging level to
               use for logging. Use logging constants like `logging.DEBUG`,
               `logging.INFO` and `logging.ERROR`. By default only errors will
               be printed.


        # Return

        :return: (LudwigModel) a LudwigModel object


        # Example usage

        ```python
        ludwig_model = LudwigModel.load(model_dir)
        ```

        """

        logging.getLogger().setLevel(logging_level)
        if logging_level in {logging.WARNING, logging.ERROR, logging.CRITICAL}:
            set_disable_progressbar(True)

        model, model_definition = load_model_and_definition(model_dir)
        ludwig_model = LudwigModel(model_definition)
        ludwig_model.model = model
        ludwig_model.train_set_metadata = load_metadata(
            os.path.join(
                model_dir,
                TRAIN_SET_METADATA_FILE_NAME
            )
        )
        return ludwig_model
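
A slightly fuller version of the docstring's usage example; the directory, the CSV path and the follow-up `predict` call are illustrative assumptions about how the loaded model is typically used:

```python
import logging

from ludwig.api import LudwigModel

# Directory produced by a previous `train`/`experiment` run (illustrative path).
model_dir = 'results/experiment_run/model'

# Load the pretrained model; INFO prints progress instead of only errors.
ludwig_model = LudwigModel.load(model_dir, logging_level=logging.INFO)

# The loaded model can then be used for prediction, assuming the public
# `predict` method of this API version (illustrative CSV path).
predictions = ludwig_model.predict(data_csv='new_data.csv')
```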
Example #3
    def _predict(
        self,
        data_df=None,
        data_csv=None,
        data_dict=None,
        return_type=pd.DataFrame,
        batch_size=128,
        gpus=None,
        gpu_fraction=1,
        only_predictions=True,
        logging_level=logging.ERROR,
    ):
        logging.getLogger().setLevel(logging_level)
        if logging_level in {logging.WARNING, logging.ERROR, logging.CRITICAL}:
            set_disable_progressbar(True)

        if (self.model is None or self.model_definition is None
                or self.train_set_metadata is None):
            raise ValueError('Model has not been trained or loaded')

        if data_df is None:
            data_df = self._read_data(data_csv, data_dict)

        logging.debug('Preprocessing {} datapoints'.format(len(data_df)))
        # copy the list so that += below does not mutate the input feature list
        features_to_load = self.model_definition['input_features'][:]
        if not only_predictions:
            features_to_load += self.model_definition['output_features']
        preprocessed_data = build_data(data_df, features_to_load,
                                       self.train_set_metadata,
                                       self.model_definition['preprocessing'])
        replace_text_feature_level(
            self.model_definition['input_features'] +
            ([] if only_predictions else
             self.model_definition['output_features']), [preprocessed_data])
        dataset = Dataset(preprocessed_data,
                          self.model_definition['input_features'],
                          [] if only_predictions else
                          self.model_definition['output_features'], None)

        logging.debug('Predicting')
        predict_results = self.model.predict(dataset,
                                             batch_size,
                                             only_predictions=only_predictions,
                                             gpus=gpus,
                                             gpu_fraction=gpu_fraction,
                                             session=getattr(
                                                 self.model, 'session', None))

        if not only_predictions:
            calculate_overall_stats(predict_results,
                                    self.model_definition['output_features'],
                                    dataset, self.train_set_metadata)

        logging.debug('Postprocessing')
        if (return_type == 'dict' or return_type == 'dictionary'
                or return_type == dict):
            postprocessed_predictions = postprocess(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)
        elif (return_type == 'dataframe' or return_type == 'df'
              or return_type == pd.DataFrame):
            postprocessed_predictions = postprocess_df(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)
        else:
            logging.warning('Unrecognized return_type: {}. '
                            'Returning DataFrame.'.format(return_type))
            postprocessed_predictions = postprocess_df(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)

        return postprocessed_predictions, predict_results
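
`_predict` is an internal method, normally reached through the public prediction wrappers; the sketch below only illustrates the input formats and the `return_type` aliases handled by the branches above. The feature name and the pre-existing `ludwig_model` object are assumptions:

```python
import pandas as pd

# `ludwig_model` is a trained or loaded LudwigModel (see the `load` example above).
# Dict input: one key per input feature, parallel lists of datapoints.
data_dict = {'text_field_name': ['first document', 'second document']}

# return_type accepts pd.DataFrame / 'dataframe' / 'df' as well as dict /
# 'dict' / 'dictionary'; anything else falls back to a DataFrame with a warning.
df_preds, raw_results = ludwig_model._predict(data_dict=data_dict,
                                              return_type=pd.DataFrame)
dict_preds, _ = ludwig_model._predict(data_dict=data_dict, return_type='dict')
```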
Example #4
    def train_online(
        self,
        data_df=None,
        data_csv=None,
        data_dict=None,
        batch_size=None,
        learning_rate=None,
        regularization_lambda=None,
        dropout_rate=None,
        bucketing_field=None,
        gpus=None,
        gpu_fraction=1,
        logging_level=logging.ERROR,
    ):
        """This function is used to perform one epoch of training of the model 
        on the specified dataset.

        # Inputs

        :param data_df: (DataFrame) dataframe containing data.
        :param data_csv: (string) input data CSV file.
        :param data_dict: (dict) input data dictionary. It is expected to
               contain one key for each field and the values have to be lists of
               the same length. Each index in the lists corresponds to one
               datapoint. For example a data set consisting of two datapoints
               with a text and a class may be provided as the following dict:
               `{'text_field_name': ['text of the first datapoint', 'text of the
               second datapoint'], 'class_field_name': ['class_datapoint_1',
               'class_datapoint_2']}`.
        :param batch_size: (int) the batch size to use for training. By default 
               it's the one specified in the model definition.
        :param learning_rate: (float) the learning rate to use for training. By
               default the value is the one specified in the model definition.
        :param regularization_lambda: (float) the regularization lambda
               parameter to use for training. By default the value is the one
               specified in the model definition.
        :param dropout_rate: (float) the dropout rate to use for training. By
               default the value is the one specified in the model definition.
        :param bucketing_field: (string) the bucketing field to use for
               bucketing the data. By default the value is the one specified in
               the model definition.
        :param gpus: (string, default: `None`) list of GPUs to use (it uses the
               same syntax as CUDA_VISIBLE_DEVICES)
        :param gpu_fraction: (float, default `1.0`) fraction of GPU memory to
               initialize the process with
        :param logging_level: (int, default: `logging.ERROR`) logging level to
               use for logging. Use logging constants like `logging.DEBUG`,
               `logging.INFO` and `logging.ERROR`. By default only errors will
               be printed.

        There are three ways to provide data: by dataframes using the `data_df`
        parameter, by CSV using the `data_csv` parameter and by dictionary,
        using the `data_dict` parameter.

        The DataFrame approach uses data previously obtained and put in a
        dataframe, the CSV approach loads data from a CSV file, while the dict
        approach uses data organized by keys representing columns and values
        that are lists of the datapoints for each. For example a data set
        consisting of two datapoints with a text and a class may be provided as
        the following dict: `{'text_field_name': ['text of the first datapoint',
        'text of the second datapoint'], 'class_field_name':
        ['class_datapoint_1', 'class_datapoint_2']}`.
        """
        logging.getLogger().setLevel(logging_level)
        if logging_level in {logging.WARNING, logging.ERROR, logging.CRITICAL}:
            set_disable_progressbar(True)

        if (self.model is None or self.model_definition is None
                or self.train_set_metadata is None):
            raise ValueError('Model has not been initialized or loaded')

        if data_df is None:
            data_df = self._read_data(data_csv, data_dict)
            data_df.csv = data_csv

        if batch_size is None:
            batch_size = self.model_definition['training']['batch_size']
        if learning_rate is None:
            learning_rate = self.model_definition['training']['learning_rate']
        if regularization_lambda is None:
            regularization_lambda = self.model_definition['training'][
                'regularization_lambda']
        if dropout_rate is None:
            dropout_rate = self.model_definition['training']['dropout_rate']
        if bucketing_field is None:
            bucketing_field = self.model_definition['training'][
                'bucketing_field']

        logging.debug('Preprocessing {} datapoints'.format(len(data_df)))
        features_to_load = (self.model_definition['input_features'] +
                            self.model_definition['output_features'])
        preprocessed_data = build_data(data_df, features_to_load,
                                       self.train_set_metadata,
                                       self.model_definition['preprocessing'])
        replace_text_feature_level(
            self.model_definition['input_features'] +
            self.model_definition['output_features'], [preprocessed_data])
        dataset = Dataset(preprocessed_data,
                          self.model_definition['input_features'],
                          self.model_definition['output_features'], None)

        logging.debug('Training batch')
        self.model.train_online(dataset,
                                batch_size=batch_size,
                                learning_rate=learning_rate,
                                regularization_lambda=regularization_lambda,
                                dropout_rate=dropout_rate,
                                bucketing_field=bucketing_field,
                                gpus=gpus,
                                gpu_fraction=gpu_fraction)
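
A minimal sketch of one online-training step using the dict format described above; `ludwig_model` must already hold an initialized or loaded model (see `initialize_model` and `load`), and the feature names are illustrative:

```python
# One key per feature, parallel lists of datapoints (illustrative names).
data_dict = {
    'text_field_name': ['text of the first datapoint',
                        'text of the second datapoint'],
    'class_field_name': ['class_datapoint_1', 'class_datapoint_2'],
}

# One epoch over these datapoints; only batch_size is overridden here, every
# other hyperparameter falls back to the value in the model definition.
ludwig_model.train_online(data_dict=data_dict, batch_size=2)
```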
Example #5
    def initialize_model(self,
                         train_set_metadata=None,
                         train_set_metadata_json=None,
                         gpus=None,
                         gpu_fraction=1,
                         random_seed=default_random_seed,
                         logging_level=logging.ERROR,
                         debug=False,
                         **kwargs):
        """This function initializes a model. It is need for performing online
        learning, so it has to be called before `train_online`.
        `train` initialize the model under the hood, so there is no need to call
        this function if you don't use `train_online`.

        # Inputs

        :param train_set_metadata: (dict) it contains metadata information for
               the input and output features the model is going to be trained
               on. It has the same content as the metadata JSON file that is
               created while training.
        :param train_set_metadata_json: (string) path to the JSON metadata file
               created while training. It contains metadata information for the
               input and output features the model is going to be trained on.
        :param gpus: (string, default: `None`) list of GPUs to use (it uses the
               same syntax as CUDA_VISIBLE_DEVICES)
        :param gpu_fraction: (float, default `1.0`) fraction of GPU memory to
               initialize the process with
        :param random_seed: (int, default: `42`) a random seed that is going to be
               used anywhere there is a call to a random number generator: data
               splitting, parameter initialization and training set shuffling
        :param logging_level: (int, default: `logging.ERROR`) logging level to
               use for logging. Use logging constants like `logging.DEBUG`,
               `logging.INFO` and `logging.ERROR`. By default only errors will
               be printed.
        :param debug: (bool, default: `False`) enables debugging mode
        """
        logging.getLogger().setLevel(logging_level)
        if logging_level in {logging.WARNING, logging.ERROR, logging.CRITICAL}:
            set_disable_progressbar(True)

        if train_set_metadata is None and train_set_metadata_json is None:
            raise ValueError(
                'train_set_metadata or train_set_metadata_json must not be None.')
        if train_set_metadata_json is not None:
            train_set_metadata = load_metadata(train_set_metadata_json)

        # update model definition with metadata properties
        update_model_definition_with_metadata(self.model_definition,
                                              train_set_metadata)

        # build model
        model = Model(self.model_definition['input_features'],
                      self.model_definition['output_features'],
                      self.model_definition['combiner'],
                      self.model_definition['training'],
                      self.model_definition['preprocessing'],
                      random_seed=random_seed,
                      debug=debug)
        model.initialize_session(gpus=gpus, gpu_fraction=gpu_fraction)

        # set parameters
        self.model = model
        self.train_set_metadata = train_set_metadata
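
A minimal sketch of preparing a fresh model for online learning from the metadata JSON saved by a previous training run; the model definition contents and the path are illustrative:

```python
from ludwig.api import LudwigModel

# Minimal model definition (illustrative); the remaining sections are assumed
# to be filled in with defaults by LudwigModel.
model_definition = {
    'input_features': [{'name': 'text_field_name', 'type': 'text'}],
    'output_features': [{'name': 'class_field_name', 'type': 'category'}],
}

ludwig_model = LudwigModel(model_definition)

# Build the model from previously saved metadata so train_online can be called
# (illustrative path to the JSON produced during training).
ludwig_model.initialize_model(
    train_set_metadata_json='results/experiment_run/model/train_set_metadata.json'
)
```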
Example #6
    def train(self,
              data_df=None,
              data_train_df=None,
              data_validation_df=None,
              data_test_df=None,
              data_csv=None,
              data_train_csv=None,
              data_validation_csv=None,
              data_test_csv=None,
              data_hdf5=None,
              data_train_hdf5=None,
              data_validation_hdf5=None,
              data_test_hdf5=None,
              train_set_metadata_json=None,
              dataset_type='generic',
              model_name='run',
              model_load_path=None,
              model_resume_path=None,
              skip_save_model=False,
              skip_save_progress=False,
              skip_save_log=False,
              skip_save_processed_input=False,
              output_directory='results',
              gpus=None,
              gpu_fraction=1.0,
              random_seed=42,
              logging_level=logging.ERROR,
              debug=False,
              **kwargs):
        """This function is used to perform a full training of the model on the 
           specified dataset.

        # Inputs

        :param data_df: (DataFrame) dataframe containing data. If it has a split
               column, it will be used for splitting (0: train, 1: validation,
               2: test), otherwise the dataset will be randomly split
        :param data_train_df: (DataFrame) dataframe containing training data
        :param data_validation_df: (DataFrame) dataframe containing validation
               data
        :param data_test_df: (DataFrame) dataframe containing test data
        :param data_csv: (string) input data CSV file. If it has a split column,
               it will be used for splitting (0: train, 1: validation, 2: test),
               otherwise the dataset will be randomly split
        :param data_train_csv: (string) input train data CSV file
        :param data_validation_csv: (string) input validation data CSV file
        :param data_test_csv: (string) input test data CSV file
        :param data_hdf5: (string) input data HDF5 file. It is an intermediate
               preprocessed version of the input CSV created the first time a CSV
               file is used, in the same directory with the same name and an hdf5
               extension
        :param data_train_hdf5: (string) input train data HDF5 file. It is an
               intermediate preprocessed version of the input CSV created the
               first time a CSV file is used, in the same directory with the same
               name and an hdf5 extension
        :param data_validation_hdf5: (string) input validation data HDF5 file.
               It is an intermediate preprocessed version of the input CSV created
               the first time a CSV file is used, in the same directory with the
               same name and an hdf5 extension
        :param data_test_hdf5: (string) input test data HDF5 file. It is an
               intermediate preprocessed version of the input CSV created the
               first time a CSV file is used, in the same directory with the same
               name and an hdf5 extension
        :param train_set_metadata_json: (string) input metadata JSON file. It is an
               intermediate preprocessed file containing the mappings of the input
               CSV created the first time a CSV file is used, in the same
               directory with the same name and a json extension
        :param dataset_type: (string, default: `'generic'`) determines the type
               of preprocessing that will be applied to the data. Only `generic`
               is available at the moment
        :param model_name: (string) a name for the model, used for the save
               directory
        :param model_load_path: (string) path of a pretrained model to load as
               initialization
        :param model_resume_path: (string) path of the model directory to
               resume training from
        :param skip_save_model: (bool, default: `False`) disables
               saving model weights and hyperparameters each time the model
               improves. By default Ludwig saves model weights after each epoch
               the validation measure improves, but if the model is really big
               that can be time consuming. If you do not want to keep
               the weights and just want to find out what performance a model
               can get with a set of hyperparameters, use this parameter to
               skip it, but the model will not be loadable later on.
        :param skip_save_progress: (bool, default: `False`) disables saving
               progress each epoch. By default Ludwig saves weights and stats
               after each epoch to enable resuming of training, but if
               the model is really big that can be time consuming and will use
               twice as much space. Use this parameter to skip it, but training
               cannot be resumed later on.
        :param skip_save_log: (bool, default: `False`) disables saving TensorBoard
               logs. By default Ludwig saves logs for TensorBoard, but if they
               are not needed turning them off can slightly increase the
               overall speed.
        :param skip_save_processed_input: (bool, default: `False`) skips saving
               intermediate HDF5 and JSON files
        :param output_directory: (string, default: `'results'`) directory that
               contains the results
        :param gpus: (string, default: `None`) list of GPUs to use (it uses the
               same syntax as CUDA_VISIBLE_DEVICES)
        :param gpu_fraction: (float, default `1.0`) fraction of GPU memory to
               initialize the process with
        :param random_seed: (int, default: `42`) a random seed that is going to be
               used anywhere there is a call to a random number generator: data
               splitting, parameter initialization and training set shuffling
        :param debug: (bool, default: `False`) enables debugging mode
        :param logging_level: (int, default: `logging.ERROR`) logging level to
               use for logging. Use logging constants like `logging.DEBUG`,
               `logging.INFO` and `logging.ERROR`. By default only errors will
               be printed.

        There are three ways to provide data: by dataframes using the `_df`
        parameters, by CSV using the `_csv` parameters and by HDF5 and JSON,
        using the `_hdf5` and `_json` parameters.
        The DataFrame approach uses data previously obtained and put in a
        dataframe, the CSV approach loads data from a CSV file, while HDF5 and
        JSON load previously preprocessed HDF5 and JSON files (they are saved in
        the same directory as the CSV they are obtained from).
        For all three approaches either a full dataset can be provided (which
        will be split randomly according to the split probabilities defined in
        the model definition, by default 70% training, 10% validation and 20%
        test) or, if it contains a split column, it will be split according to
        that column (interpreting 0 as training, 1 as validation and 2 as test).
        Alternatively, separate dataframes / CSV / HDF5 files can be provided
        for each split.

        During training the model and statistics will be saved in a directory
        `[output_dir]/[experiment_name]_[model_name]_n` where all variables are
        resolved to user specified ones and `n` is an increasing number
        starting from 0 used to differentiate different runs.


        # Return

        :return: (dict) a dictionary containing training statistics for each
        output feature, with loss and measure values for each epoch.

        """
        logging.getLogger().setLevel(logging_level)
        if logging_level in {logging.WARNING, logging.ERROR, logging.CRITICAL}:
            set_disable_progressbar(True)

        # setup directories and file names
        experiment_dir_name = None
        if model_resume_path is not None:
            if os.path.exists(model_resume_path):
                experiment_dir_name = model_resume_path
            else:
                logging.info('Model resume path does not exist,'
                             ' starting training from scratch')
                model_resume_path = None
        if model_resume_path is None:
            experiment_dir_name = get_experiment_dir_name(
                output_directory, '', model_name)
        description_fn, training_stats_fn, model_dir = get_file_names(
            experiment_dir_name)

        # save description
        description = get_experiment_description(
            self.model_definition,
            dataset_type,
            data_csv=data_csv,
            data_train_csv=data_train_csv,
            data_validation_csv=data_validation_csv,
            data_test_csv=data_test_csv,
            data_hdf5=data_hdf5,
            data_train_hdf5=data_train_hdf5,
            data_validation_hdf5=data_validation_hdf5,
            data_test_hdf5=data_test_hdf5,
            metadata_json=train_set_metadata_json,
            random_seed=random_seed)

        save_json(description_fn, description)

        # print description
        logging.info('Model name: {}'.format(model_name))
        logging.info('Output path: {}'.format(experiment_dir_name))
        logging.info('\n')
        for key, value in description.items():
            logging.info('{0}: {1}'.format(key, pformat(value, indent=4)))
        logging.info('\n')

        # preprocess
        if data_df is not None or data_train_df is not None:
            (training_set, validation_set, test_set,
             train_set_metadata) = preprocess_for_training(
                 self.model_definition,
                 dataset_type,
                 data_df=data_df,
                 data_train_df=data_train_df,
                 data_validation_df=data_validation_df,
                 data_test_df=data_test_df,
                 train_set_metadata_json=train_set_metadata_json,
                 skip_save_processed_input=True,
                 preprocessing_params=self.model_definition['preprocessing'],
                 random_seed=random_seed)
        else:
            (training_set, validation_set, test_set,
             train_set_metadata) = preprocess_for_training(
                 self.model_definition,
                 dataset_type,
                 data_csv=data_csv,
                 data_train_csv=data_train_csv,
                 data_validation_csv=data_validation_csv,
                 data_test_csv=data_test_csv,
                 data_hdf5=data_hdf5,
                 data_train_hdf5=data_train_hdf5,
                 data_validation_hdf5=data_validation_hdf5,
                 data_test_hdf5=data_test_hdf5,
                 train_set_metadata_json=train_set_metadata_json,
                 skip_save_processed_input=skip_save_processed_input,
                 preprocessing_params=self.model_definition['preprocessing'],
                 random_seed=random_seed)

        logging.info('Training set: {0}'.format(training_set.size))
        if validation_set is not None:
            logging.info('Validation set: {0}'.format(validation_set.size))
        if test_set is not None:
            logging.info('Test set: {0}'.format(test_set.size))

        # update model definition with metadata properties
        update_model_definition_with_metadata(self.model_definition,
                                              train_set_metadata)

        if not skip_save_model:
            os.makedirs(model_dir, exist_ok=True)
            train_set_metadata_path = os.path.join(
                model_dir, TRAIN_SET_METADATA_FILE_NAME)
            save_json(train_set_metadata_path, train_set_metadata)

        # run the experiment
        model, result = train(training_set=training_set,
                              validation_set=validation_set,
                              test_set=test_set,
                              model_definition=self.model_definition,
                              save_path=model_dir,
                              model_load_path=model_load_path,
                              resume=model_resume_path is not None,
                              skip_save_model=skip_save_model,
                              skip_save_progress=skip_save_progress,
                              skip_save_log=skip_save_log,
                              gpus=gpus,
                              gpu_fraction=gpu_fraction,
                              random_seed=random_seed,
                              debug=debug)

        train_trainset_stats, train_valiset_stats, train_testset_stats = result
        train_stats = {
            'train': train_trainset_stats,
            'validation': train_valiset_stats,
            'test': train_testset_stats
        }

        # save training and test statistics
        save_json(training_stats_fn, train_stats)

        # grab the results of the model with the best validation performance
        md_training = self.model_definition['training']
        validation_field = md_training['validation_field']
        validation_measure = md_training['validation_measure']
        validation_field_result = train_valiset_stats[validation_field]

        best_function = get_best_function(validation_measure)

        # print results of the model with the best validation performance
        if validation_set is not None:
            # max or min depending on the measure
            epoch_best_vali_measure, best_vali_measure = best_function(
                enumerate(validation_field_result[validation_measure]),
                key=lambda pair: pair[1])
            logging.info('Best validation model epoch: {0}'.format(
                epoch_best_vali_measure + 1))
            logging.info(
                'Best validation model {0} on validation set {1}: {2}'.format(
                    validation_measure, validation_field, best_vali_measure))

            if test_set is not None:
                best_vali_measure_epoch_test_measure = train_testset_stats[
                    validation_field][validation_measure][
                        epoch_best_vali_measure]
                logging.info(
                    'Best validation model {0} on test set {1}: {2}'.format(
                        validation_measure, validation_field,
                        best_vali_measure_epoch_test_measure))

        logging.info('Finished: {0}'.format(model_name))
        logging.info('Saved to: {0}'.format(experiment_dir_name))

        # set parameters
        self.model = model
        self.train_set_metadata = train_set_metadata

        return train_stats
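
A minimal end-to-end sketch of this `train` variant with a single CSV (which is split randomly, or by an optional split column); file and feature names are illustrative:

```python
from ludwig.api import LudwigModel

model_definition = {
    'input_features': [{'name': 'text_field_name', 'type': 'text'}],
    'output_features': [{'name': 'class_field_name', 'type': 'category'}],
}

ludwig_model = LudwigModel(model_definition)

# Results go under output_directory (default 'results'); the CSV is also
# preprocessed into HDF5/JSON files next to it unless skip_save_processed_input.
train_stats = ludwig_model.train(data_csv='dataset.csv', model_name='run')

# train_stats is keyed by split ('train', 'validation', 'test'), then by output
# feature, with per-epoch loss and measure values.
print(list(train_stats['validation'].keys()))
```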
Example #7
File: api.py Project: oiclid/ludwig
    def train(self,
              data_df=None,
              data_train_df=None,
              data_validation_df=None,
              data_test_df=None,
              data_csv=None,
              data_train_csv=None,
              data_validation_csv=None,
              data_test_csv=None,
              data_hdf5=None,
              data_train_hdf5=None,
              data_validation_hdf5=None,
              data_test_hdf5=None,
              data_dict=None,
              data_train_dict=None,
              data_validation_dict=None,
              data_test_dict=None,
              train_set_metadata_json=None,
              experiment_name='api_experiment',
              model_name='run',
              model_load_path=None,
              model_resume_path=None,
              skip_save_model=False,
              skip_save_progress=False,
              skip_save_log=False,
              skip_save_processed_input=False,
              output_directory='results',
              gpus=None,
              gpu_fraction=1.0,
              use_horovod=False,
              random_seed=42,
              logging_level=logging.ERROR,
              debug=False,
              **kwargs):
        """This function is used to perform a full training of the model on the 
           specified dataset.

        # Inputs

        :param data_df: (DataFrame) dataframe containing data. If it has a split
               column, it will be used for splitting (0: train, 1: validation,
               2: test), otherwise the dataset will be randomly split
        :param data_train_df: (DataFrame) dataframe containing training data
        :param data_validation_df: (DataFrame) dataframe containing validation
               data
        :param data_test_df: (DataFrame) dataframe containing test data
        :param data_csv: (string) input data CSV file. If it has a split column,
               it will be used for splitting (0: train, 1: validation, 2: test),
               otherwise the dataset will be randomly split
        :param data_train_csv: (string) input train data CSV file
        :param data_validation_csv: (string) input validation data CSV file
        :param data_test_csv: (string) input test data CSV file
        :param data_hdf5: (string) input data HDF5 file. It is an intermediate
               preprocessed version of the input CSV created the first time a CSV
               file is used, in the same directory with the same name and an hdf5
               extension
        :param data_train_hdf5: (string) input train data HDF5 file. It is an
               intermediate preprocessed version of the input CSV created the
               first time a CSV file is used, in the same directory with the same
               name and an hdf5 extension
        :param data_validation_hdf5: (string) input validation data HDF5 file.
               It is an intermediate preprocessed version of the input CSV created
               the first time a CSV file is used, in the same directory with the
               same name and an hdf5 extension
        :param data_test_hdf5: (string) input test data HDF5 file. It is an
               intermediate preprocessed version of the input CSV created the
               first time a CSV file is used, in the same directory with the same
               name and an hdf5 extension
        :param data_dict: (dict) input data dictionary. It is expected to
               contain one key for each field and the values have to be lists of
               the same length. Each index in the lists corresponds to one
               datapoint. For example a data set consisting of two datapoints
               with a text and a class may be provided as the following dict
               `{'text_field_name': ['text of the first datapoint', 'text of the
               second datapoint'], 'class_field_name': ['class_datapoint_1',
               'class_datapoint_2']}`.
        :param data_train_dict: (dict) input training data dictionary. It is
               expected to contain one key for each field and the values have
               to be lists of the same length. Each index in the lists
               corresponds to one datapoint. For example a data set consisting
               of two datapoints with a text and a class may be provided as the
               following dict:
               `{'text_field_name': ['text of the first datapoint', 'text of the
               second datapoint'], 'class_field_name': ['class_datapoint_1',
               'class_datapoint_2']}`.
        :param data_validation_dict: (dict) input validation data dictionary. It
               is expected to contain one key for each field and the values have
               to be lists of the same length. Each index in the lists
               corresponds to one datapoint. For example a data set consisting
               of two datapoints with a text and a class may be provided as the
               following dict:
               `{'text_field_name': ['text of the first datapoint', 'text of the
               second datapoint'], 'class_field_name': ['class_datapoint_1',
               'class_datapoint_2']}`.
        :param data_test_dict: (dict) input test data dictionary. It is
               expected to contain one key for each field and the values have
               to be lists of the same length. Each index in the lists
               corresponds to one datapoint. For example a data set consisting
               of two datapoints with a text and a class may be provided as the
               following dict:
               `{'text_field_name': ['text of the first datapoint', 'text of the
               second datapoint'], 'class_field_name': ['class_datapoint_1',
               'class_datapoint_2']}`.
        :param train_set_metadata_json: (string) input metadata JSON file. It is an
               intermediate preprocessed file containing the mappings of the input
               CSV created the first time a CSV file is used, in the same
               directory with the same name and a json extension
        :param experiment_name: (string) a name for the experiment, used for the save
               directory
        :param model_name: (string) a name for the model, used for the save
               directory
        :param model_load_path: (string) path of a pretrained model to load as
               initialization
        :param model_resume_path: (string) path of the model directory to
               resume training from
        :param skip_save_model: (bool, default: `False`) disables
               saving model weights and hyperparameters each time the model
               improves. By default Ludwig saves model weights after each epoch
               the validation measure improves, but if the model is really big
               that can be time consuming. If you do not want to keep
               the weights and just want to find out what performance a model
               can get with a set of hyperparameters, use this parameter to
               skip it, but the model will not be loadable later on.
        :param skip_save_progress: (bool, default: `False`) disables saving
               progress each epoch. By default Ludwig saves weights and stats
               after each epoch to enable resuming of training, but if
               the model is really big that can be time consuming and will use
               twice as much space. Use this parameter to skip it, but training
               cannot be resumed later on.
        :param skip_save_log: (bool, default: `False`) disables saving TensorBoard
               logs. By default Ludwig saves logs for TensorBoard, but if they
               are not needed turning them off can slightly increase the
               overall speed.
        :param skip_save_processed_input: (bool, default: `False`) skips saving
               intermediate HDF5 and JSON files
        :param output_directory: (string, default: `'results'`) directory that
               contains the results
        :param gpus: (string, default: `None`) list of GPUs to use (it uses the
               same syntax as CUDA_VISIBLE_DEVICES)
        :param gpu_fraction: (float, default `1.0`) fraction of GPU memory to
               initialize the process with
        :param random_seed: (int, default: `42`) a random seed that is going to be
               used anywhere there is a call to a random number generator: data
               splitting, parameter initialization and training set shuffling
        :param debug: (bool, default: `False`) enables debugging mode
        :param logging_level: (int, default: `logging.ERROR`) logging level to
               use for logging. Use logging constants like `logging.DEBUG`,
               `logging.INFO` and `logging.ERROR`. By default only errors will
               be printed.

        There are three ways to provide data: by dataframes using the `_df`
        parameters, by CSV using the `_csv` parameters and by HDF5 and JSON,
        using the `_hdf5` and `_json` parameters.
        The DataFrame approach uses data previously obtained and put in a
        dataframe, the CSV approach loads data from a CSV file, while HDF5 and
        JSON load previously preprocessed HDF5 and JSON files (they are saved in
        the same directory as the CSV they are obtained from).
        For all three approaches either a full dataset can be provided (which
        will be split randomly according to the split probabilities defined in
        the model definition, by default 70% training, 10% validation and 20%
        test) or, if it contains a split column, it will be split according to
        that column (interpreting 0 as training, 1 as validation and 2 as test).
        Alternatively, separate dataframes / CSV / HDF5 files can be provided
        for each split.

        During training the model and statistics will be saved in a directory
        `[output_dir]/[experiment_name]_[model_name]_n` where all variables are
        resolved to user specified ones and `n` is an increasing number
        starting from 0 used to differentiate different runs.


        # Return

        :return: (dict) a dictionary containing training statistics for each
        output feature, with loss and measure values for each epoch.

        """
        logging.getLogger('ludwig').setLevel(logging_level)
        if logging_level in {logging.WARNING, logging.ERROR, logging.CRITICAL}:
            set_disable_progressbar(True)

        if data_df is None and data_dict is not None:
            data_df = pd.DataFrame(data_dict)

        if data_train_df is None and data_train_dict is not None:
            data_train_df = pd.DataFrame(data_train_dict)

        if data_validation_df is None and data_validation_dict is not None:
            data_validation_df = pd.DataFrame(data_validation_dict)

        if data_test_df is None and data_test_dict is not None:
            data_test_df = pd.DataFrame(data_test_dict)

        (self.model, preprocessed_data, self.exp_dir_name, train_stats,
         self.model_definition) = full_train(
             self.model_definition,
             data_df=data_df,
             data_train_df=data_train_df,
             data_validation_df=data_validation_df,
             data_test_df=data_test_df,
             data_csv=data_csv,
             data_train_csv=data_train_csv,
             data_validation_csv=data_validation_csv,
             data_test_csv=data_test_csv,
             data_hdf5=data_hdf5,
             data_train_hdf5=data_train_hdf5,
             data_validation_hdf5=data_validation_hdf5,
             data_test_hdf5=data_test_hdf5,
             train_set_metadata_json=train_set_metadata_json,
             experiment_name=experiment_name,
             model_name=model_name,
             model_load_path=model_load_path,
             model_resume_path=model_resume_path,
             skip_save_model=skip_save_model,
             skip_save_progress=skip_save_progress,
             skip_save_log=skip_save_log,
             skip_save_processed_input=skip_save_processed_input,
             output_directory=output_directory,
             should_close_session=False,
             gpus=gpus,
             gpu_fraction=gpu_fraction,
             use_horovod=use_horovod,
             random_seed=random_seed,
             debug=debug,
         )

        self.train_set_metadata = preprocessed_data[-1]

        return train_stats
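
This version also accepts in-memory dicts and converts them to pandas DataFrames before delegating to `full_train`; a minimal sketch with illustrative names:

```python
from ludwig.api import LudwigModel

model_definition = {
    'input_features': [{'name': 'text_field_name', 'type': 'text'}],
    'output_features': [{'name': 'class_field_name', 'type': 'category'}],
}

data_dict = {
    'text_field_name': ['text of the first datapoint',
                        'text of the second datapoint'],
    'class_field_name': ['class_datapoint_1', 'class_datapoint_2'],
}

ludwig_model = LudwigModel(model_definition)

# Equivalent to passing pd.DataFrame(data_dict) as data_df, per the branches above.
train_stats = ludwig_model.train(data_dict=data_dict,
                                 experiment_name='api_experiment',
                                 model_name='run')
```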
Example #8
    def _predict(
        self,
        data_df=None,
        data_csv=None,
        data_dict=None,
        return_type=pd.DataFrame,
        batch_size=128,
        gpus=None,
        gpu_fraction=1,
        evaluate_performance=False,
        logging_level=logging.ERROR,
    ):
        logging.getLogger('ludwig').setLevel(logging_level)
        if logging_level in {logging.WARNING, logging.ERROR, logging.CRITICAL}:
            set_disable_progressbar(True)

        if (self.model is None or self.model_definition is None
                or self.train_set_metadata is None):
            raise ValueError('Model has not been trained or loaded')

        if data_df is None:
            data_df = self._read_data(data_csv, data_dict)

        logger.debug('Preprocessing {} datapoints'.format(len(data_df)))
        # Added [:] to the next line; before, I was just assigning.
        # This way I'm copying the list. If you don't do it, you are actually
        # modifying the input feature list when you add output features,
        # which you definitely don't want to do.
        features_to_load = self.model_definition['input_features'][:]
        if evaluate_performance:
            output_features = self.model_definition['output_features']
        else:
            output_features = []
        features_to_load += output_features

        preprocessed_data = build_data(data_df, features_to_load,
                                       self.train_set_metadata,
                                       self.model_definition['preprocessing'])
        replace_text_feature_level(features_to_load, [preprocessed_data])
        dataset = Dataset(preprocessed_data,
                          self.model_definition['input_features'],
                          output_features, None)

        logger.debug('Predicting')
        predict_results = self.model.predict(
            dataset,
            batch_size,
            evaluate_performance=evaluate_performance,
            gpus=gpus,
            gpu_fraction=gpu_fraction,
            session=getattr(self.model, 'session', None))

        if evaluate_performance:
            calculate_overall_stats(predict_results,
                                    self.model_definition['output_features'],
                                    dataset, self.train_set_metadata)

        logger.debug('Postprocessing')
        if (return_type == 'dict' or return_type == 'dictionary'
                or return_type == dict):
            postprocessed_predictions = postprocess(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)
        elif (return_type == 'dataframe' or return_type == 'df'
              or return_type == pd.DataFrame):
            postprocessed_predictions = postprocess_df(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)
        else:
            logger.warning('Unrecognized return_type: {}. '
                           'Returning DataFrame.'.format(return_type))
            postprocessed_predictions = postprocess_df(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)

        return postprocessed_predictions, predict_results
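
The `[:]` copy called out in the comment above matters because `+=` mutates a list in place; without the copy, appending output features would also grow `model_definition['input_features']`. A small self-contained illustration:

```python
input_features = [{'name': 'text_field_name', 'type': 'text'}]
output_features = [{'name': 'class_field_name', 'type': 'category'}]

# Plain assignment aliases the same list object, so += also grows input_features.
features = input_features
features += output_features
assert len(input_features) == 2  # the original list was modified

# Slicing with [:] makes a shallow copy, leaving the original intact.
input_features = [{'name': 'text_field_name', 'type': 'text'}]
features = input_features[:]
features += output_features
assert len(input_features) == 1  # unchanged
```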