Example #1
0
    def train_online(self,
                     dataset,
                     training_set_metadata=None,
                     data_format='auto',
                     random_seed=default_random_seed,
                     debug=False):
        """Performs one epoch of training of the model on `dataset`.

        :param dataset: (string, dict, DataFrame) source containing the training dataset.
        :param training_set_metadata: (string, dict) metadata JSON file or loaded metadata.
               Intermediate preprocess structure containing the mappings of the input
               CSV created the first time a CSV file is used in the same
               directory with the same name and a '.json' extension.
        :param data_format: (string) format to interpret data sources. Will be inferred
               automatically if not specified.
        :param random_seed: (int, default`42`) a random seed that is going to be
               used anywhere there is a call to a random number generator: data
               splitting, parameter initialization and training set shuffling
        :param debug: (bool, default: `False`) enables debugging mode

        """
        training_set_metadata = training_set_metadata or self.training_set_metadata
        training_dataset, _, _, training_set_metadata = preprocess_for_training(
            self.model_definition,
            training_set=dataset,
            training_set_metadata=training_set_metadata,
            data_format=data_format,
            skip_save_processed_input=True,
            preprocessing_params=self.model_definition[PREPROCESSING],
            random_seed=random_seed
        )

        if not self.training_set_metadata:
            self.training_set_metadata = training_set_metadata

        if not self.model:
            update_model_definition_with_metadata(
                self.model_definition,
                training_set_metadata
            )
            self.model = LudwigModel.create_model(self.model_definition,
                                                  random_seed=random_seed)

        if not self._online_trainer:
            self._online_trainer = Trainer(
                **self.model_definition[TRAINING],
                random_seed=random_seed,
                horoovd=self._horovod,
                debug=debug
            )

        self._online_trainer.train_online(
            self.model,
            training_dataset,
        )
Example #2
0
 def create_trainer(self, **kwargs):
     return Trainer(**kwargs)
Example #3
0
 def create_trainer(self, **kwargs):
     return Trainer(horovod=self._horovod, **kwargs)
Example #4
0
def experiment(
        model_definition,
        model_definition_file=None,
        data_df=None,
        data_train_df=None,
        data_validation_df=None,
        data_test_df=None,
        data_csv=None,
        data_train_csv=None,
        data_validation_csv=None,
        data_test_csv=None,
        data_hdf5=None,
        data_train_hdf5=None,
        data_validation_hdf5=None,
        data_test_hdf5=None,
        train_set_metadata_json=None,
        experiment_name='experiment',
        model_name='run',
        model_load_path=None,
        model_resume_path=None,
        skip_save_training_description=False,
        skip_save_training_statistics=False,
        skip_save_model=False,
        skip_save_progress=False,
        skip_save_log=False,
        skip_save_processed_input=False,
        skip_save_unprocessed_output=False,  # skipcq: PYL-W0613
        skip_save_test_predictions=False,  # skipcq: PYL-W0613
        skip_save_test_statistics=False,  # skipcq: PYL-W0613
        output_directory='results',
        gpus=None,
        gpu_memory_limit=None,
        allow_parallel_threads=True,
        use_horovod=None,
        random_seed=default_random_seed,
        debug=False,
        **kwargs):
    (model, preprocessed_data, experiment_dir_name, train_stats,
     model_definition) = full_train(
         model_definition,
         model_definition_file=model_definition_file,
         data_df=data_df,
         data_train_df=data_train_df,
         data_validation_df=data_validation_df,
         data_test_df=data_test_df,
         data_csv=data_csv,
         data_train_csv=data_train_csv,
         data_validation_csv=data_validation_csv,
         data_test_csv=data_test_csv,
         data_hdf5=data_hdf5,
         data_train_hdf5=data_train_hdf5,
         data_validation_hdf5=data_validation_hdf5,
         data_test_hdf5=data_test_hdf5,
         train_set_metadata_json=train_set_metadata_json,
         experiment_name=experiment_name,
         model_name=model_name,
         model_load_path=model_load_path,
         model_resume_path=model_resume_path,
         skip_save_training_description=skip_save_training_description,
         skip_save_training_statistics=skip_save_training_statistics,
         skip_save_model=skip_save_model,
         skip_save_progress=skip_save_progress,
         skip_save_log=skip_save_log,
         skip_save_processed_input=skip_save_processed_input,
         output_directory=output_directory,
         gpus=gpus,
         gpu_memory_limit=gpu_memory_limit,
         allow_parallel_threads=allow_parallel_threads,
         use_horovod=use_horovod,
         random_seed=random_seed,
         debug=debug,
         **kwargs)

    (
        _,  # training_set
        _,  # validation_set
        test_set,
        train_set_metadata) = preprocessed_data

    if test_set is not None:
        if model_definition[TRAINING]['eval_batch_size'] > 0:
            batch_size = model_definition[TRAINING]['eval_batch_size']
        else:
            batch_size = model_definition[TRAINING]['batch_size']

        # if a model was saved on disk, reload it
        model_dir = os.path.join(experiment_dir_name, 'model')
        if is_model_dir(model_dir):
            model = Trainer.load(model_dir,
                                 use_horovod=use_horovod,
                                 gpus=gpus,
                                 gpu_memory_limit=gpu_memory_limit,
                                 allow_parallel_threads=allow_parallel_threads)

        # predict
        test_results = predict(test_set,
                               train_set_metadata,
                               model,
                               model_definition,
                               batch_size,
                               evaluate_performance=True,
                               debug=debug)
    else:
        test_results = None

    return (model, preprocessed_data, experiment_dir_name, train_stats,
            model_definition, test_results)
Example #5
0
    def initialize_model(self,
                         train_set_metadata=None,
                         train_set_metadata_json=None,
                         gpus=None,
                         gpu_memory_limit=None,
                         allow_parallel_threads=True,
                         random_seed=default_random_seed,
                         debug=False,
                         **kwargs):
        """This function initializes a model. It is need for performing online
        learning, so it has to be called before `train_online`.
        `train` initialize the model under the hood, so there is no need to call
        this function if you don't use `train_online`.

        # Inputs

        :param train_set_metadata: (dict) it contains metadata information for
               the input and output features the model is going to be trained
               on. It's the same content of the metadata json file that is
               created while training.
        :param train_set_metadata_json: (string)  path to the JSON metadata file
               created while training. it contains metadata information for the
               input and output features the model is going to be trained on
        :param gpus: (string, default: `None`) list of GPUs to use (it uses the
               same syntax of CUDA_VISIBLE_DEVICES)
        :param gpu_memory_limit: (int: default: `None`) maximum memory in MB to allocate
               per GPU device.
        :param allow_parallel_threads: (bool, default: `True`) allow TensorFlow to use
               multithreading parallelism to improve performance at the cost of
               determinism.
        :param random_seed: (int, default`42`) a random seed that is going to be
               used anywhere there is a call to a random number generator: data
               splitting, parameter initialization and training set shuffling
        :param debug: (bool, default: `False`) enables debugging mode
        """

        if train_set_metadata is None and train_set_metadata_json is None:
            raise ValueError(
                'train_set_metadata or train_set_metadata_json must not None.')
        if train_set_metadata_json is not None:
            train_set_metadata = load_metadata(train_set_metadata_json)

        # update model definition with metadata properties
        update_model_definition_with_metadata(self.model_definition,
                                              train_set_metadata)

        # build model
        model = Trainer(self.model_definition['input_features'],
                        self.model_definition['output_features'],
                        self.model_definition['combiner'],
                        self.model_definition[TRAINING],
                        self.model_definition['preprocessing'],
                        gpus=gpus,
                        gpu_memory_limit=gpu_memory_limit,
                        allow_parallel_threads=allow_parallel_threads,
                        random_seed=random_seed,
                        debug=debug)

        # set parameters
        self.model = model
        self.train_set_metadata = train_set_metadata
Example #6
0
    def create_trainer(self, **kwargs):
        from ludwig.models.trainer import Trainer

        return Trainer(**kwargs)
Example #7
0
class LudwigModel:
    def __init__(self,
                 model_definition,
                 logging_level=logging.ERROR,
                 use_horovod=None,
                 gpus=None,
                 gpu_memory_limit=None,
                 allow_parallel_threads=True):
        """
        :param model_definition: (dict, string) in-memory representation of model definition
               or string path to the saved JSON model definition file.
        :param logging_level: Log level that will be sent to stderr.
        :param use_horovod: (bool) use Horovod for distributed training. Will be set
               automatically if `horovodrun` is used to launch the training script.
        :param gpus: (string, default: `None`) list of GPUs to use (it uses the
               same syntax of CUDA_VISIBLE_DEVICES)
        :param gpu_memory_limit: (int: default: `None`) maximum memory in MB to allocate
              per GPU device.
        :param allow_parallel_threads: (bool, default: `True`) allow TensorFlow to use
               multithreading parallelism to improve performance at the cost of
               determinism.
        """
        # check if model definition is a path or a dict
        if isinstance(model_definition, str):  # assume path
            with open(model_definition, 'r') as def_file:
                model_definition_dict = yaml.safe_load(def_file)
            self.model_definition_fp = model_definition
        else:
            model_definition_dict = copy.deepcopy(model_definition)
            self.model_definition_fp = None

        # merge model definition with defaults
        self.model_definition = merge_with_defaults(model_definition_dict)

        # setup horovod
        self._horovod = configure_horovod(use_horovod)

        # setup logging
        self.set_logging_level(logging_level)

        # setup TensorFlow
        initialize_tensorflow(gpus, gpu_memory_limit, allow_parallel_threads,
                              self._horovod)

        # setup model
        self.model = None
        self.training_set_metadata = None

        # online training state
        self._online_trainer = None

    def train(
            self,
            dataset=None,
            training_set=None,
            validation_set=None,
            test_set=None,
            training_set_metadata=None,
            data_format=None,
            experiment_name='api_experiment',
            model_name='run',
            model_resume_path=None,
            skip_save_training_description=False,
            skip_save_training_statistics=False,
            skip_save_model=False,
            skip_save_progress=False,
            skip_save_log=False,
            skip_save_processed_input=False,
            output_directory='results',
            random_seed=default_random_seed,
            debug=False,
            **kwargs
    ):
        """This function is used to perform a full training of the model on the
           specified dataset.

        # Inputs

        :param dataset: (string, dict, DataFrame) source containing the entire dataset.
               If it has a split column, it will be used for splitting (0: train,
               1: validation, 2: test), otherwise the dataset will be randomly split.
        :param training_set: (string, dict, DataFrame) source containing training data.
        :param validation_set: (string, dict, DataFrame) source containing validation data.
        :param test_set: (string, dict, DataFrame) source containing test data.
        :param training_set_metadata: (string, dict) metadata JSON file or loaded metadata.
               Intermediate preprocess structure containing the mappings of the input
               CSV created the first time a CSV file is used in the same
               directory with the same name and a '.json' extension.
        :param data_format: (string) format to interpret data sources. Will be inferred
               automatically if not specified.
        :param experiment_name: (string) a name for the experiment, used for the save
               directory
        :param model_name: (string) a name for the model, used for the save
               directory
        :param model_resume_path: (string) path of a the model directory to
               resume training of
        :param skip_save_training_description: (bool, default: `False`) disables
               saving the description JSON file.
        :param skip_save_training_statistics: (bool, default: `False`) disables
               saving training statistics JSON file.
        :param skip_save_model: (bool, default: `False`) disables
               saving model weights and hyperparameters each time the model
               improves. By default Ludwig saves model weights after each epoch
               the validation metric imrpvoes, but if the model is really big
               that can be time consuming if you do not want to keep
               the weights and just find out what performance can a model get
               with a set of hyperparameters, use this parameter to skip it,
               but the model will not be loadable later on.
        :param skip_save_progress: (bool, default: `False`) disables saving
               progress each epoch. By default Ludwig saves weights and stats
               after each epoch for enabling resuming of training, but if
               the model is really big that can be time consuming and will uses
               twice as much space, use this parameter to skip it, but training
               cannot be resumed later on.
        :param skip_save_log: (bool, default: `False`) disables saving TensorBoard
               logs. By default Ludwig saves logs for the TensorBoard, but if it
               is not needed turning it off can slightly increase the
               overall speed.
        :param skip_save_processed_input: (bool, default: `False`) skips saving
               intermediate HDF5 and JSON files
        :param output_directory: (string, default: `'results'`) directory that
               contains the results
        :param random_seed: (int, default`42`) a random seed that is going to be
               used anywhere there is a call to a random number generator: data
               splitting, parameter initialization and training set shuffling
        :param debug: (bool, default: `False`) enables debugging mode

        There are three ways to provide data: by dataframes using the `_df`
        parameters, by CSV using the `_csv` parameters and by HDF5 and JSON,
        using `_hdf5` and `_json` parameters.
        The DataFrame approach uses data previously obtained and put in a
        dataframe, the CSV approach loads data from a CSV file, while HDF5 and
        JSON load previously preprocessed HDF5 and JSON files (they are saved in
        the same directory of the CSV they are obtained from).
        For all three approaches either a full dataset can be provided (which
        will be split randomly according to the split probabilities defined in
        the model definition, by default 70% training, 10% validation and 20%
        test) or, if it contanins a plit column, it will be plit according to
        that column (interpreting 0 as training, 1 as validation and 2 as test).
        Alternatively separated dataframes / CSV / HDF5 files can beprovided
        for each split.

        During training the model and statistics will be saved in a directory
        `[output_dir]/[experiment_name]_[model_name]_n` where all variables are
        resolved to user spiecified ones and `n` is an increasing number
        starting from 0 used to differentiate different runs.


        # Return

        :return: ((dict, DataFrame)) tuple containing:
            - A dictionary of training statistics for each output feature containing
              loss and metrics values for each epoch. The second return
            - A Pandas DataFrame of preprocessed training data.
        """
        # setup directories and file names
        if model_resume_path is not None:
            if os.path.exists(model_resume_path):
                output_directory = model_resume_path
            else:
                if is_on_master():
                    logger.info(
                        'Model resume path does not exists, '
                        'starting training from scratch'
                    )
                model_resume_path = None

        if model_resume_path is None:
            if is_on_master():
                output_directory = get_output_directory(
                    output_directory,
                    experiment_name,
                    model_name
                )
            else:
                output_directory = None

        # if we are skipping all saving,
        # there is no need to create a directory that will remain empty
        should_create_output_directory = not (
                skip_save_training_description and
                skip_save_training_statistics and
                skip_save_model and
                skip_save_progress and
                skip_save_log and
                skip_save_processed_input
        )

        description_fn = training_stats_fn = model_dir = None
        if is_on_master():
            if should_create_output_directory:
                if not os.path.exists(output_directory):
                    os.makedirs(output_directory, exist_ok=True)
            description_fn, training_stats_fn, model_dir = get_file_names(
                output_directory)

        # save description
        if is_on_master():
            description = get_experiment_description(
                self.model_definition,
                dataset=dataset,
                training_set=training_set,
                validation_set=validation_set,
                test_set=test_set,
                training_set_metadata=training_set_metadata,
                data_format=data_format,
                random_seed=random_seed
            )
            if not skip_save_training_description:
                save_json(description_fn, description)
            # print description
            logger.info('Experiment name: {}'.format(experiment_name))
            logger.info('Model name: {}'.format(model_name))
            logger.info('Output directory: {}'.format(output_directory))
            logger.info('\n')
            for key, value in description.items():
                logger.info('{}: {}'.format(key, pformat(value, indent=4)))
            logger.info('\n')

        # preprocess
        preprocessed_data = preprocess_for_training(
            self.model_definition,
            dataset=dataset,
            training_set=training_set,
            validation_set=validation_set,
            test_set=test_set,
            training_set_metadata=training_set_metadata,
            data_format=data_format,
            skip_save_processed_input=skip_save_processed_input,
            preprocessing_params=self.model_definition[PREPROCESSING],
            random_seed=random_seed
        )

        (training_set,
         validation_set,
         test_set,
         training_set_metadata) = preprocessed_data
        self.training_set_metadata = training_set_metadata

        if is_on_master():
            logger.info('Training set: {0}'.format(training_set.size))
            if validation_set is not None:
                logger.info('Validation set: {0}'.format(validation_set.size))
            if test_set is not None:
                logger.info('Test set: {0}'.format(test_set.size))

        if is_on_master():
            if not skip_save_model:
                # save train set metadata
                os.makedirs(model_dir, exist_ok=True)
                save_json(
                    os.path.join(
                        model_dir,
                        TRAIN_SET_METADATA_FILE_NAME
                    ),
                    training_set_metadata
                )

        contrib_command("train_init", experiment_directory=output_directory,
                        experiment_name=experiment_name, model_name=model_name,
                        output_directory=output_directory,
                        resume=model_resume_path is not None)

        # Build model if not provided
        # if it was provided it means it was already loaded
        if not self.model:
            if is_on_master():
                print_boxed('MODEL', print_fun=logger.debug)
            # update model definition with metadata properties
            update_model_definition_with_metadata(
                self.model_definition,
                training_set_metadata
            )
            self.model = LudwigModel.create_model(self.model_definition,
                                                  random_seed=random_seed)

        # init trainer
        trainer = Trainer(
            **self.model_definition[TRAINING],
            resume=model_resume_path is not None,
            skip_save_model=skip_save_model,
            skip_save_progress=skip_save_progress,
            skip_save_log=skip_save_log,
            random_seed=random_seed,
            horoovd=self._horovod,
            debug=debug
        )

        contrib_command("train_model", self.model, self.model_definition,
                        self.model_definition_fp)

        # train model
        if is_on_master():
            print_boxed('TRAINING')
            if not skip_save_model:
                self.save_model_definition(model_dir)

        train_stats = trainer.train(
            self.model,
            training_set,
            validation_set=validation_set,
            test_set=test_set,
            save_path=model_dir,
        )

        train_trainset_stats, train_valiset_stats, train_testset_stats = train_stats
        train_stats = {
            TRAINING: train_trainset_stats,
            VALIDATION: train_valiset_stats,
            TEST: train_testset_stats
        }

        # save training statistics
        if is_on_master():
            if not skip_save_training_statistics:
                save_json(training_stats_fn, train_stats)

        # grab the results of the model with highest validation test performance
        validation_field = trainer.validation_field
        validation_metric = trainer.validation_metric
        validation_field_result = train_valiset_stats[validation_field]

        best_function = get_best_function(validation_metric)
        # results of the model with highest validation test performance
        if is_on_master() and validation_set is not None:
            epoch_best_vali_metric, best_vali_metric = best_function(
                enumerate(validation_field_result[validation_metric]),
                key=lambda pair: pair[1]
            )
            logger.info(
                'Best validation model epoch: {0}'.format(
                    epoch_best_vali_metric + 1)
            )
            logger.info(
                'Best validation model {0} on validation set {1}: {2}'.format(
                    validation_metric, validation_field, best_vali_metric
                ))
            if test_set is not None:
                best_vali_metric_epoch_test_metric = train_testset_stats[
                    validation_field][validation_metric][
                    epoch_best_vali_metric]

                logger.info(
                    'Best validation model {0} on test set {1}: {2}'.format(
                        validation_metric,
                        validation_field,
                        best_vali_metric_epoch_test_metric
                    )
                )
            logger.info(
                '\nFinished: {0}_{1}'.format(experiment_name, model_name))
            logger.info('Saved to: {0}'.format(output_directory))

        contrib_command("train_save", output_directory)

        self.training_set_metadata = training_set_metadata

        if not skip_save_model:
            # Load the best weights from saved checkpoint
            self.load_weights(model_dir)

        return train_stats, preprocessed_data, output_directory

    def train_online(self,
                     dataset,
                     training_set_metadata=None,
                     data_format='auto',
                     random_seed=default_random_seed,
                     debug=False):
        """Performs one epoch of training of the model on `dataset`.

        :param dataset: (string, dict, DataFrame) source containing the training dataset.
        :param training_set_metadata: (string, dict) metadata JSON file or loaded metadata.
               Intermediate preprocess structure containing the mappings of the input
               CSV created the first time a CSV file is used in the same
               directory with the same name and a '.json' extension.
        :param data_format: (string) format to interpret data sources. Will be inferred
               automatically if not specified.
        :param random_seed: (int, default`42`) a random seed that is going to be
               used anywhere there is a call to a random number generator: data
               splitting, parameter initialization and training set shuffling
        :param debug: (bool, default: `False`) enables debugging mode

        """
        training_set_metadata = training_set_metadata or self.training_set_metadata
        training_dataset, _, _, training_set_metadata = preprocess_for_training(
            self.model_definition,
            training_set=dataset,
            training_set_metadata=training_set_metadata,
            data_format=data_format,
            skip_save_processed_input=True,
            preprocessing_params=self.model_definition[PREPROCESSING],
            random_seed=random_seed
        )

        if not self.training_set_metadata:
            self.training_set_metadata = training_set_metadata

        if not self.model:
            update_model_definition_with_metadata(
                self.model_definition,
                training_set_metadata
            )
            self.model = LudwigModel.create_model(self.model_definition,
                                                  random_seed=random_seed)

        if not self._online_trainer:
            self._online_trainer = Trainer(
                **self.model_definition[TRAINING],
                random_seed=random_seed,
                horoovd=self._horovod,
                debug=debug
            )

        self._online_trainer.train_online(
            self.model,
            training_dataset,
        )

    def predict(
            self,
            dataset=None,
            data_format=None,
            batch_size=128,
            skip_save_unprocessed_output=True,
            skip_save_predictions=True,
            output_directory='results',
            return_type=pd.DataFrame,
            debug=False,
            **kwargs
    ):
        self._check_initialization()

        logger.debug('Preprocessing')
        # Added [:] to next line, before I was just assigning,
        # this way I'm copying the list. If you don't do it, you are actually
        # modifying the input feature list when you add output features,
        # which you definitely don't want to do
        features_to_load = self.model_definition['input_features'][:]

        # preprocessing
        dataset, training_set_metadata = preprocess_for_prediction(
            self.model_definition,
            dataset=dataset,
            data_format=data_format,
            training_set_metadata=self.training_set_metadata,
            include_outputs=False,
        )

        logger.debug('Predicting')
        predictor = Predictor(
            batch_size=batch_size, horovod=self._horovod, debug=debug
        )
        predictions = predictor.batch_predict(
            self.model,
            dataset,
        )

        if is_on_master():
            # if we are skipping all saving,
            # there is no need to create a directory that will remain empty
            should_create_exp_dir = not (
                    skip_save_unprocessed_output and skip_save_predictions
            )
            if should_create_exp_dir:
                os.makedirs(output_directory, exist_ok=True)

        logger.debug('Postprocessing')
        postproc_predictions = convert_predictions(
            postprocess(
                predictions,
                self.model.output_features,
                self.training_set_metadata,
                output_directory=output_directory,
                skip_save_unprocessed_output=skip_save_unprocessed_output
                                             or not is_on_master(),
            ),
            self.model.output_features,
            self.training_set_metadata,
            return_type=return_type
        )

        if is_on_master():
            if not skip_save_predictions:
                save_prediction_outputs(postproc_predictions,
                                        output_directory)

                logger.info('Saved to: {0}'.format(output_directory))

        return postproc_predictions, output_directory

    # def evaluate_pseudo(self, data, return_preds=False):
    #     preproc_data = preprocess_data(data)
    #     if return_preds:
    #         eval_stats, preds = self.model.batch_evaluate(
    #             preproc_data, return_preds=return_preds
    #         )
    #         postproc_preds = postprocess_data(preds)
    #         return eval_stats, postproc_preds
    #     else:
    #         eval_stats = self.model.batch_evaluate(
    #             preproc_data, return_preds=return_preds
    #         )
    #         return eval_stats

    def evaluate(
            self,
            dataset=None,
            data_format=None,
            batch_size=128,
            skip_save_unprocessed_output=True,
            skip_save_predictions=True,
            skip_save_eval_stats=True,
            collect_predictions=False,
            collect_overall_stats=False,
            output_directory='results',
            return_type=pd.DataFrame,
            debug=False,
            **kwargs
    ):
        self._check_initialization()

        logger.debug('Preprocessing')

        # preprocessing
        dataset, training_set_metadata = preprocess_for_prediction(
            self.model_definition,
            dataset=dataset,
            data_format=data_format,
            training_set_metadata=self.training_set_metadata,
            include_outputs=True,
        )

        logger.debug('Predicting')
        predictor = Predictor(
            batch_size=batch_size, horovod=self._horovod, debug=debug
        )
        stats, predictions = predictor.batch_evaluation(
            self.model,
            dataset,
            collect_predictions=collect_predictions or collect_overall_stats,
        )

        # calculate the overall metrics
        if collect_overall_stats:
            overall_stats = calculate_overall_stats(
                self.model.output_features,
                predictions,
                dataset,
                training_set_metadata
            )
            stats = {of_name: {**stats[of_name], **overall_stats[of_name]}
            # account for presence of 'combined' key
            if of_name in overall_stats else {**stats[of_name]}
                     for of_name in stats}

        if is_on_master():
            # if we are skipping all saving,
            # there is no need to create a directory that will remain empty
            should_create_exp_dir = not (
                    skip_save_unprocessed_output and
                    skip_save_predictions and
                    skip_save_eval_stats
            )
            if should_create_exp_dir:
                os.makedirs(output_directory, exist_ok=True)

        if collect_predictions:
            logger.debug('Postprocessing')
            postproc_predictions = postprocess(
                predictions,
                self.model.output_features,
                self.training_set_metadata,
                output_directory=output_directory,
                skip_save_unprocessed_output=skip_save_unprocessed_output
                                             or not is_on_master(),
            )
        else:
            postproc_predictions = predictions  # = {}

        if is_on_master():
            if postproc_predictions is not None and not skip_save_predictions:
                save_prediction_outputs(postproc_predictions,
                                        output_directory)

            print_evaluation_stats(stats)
            if not skip_save_eval_stats:
                save_evaluation_stats(stats, output_directory)

            if not skip_save_predictions or not skip_save_eval_stats:
                logger.info('Saved to: {0}'.format(output_directory))

        if collect_predictions:
            postproc_predictions = convert_predictions(
                postproc_predictions,
                self.model.output_features,
                self.training_set_metadata,
                return_type=return_type)

        return stats, postproc_predictions, output_directory

    def experiment(
            self,
            dataset=None,
            training_set=None,
            validation_set=None,
            test_set=None,
            training_set_metadata=None,
            data_format=None,
            experiment_name='experiment',
            model_name='run',
            model_load_path=None,
            model_resume_path=None,
            skip_save_training_description=False,
            skip_save_training_statistics=False,
            skip_save_model=False,
            skip_save_progress=False,
            skip_save_log=False,
            skip_save_processed_input=False,
            skip_save_unprocessed_output=False,
            skip_save_predictions=False,
            skip_save_eval_stats=False,
            skip_collect_predictions=False,
            skip_collect_overall_stats=False,
            output_directory='results',
            gpus=None,
            gpu_memory_limit=None,
            allow_parallel_threads=True,
            use_horovod=None,
            random_seed=default_random_seed,
            debug=False,
            **kwargs
    ):
        (
            train_stats,
            preprocessed_data,
            output_directory
        ) = self.train(
            dataset=dataset,
            training_set=training_set,
            validation_set=validation_set,
            test_set=test_set,
            training_set_metadata=training_set_metadata,
            data_format=data_format,
            experiment_name=experiment_name,
            model_name=model_name,
            model_load_path=model_load_path,
            model_resume_path=model_resume_path,
            skip_save_training_description=skip_save_training_description,
            skip_save_training_statistics=skip_save_training_statistics,
            skip_save_model=skip_save_model,
            skip_save_progress=skip_save_progress,
            skip_save_log=skip_save_log,
            skip_save_processed_input=skip_save_processed_input,
            skip_save_unprocessed_output=skip_save_unprocessed_output,
            output_directory=output_directory,
            gpus=gpus,
            gpu_memory_limit=gpu_memory_limit,
            allow_parallel_threads=allow_parallel_threads,
            use_horovod=use_horovod,
            random_seed=random_seed,
            debug=debug,
        )

        (_,  # training_set
         _,  # validation_set
         test_set,
         training_set_metadata) = preprocessed_data

        if test_set is not None:
            if self.model_definition[TRAINING]['eval_batch_size'] > 0:
                batch_size = self.model_definition[TRAINING]['eval_batch_size']
            else:
                batch_size = self.model_definition[TRAINING]['batch_size']

            # predict
            test_results, _, _ = self.evaluate(
                test_set,
                data_format=data_format,
                batch_size=batch_size,
                output_directory=output_directory,
                skip_save_unprocessed_output=skip_save_unprocessed_output,
                skip_save_predictions=skip_save_predictions,
                skip_save_eval_stats=skip_save_eval_stats,
                collect_predictions=not skip_collect_predictions,
                collect_overall_stats=not skip_collect_overall_stats,
                debug=debug
            )
        else:
            test_results = None

        return test_results, train_stats, preprocessed_data, output_directory

    def collect_weights(
            self,
            tensor_names=None,
            **kwargs
    ):
        self._check_initialization()
        collected_tensors = self.model.collect_weights(tensor_names)
        return collected_tensors

    def collect_activations(
            self,
            layer_names,
            dataset,
            data_format=None,
            batch_size=128,
            # output_directory='results',
            debug=False,
            **kwargs
    ):
        self._check_initialization()
        logger.debug('Preprocessing')
        # Added [:] to next line, before I was just assigning,
        # this way I'm copying the list. If you don't do it, you are actually
        # modifying the input feature list when you add output features,
        # which you definitely don't want to do
        features_to_load = self.model_definition['input_features'][:]

        # preprocessing
        dataset, training_set_metadata = preprocess_for_prediction(
            self.model_definition,
            dataset=dataset,
            data_format=data_format,
            training_set_metadata=self.training_set_metadata,
            include_outputs=False,
        )

        logger.debug('Predicting')
        predictor = Predictor(
            batch_size=batch_size, horovod=self._horovod, debug=debug
        )
        activations = predictor.batch_collect_activations(
            self.model,
            layer_names,
            dataset,
        )

        return activations

    @staticmethod
    def load(model_dir,
             logging_level=logging.ERROR,
             use_horovod=None,
             gpus=None,
             gpu_memory_limit=None,
             allow_parallel_threads=True):
        """This function allows for loading pretrained models

        # Inputs

        :param logging_level: Log level that will be sent to stderr.
        :param use_horovod: (bool) use Horovod for distributed training. Will be set
               automatically if `horovodrun` is used to launch the training script.
        :param model_dir: (string) path to the directory containing the model.
               If the model was trained by the `train` or `experiment` command,
               the model is in `results_dir/experiment_dir/model`.
        :param gpus: (string, default: `None`) list of GPUs to use (it uses the
               same syntax of CUDA_VISIBLE_DEVICES)
        :param gpu_memory_limit: (int: default: `None`) maximum memory in MB to allocate
              per GPU device.
        :param allow_parallel_threads: (bool, default: `True`) allow TensorFlow to use
               multithreading parallelism to improve performance at the cost of
               determinism.

        # Return

        :return: (LudwigModel) a LudwigModel object


        # Example usage

        ```python
        ludwig_model = LudwigModel.load(model_dir)
        ```

        """
        horovod = configure_horovod(use_horovod)
        model_definition = broadcast_return(lambda: load_json(os.path.join(
            model_dir,
            MODEL_HYPERPARAMETERS_FILE_NAME
        )), horovod)

        # initialize model
        ludwig_model = LudwigModel(
            model_definition,
            logging_level=logging_level,
            use_horovod=use_horovod,
            gpus=gpus,
            gpu_memory_limit=gpu_memory_limit,
            allow_parallel_threads=allow_parallel_threads,
        )

        # generate model from definition
        ludwig_model.model = LudwigModel.create_model(model_definition)

        # load model weights
        ludwig_model.load_weights(model_dir)

        # load train set metadata
        ludwig_model.training_set_metadata = broadcast_return(
            lambda: load_metadata(
                os.path.join(
                    model_dir,
                    TRAIN_SET_METADATA_FILE_NAME
                )
            ), horovod
        )

        return ludwig_model

    def load_weights(self, model_dir):
        if is_on_master():
            weights_save_path = os.path.join(
                model_dir,
                MODEL_WEIGHTS_FILE_NAME
            )
            self.model.load_weights(weights_save_path)

        if self._horovod:
            # Model weights are only saved on master, so broadcast
            # to all other ranks
            self._horovod.broadcast_variables(self.model.variables,
                                              root_rank=0)

    def save(self, save_path):
        """This function allows to save models on disk

        # Inputs

        :param  save_path: (string) path to the directory where the model is
                going to be saved. Both a JSON file containing the model
                architecture hyperparameters and checkpoints files containing
                model weights will be saved.


        # Example usage

        ```python
        ludwig_model.save(save_path)
        ```

        """
        self._check_initialization()

        # save model definition
        self.save_model_definition(save_path)

        # save model weights
        model_weights_path = os.path.join(save_path, MODEL_WEIGHTS_FILE_NAME)
        self.model.save_weights(model_weights_path)

        # save training set metadata
        training_set_metadata_path = os.path.join(
            save_path,
            TRAIN_SET_METADATA_FILE_NAME
        )
        save_json(training_set_metadata_path, self.training_set_metadata)

    def save_model_definition(self, save_path):
        os.makedirs(save_path, exist_ok=True)
        model_hyperparameters_path = os.path.join(
            save_path,
            MODEL_HYPERPARAMETERS_FILE_NAME
        )
        save_json(model_hyperparameters_path, self.model_definition)

    def save_savedmodel(self, save_path):
        """This function allows to save models on disk

        # Inputs

        :param  save_path: (string) path to the directory where the SavedModel
                is going to be saved.


        # Example usage

        ```python
        ludwig_model.save_for_serving(save_path)
        ```

        """
        self._check_initialization()
        self.model.save_savedmodel(save_path)

    def _check_initialization(self):
        if self.model is None or \
                self.model_definition is None or \
                self.training_set_metadata is None:
            raise ValueError('Model has not been trained or loaded')

    @staticmethod
    def create_model(model_definition, random_seed=default_random_seed):
        # todo: support loading other model types based on definition
        return ECD(
            input_features_def=model_definition['input_features'],
            combiner_def=model_definition['combiner'],
            output_features_def=model_definition['output_features'],
            random_seed=random_seed,
        )

    @staticmethod
    def set_logging_level(logging_level):
        """
        :param logging_level: Set/Update the logging level. Use logging
        constants like `logging.DEBUG` , `logging.INFO` and `logging.ERROR`.

        :return: None
        """
        logging.getLogger('ludwig').setLevel(logging_level)
        if logging_level in {logging.WARNING, logging.ERROR, logging.CRITICAL}:
            set_disable_progressbar(True)
        else:
            set_disable_progressbar(False)
Example #8
0
    def train(
            self,
            dataset=None,
            training_set=None,
            validation_set=None,
            test_set=None,
            training_set_metadata=None,
            data_format=None,
            experiment_name='api_experiment',
            model_name='run',
            model_resume_path=None,
            skip_save_training_description=False,
            skip_save_training_statistics=False,
            skip_save_model=False,
            skip_save_progress=False,
            skip_save_log=False,
            skip_save_processed_input=False,
            output_directory='results',
            random_seed=default_random_seed,
            debug=False,
            **kwargs
    ):
        """This function is used to perform a full training of the model on the
           specified dataset.

        # Inputs

        :param dataset: (string, dict, DataFrame) source containing the entire dataset.
               If it has a split column, it will be used for splitting (0: train,
               1: validation, 2: test), otherwise the dataset will be randomly split.
        :param training_set: (string, dict, DataFrame) source containing training data.
        :param validation_set: (string, dict, DataFrame) source containing validation data.
        :param test_set: (string, dict, DataFrame) source containing test data.
        :param training_set_metadata: (string, dict) metadata JSON file or loaded metadata.
               Intermediate preprocess structure containing the mappings of the input
               CSV created the first time a CSV file is used in the same
               directory with the same name and a '.json' extension.
        :param data_format: (string) format to interpret data sources. Will be inferred
               automatically if not specified.
        :param experiment_name: (string) a name for the experiment, used for the save
               directory
        :param model_name: (string) a name for the model, used for the save
               directory
        :param model_resume_path: (string) path of a the model directory to
               resume training of
        :param skip_save_training_description: (bool, default: `False`) disables
               saving the description JSON file.
        :param skip_save_training_statistics: (bool, default: `False`) disables
               saving training statistics JSON file.
        :param skip_save_model: (bool, default: `False`) disables
               saving model weights and hyperparameters each time the model
               improves. By default Ludwig saves model weights after each epoch
               the validation metric imrpvoes, but if the model is really big
               that can be time consuming if you do not want to keep
               the weights and just find out what performance can a model get
               with a set of hyperparameters, use this parameter to skip it,
               but the model will not be loadable later on.
        :param skip_save_progress: (bool, default: `False`) disables saving
               progress each epoch. By default Ludwig saves weights and stats
               after each epoch for enabling resuming of training, but if
               the model is really big that can be time consuming and will uses
               twice as much space, use this parameter to skip it, but training
               cannot be resumed later on.
        :param skip_save_log: (bool, default: `False`) disables saving TensorBoard
               logs. By default Ludwig saves logs for the TensorBoard, but if it
               is not needed turning it off can slightly increase the
               overall speed.
        :param skip_save_processed_input: (bool, default: `False`) skips saving
               intermediate HDF5 and JSON files
        :param output_directory: (string, default: `'results'`) directory that
               contains the results
        :param random_seed: (int, default`42`) a random seed that is going to be
               used anywhere there is a call to a random number generator: data
               splitting, parameter initialization and training set shuffling
        :param debug: (bool, default: `False`) enables debugging mode

        There are three ways to provide data: by dataframes using the `_df`
        parameters, by CSV using the `_csv` parameters and by HDF5 and JSON,
        using `_hdf5` and `_json` parameters.
        The DataFrame approach uses data previously obtained and put in a
        dataframe, the CSV approach loads data from a CSV file, while HDF5 and
        JSON load previously preprocessed HDF5 and JSON files (they are saved in
        the same directory of the CSV they are obtained from).
        For all three approaches either a full dataset can be provided (which
        will be split randomly according to the split probabilities defined in
        the model definition, by default 70% training, 10% validation and 20%
        test) or, if it contanins a plit column, it will be plit according to
        that column (interpreting 0 as training, 1 as validation and 2 as test).
        Alternatively separated dataframes / CSV / HDF5 files can beprovided
        for each split.

        During training the model and statistics will be saved in a directory
        `[output_dir]/[experiment_name]_[model_name]_n` where all variables are
        resolved to user spiecified ones and `n` is an increasing number
        starting from 0 used to differentiate different runs.


        # Return

        :return: ((dict, DataFrame)) tuple containing:
            - A dictionary of training statistics for each output feature containing
              loss and metrics values for each epoch. The second return
            - A Pandas DataFrame of preprocessed training data.
        """
        # setup directories and file names
        if model_resume_path is not None:
            if os.path.exists(model_resume_path):
                output_directory = model_resume_path
            else:
                if is_on_master():
                    logger.info(
                        'Model resume path does not exists, '
                        'starting training from scratch'
                    )
                model_resume_path = None

        if model_resume_path is None:
            if is_on_master():
                output_directory = get_output_directory(
                    output_directory,
                    experiment_name,
                    model_name
                )
            else:
                output_directory = None

        # if we are skipping all saving,
        # there is no need to create a directory that will remain empty
        should_create_output_directory = not (
                skip_save_training_description and
                skip_save_training_statistics and
                skip_save_model and
                skip_save_progress and
                skip_save_log and
                skip_save_processed_input
        )

        description_fn = training_stats_fn = model_dir = None
        if is_on_master():
            if should_create_output_directory:
                if not os.path.exists(output_directory):
                    os.makedirs(output_directory, exist_ok=True)
            description_fn, training_stats_fn, model_dir = get_file_names(
                output_directory)

        # save description
        if is_on_master():
            description = get_experiment_description(
                self.model_definition,
                dataset=dataset,
                training_set=training_set,
                validation_set=validation_set,
                test_set=test_set,
                training_set_metadata=training_set_metadata,
                data_format=data_format,
                random_seed=random_seed
            )
            if not skip_save_training_description:
                save_json(description_fn, description)
            # print description
            logger.info('Experiment name: {}'.format(experiment_name))
            logger.info('Model name: {}'.format(model_name))
            logger.info('Output directory: {}'.format(output_directory))
            logger.info('\n')
            for key, value in description.items():
                logger.info('{}: {}'.format(key, pformat(value, indent=4)))
            logger.info('\n')

        # preprocess
        preprocessed_data = preprocess_for_training(
            self.model_definition,
            dataset=dataset,
            training_set=training_set,
            validation_set=validation_set,
            test_set=test_set,
            training_set_metadata=training_set_metadata,
            data_format=data_format,
            skip_save_processed_input=skip_save_processed_input,
            preprocessing_params=self.model_definition[PREPROCESSING],
            random_seed=random_seed
        )

        (training_set,
         validation_set,
         test_set,
         training_set_metadata) = preprocessed_data
        self.training_set_metadata = training_set_metadata

        if is_on_master():
            logger.info('Training set: {0}'.format(training_set.size))
            if validation_set is not None:
                logger.info('Validation set: {0}'.format(validation_set.size))
            if test_set is not None:
                logger.info('Test set: {0}'.format(test_set.size))

        if is_on_master():
            if not skip_save_model:
                # save train set metadata
                os.makedirs(model_dir, exist_ok=True)
                save_json(
                    os.path.join(
                        model_dir,
                        TRAIN_SET_METADATA_FILE_NAME
                    ),
                    training_set_metadata
                )

        contrib_command("train_init", experiment_directory=output_directory,
                        experiment_name=experiment_name, model_name=model_name,
                        output_directory=output_directory,
                        resume=model_resume_path is not None)

        # Build model if not provided
        # if it was provided it means it was already loaded
        if not self.model:
            if is_on_master():
                print_boxed('MODEL', print_fun=logger.debug)
            # update model definition with metadata properties
            update_model_definition_with_metadata(
                self.model_definition,
                training_set_metadata
            )
            self.model = LudwigModel.create_model(self.model_definition,
                                                  random_seed=random_seed)

        # init trainer
        trainer = Trainer(
            **self.model_definition[TRAINING],
            resume=model_resume_path is not None,
            skip_save_model=skip_save_model,
            skip_save_progress=skip_save_progress,
            skip_save_log=skip_save_log,
            random_seed=random_seed,
            horoovd=self._horovod,
            debug=debug
        )

        contrib_command("train_model", self.model, self.model_definition,
                        self.model_definition_fp)

        # train model
        if is_on_master():
            print_boxed('TRAINING')
            if not skip_save_model:
                self.save_model_definition(model_dir)

        train_stats = trainer.train(
            self.model,
            training_set,
            validation_set=validation_set,
            test_set=test_set,
            save_path=model_dir,
        )

        train_trainset_stats, train_valiset_stats, train_testset_stats = train_stats
        train_stats = {
            TRAINING: train_trainset_stats,
            VALIDATION: train_valiset_stats,
            TEST: train_testset_stats
        }

        # save training statistics
        if is_on_master():
            if not skip_save_training_statistics:
                save_json(training_stats_fn, train_stats)

        # grab the results of the model with highest validation test performance
        validation_field = trainer.validation_field
        validation_metric = trainer.validation_metric
        validation_field_result = train_valiset_stats[validation_field]

        best_function = get_best_function(validation_metric)
        # results of the model with highest validation test performance
        if is_on_master() and validation_set is not None:
            epoch_best_vali_metric, best_vali_metric = best_function(
                enumerate(validation_field_result[validation_metric]),
                key=lambda pair: pair[1]
            )
            logger.info(
                'Best validation model epoch: {0}'.format(
                    epoch_best_vali_metric + 1)
            )
            logger.info(
                'Best validation model {0} on validation set {1}: {2}'.format(
                    validation_metric, validation_field, best_vali_metric
                ))
            if test_set is not None:
                best_vali_metric_epoch_test_metric = train_testset_stats[
                    validation_field][validation_metric][
                    epoch_best_vali_metric]

                logger.info(
                    'Best validation model {0} on test set {1}: {2}'.format(
                        validation_metric,
                        validation_field,
                        best_vali_metric_epoch_test_metric
                    )
                )
            logger.info(
                '\nFinished: {0}_{1}'.format(experiment_name, model_name))
            logger.info('Saved to: {0}'.format(output_directory))

        contrib_command("train_save", output_directory)

        self.training_set_metadata = training_set_metadata

        if not skip_save_model:
            # Load the best weights from saved checkpoint
            self.load_weights(model_dir)

        return train_stats, preprocessed_data, output_directory
Example #9
0
def train(
        training_set,
        validation_set,
        test_set,
        model_definition,
        save_path='model',
        model_load_path=None,
        resume=False,
        skip_save_model=False,
        skip_save_progress=False,
        skip_save_log=False,
        gpus=None,
        gpu_memory_limit=None,
        allow_parallel_threads=True,
        use_horovod=None,
        random_seed=default_random_seed,
        debug=False
):
    """
    :param training_set: Dataset contaning training data
    :type training_set: Dataset
    :param validation_set: Dataset contaning validation data
    :type validation_set: Datasetk
    :param test_set: Dataset contaning test data.
    :type test_set: Dataset
    :param model_definition: Model definition which defines the different
           parameters of the model, features, preprocessing and training.
    :type model_definition: Dictionary
    :param save_path: The path to save the model to.
    :type save_path: filepath (str)
    :param model_load_path: If this is specified the loaded model will be used
           as initialization (useful for transfer learning).
    :type model_load_path: filepath (str)
    :param skip_save_model: Disables
               saving model weights and hyperparameters each time the model
           improves. By default Ludwig saves model weights after each epoch
           the validation metric imrpvoes, but if the model is really big
           that can be time consuming if you do not want to keep
           the weights and just find out what performance can a model get
           with a set of hyperparameters, use this parameter to skip it,
           but the model will not be loadable later on.
    :type skip_save_model: Boolean
    :param skip_save_progress: Disables saving
           progress each epoch. By default Ludwig saves weights and stats
           after each epoch for enabling resuming of training, but if
           the model is really big that can be time consuming and will uses
           twice as much space, use this parameter to skip it, but training
           cannot be resumed later on.
    :type skip_save_progress: Boolean
    :param skip_save_log: Disables saving TensorBoard
           logs. By default Ludwig saves logs for the TensorBoard, but if it
           is not needed turning it off can slightly increase the
           overall speed..
    :type skip_save_log: Boolean
    :param gpus: List of GPUs that are available for training.
    :type gpus: List
    :param gpu_memory_limit: maximum memory in MB to allocate per GPU device.
    :type gpu_memory_limit: Integer
    :param allow_parallel_threads: allow TensorFlow to use multithreading parallelism
           to improve performance at the cost of determinism.
    :type allow_parallel_threads: Boolean
    :param random_seed: Random seed used for weights initialization,
           splits and any other random function.
    :type random_seed: Integer
    :param debug: If true turns on tfdbg with inf_or_nan checks.
    :type debug: Boolean
    :returns: None
    """
    if model_load_path is not None:
        # Load model
        if is_on_master():
            print_boxed('LOADING MODEL')
            logger.info('Loading model: {}\n'.format(model_load_path))
        model, _ = load_model_and_definition(model_load_path,
                                             use_horovod=use_horovod)
    else:
        # Build model
        if is_on_master():
            print_boxed('BUILDING MODEL', print_fun=logger.debug)

        model = Trainer(
            model_definition['input_features'],
            model_definition['output_features'],
            model_definition['combiner'],
            model_definition[TRAINING],
            model_definition['preprocessing'],
            use_horovod=use_horovod,
            gpus=gpus,
            gpu_memory_limit=gpu_memory_limit,
            random_seed=random_seed,
            debug=debug
        )

    contrib_command("train_model", model, model_definition, model_load_path)

    # Train model
    if is_on_master():
        print_boxed('TRAINING')
    return model, model.train(
        training_set,
        validation_set=validation_set,
        test_set=test_set,
        save_path=save_path,
        resume=resume,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
        random_seed=random_seed,
        **model_definition[TRAINING]
    )