Example #1
def experiment(model_definition,
               model_definition_file=None,
               data_csv=None,
               data_train_csv=None,
               data_validation_csv=None,
               data_test_csv=None,
               data_hdf5=None,
               data_train_hdf5=None,
               data_validation_hdf5=None,
               data_test_hdf5=None,
               train_set_metadata_json=None,
               experiment_name='experiment',
               model_name='run',
               model_load_path=None,
               model_resume_path=None,
               skip_save_model=False,
               skip_save_progress=False,
               skip_save_log=False,
               skip_save_processed_input=False,
               skip_save_unprocessed_output=False,
               output_directory='results',
               gpus=None,
               gpu_fraction=1.0,
               use_horovod=False,
               random_seed=default_random_seed,
               debug=False,
               **kwargs):
    """Trains a model on a dataset's training and validation splits and
    uses it to predict on the test split.
    It saves the trained model and the statistics of training and testing.
    :param model_definition: Model definition which defines the different
           parameters of the model, features, preprocessing and training.
    :type model_definition: Dictionary
    :param model_definition_file: The file that specifies the model definition.
           It is a yaml file.
    :type model_definition_file: filepath (str)
    :param data_csv: A CSV file containing the input data which is used to
           train, validate and test a model. The CSV either contains a
           split column or will be split.
    :type data_csv: filepath (str)
    :param data_train_csv: A CSV file containing the input data which is used
           to train a model.
    :type data_train_csv: filepath (str)
    :param data_validation_csv: A CSV file containing the input data which is used
           to validate a model.
    :type data_validation_csv: filepath (str)
    :param data_test_csv: A CSV file containing the input data which is used
           to test a model.
    :type data_test_csv: filepath (str)
    :param data_hdf5: If the dataset is in the hdf5 format, this is used instead
           of the csv file.
    :type data_hdf5: filepath (str)
    :param data_train_hdf5: If the training set is in the hdf5 format, this is
           used instead of the csv file.
    :type data_train_hdf5: filepath (str)
    :param data_validation_hdf5: If the validation set is in the hdf5 format,
           this is used instead of the csv file.
    :type data_validation_hdf5: filepath (str)
    :param data_test_hdf5: If the test set is in the hdf5 format, this is
           used instead of the csv file.
    :type data_test_hdf5: filepath (str)
    :param train_set_metadata_json: If the dataset is in hdf5 format, this is
           the associated json file containing metadata.
    :type train_set_metadata_json: filepath (str)
    :param experiment_name: The name for the experiment.
    :type experiment_name: Str
    :param model_name: Name of the model that is being used.
    :type model_name: Str
    :param model_load_path: If this is specified the loaded model will be used
           as initialization (useful for transfer learning).
    :type model_load_path: filepath (str)
    :param model_resume_path: Resumes training of the model from the path
           specified. The difference with model_load_path is that training
           statistics like the current epoch and the loss and performance so
           far are also resumed, effectively continuing a previously
           interrupted training process.
    :type model_resume_path: filepath (str)
    :param skip_save_model: Disables saving model weights and hyperparameters
           each time the model improves. By default Ludwig saves model weights
           after each epoch in which the validation measure improves, but if
           the model is really big that can be time consuming. If you do not
           want to keep the weights and just want to find out what performance
           a model can get with a set of hyperparameters, use this parameter
           to skip saving, but the model will not be loadable later on.
    :type skip_save_model: Boolean
    :param skip_save_progress: Disables saving progress each epoch. By default
           Ludwig saves weights and stats after each epoch to enable resuming
           training, but if the model is really big that can be time consuming
           and will use twice as much storage. Use this parameter to skip it,
           but training cannot be resumed later on.
    :type skip_save_progress: Boolean
    :param skip_save_log: Disables saving TensorBoard logs. By default Ludwig
           saves logs for TensorBoard, but if they are not needed turning
           them off can slightly increase the overall speed.
    :type skip_save_log: Boolean
    :param skip_save_processed_input: If a CSV dataset is provided it is
           preprocessed and then saved as an hdf5 and json to avoid running
           the preprocessing again. If this parameter is True,
           the hdf5 and json files are not saved.
    :type skip_save_processed_input: Boolean
    :param skip_save_unprocessed_output: By default predictions and
           their probabilities are saved in both raw unprocessed numpy files
           containing tensors and as postprocessed CSV files
           (one for each output feature). If this parameter is True,
           only the CSV ones are saved and the numpy ones are skipped.
    :type skip_save_unprocessed_output: Boolean
    :param output_directory: The directory that will contain the training
           statistics, the saved model and the training progress files.
    :type output_directory: filepath (str)
    :param gpus: List of GPUs that are available for training.
    :type gpus: List
    :param gpu_fraction: Fraction of the memory of each GPU to use at
           the beginning of the training. The memory may grow elastically.
    :type gpu_fraction: Float
    :param random_seed: Random seed used for weights initialization,
           splits and any other random function.
    :type random_seed: Integer
    :param debug: If true turns on tfdbg with inf_or_nan checks.
    :type debug: Boolean
    """
    # set input features defaults
    if model_definition_file is not None:
        with open(model_definition_file, 'r') as def_file:
            model_definition = merge_with_defaults(yaml.safe_load(def_file))
    else:
        model_definition = merge_with_defaults(model_definition)

    # setup directories and file names
    experiment_dir_name = None
    if model_resume_path is not None:
        if os.path.exists(model_resume_path):
            experiment_dir_name = model_resume_path
        else:
            if is_on_master():
                logging.info('Model resume path does not exist, '
                             'starting training from scratch')
            model_resume_path = None

    if model_resume_path is None:
        if is_on_master():
            experiment_dir_name = get_experiment_dir_name(
                output_directory, experiment_name, model_name)
        else:
            experiment_dir_name = '/'
    description_fn, training_stats_fn, model_dir = get_file_names(
        experiment_dir_name)

    # save description
    description = get_experiment_description(
        model_definition, data_csv, data_train_csv, data_validation_csv,
        data_test_csv, data_hdf5, data_train_hdf5, data_validation_hdf5,
        data_test_hdf5, train_set_metadata_json, random_seed)
    if is_on_master():
        save_json(description_fn, description)
        # print description
        logging.info('Experiment name: {}'.format(experiment_name))
        logging.info('Model name: {}'.format(model_name))
        logging.info('Output path: {}'.format(experiment_dir_name))
        logging.info('')
        for key, value in description.items():
            logging.info('{}: {}'.format(key, pformat(value, indent=4)))
        logging.info('')

    # preprocess
    (training_set, validation_set, test_set,
     train_set_metadata) = preprocess_for_training(
         model_definition,
         data_csv=data_csv,
         data_train_csv=data_train_csv,
         data_validation_csv=data_validation_csv,
         data_test_csv=data_test_csv,
         data_hdf5=data_hdf5,
         data_train_hdf5=data_train_hdf5,
         data_validation_hdf5=data_validation_hdf5,
         data_test_hdf5=data_test_hdf5,
         train_set_metadata_json=train_set_metadata_json,
         skip_save_processed_input=skip_save_processed_input,
         preprocessing_params=model_definition['preprocessing'],
         random_seed=random_seed)
    if is_on_master():
        logging.info('Training set: {0}'.format(training_set.size))
        if validation_set is not None:
            logging.info('Validation set: {0}'.format(validation_set.size))
        if test_set is not None:
            logging.info('Test set: {0}'.format(test_set.size))

    # update model definition with metadata properties
    update_model_definition_with_metadata(model_definition, train_set_metadata)

    # run the experiment
    model, training_results = train(training_set=training_set,
                                    validation_set=validation_set,
                                    test_set=test_set,
                                    model_definition=model_definition,
                                    save_path=model_dir,
                                    model_load_path=model_load_path,
                                    resume=model_resume_path is not None,
                                    skip_save_model=skip_save_model,
                                    skip_save_progress=skip_save_progress,
                                    skip_save_log=skip_save_log,
                                    gpus=gpus,
                                    gpu_fraction=gpu_fraction,
                                    use_horovod=use_horovod,
                                    random_seed=random_seed,
                                    debug=debug)
    (train_trainset_stats, train_valiset_stats,
     train_testset_stats) = training_results

    if is_on_master():
        if not skip_save_model:
            # save train set metadata
            save_json(os.path.join(model_dir, TRAIN_SET_METADATA_FILE_NAME),
                      train_set_metadata)

    # grab the results of the model with the highest validation performance
    validation_field = model_definition['training']['validation_field']
    validation_measure = model_definition['training']['validation_measure']
    validation_field_result = train_valiset_stats[validation_field]

    best_function = get_best_function(validation_measure)

    # print results of the model with the highest validation performance
    if is_on_master():
        if validation_set is not None:
            # max or min depending on the measure
            epoch_best_vali_measure, best_vali_measure = best_function(
                enumerate(validation_field_result[validation_measure]),
                key=lambda pair: pair[1])
            logging.info('Best validation model epoch: {0}'.format(
                epoch_best_vali_measure + 1))
            logging.info(
                'Best validation model {0} on validation set {1}: {2}'.format(
                    validation_measure, validation_field, best_vali_measure))

            if test_set is not None:
                best_vali_measure_epoch_test_measure = train_testset_stats[
                    validation_field][validation_measure][
                        epoch_best_vali_measure]
                logging.info(
                    'Best validation model {0} on test set {1}: {2}'.format(
                        validation_measure, validation_field,
                        best_vali_measure_epoch_test_measure))

    # save training statistics
    if is_on_master():
        save_json(
            training_stats_fn, {
                'train': train_trainset_stats,
                'validation': train_valiset_stats,
                'test': train_testset_stats
            })

    if test_set is not None:
        # predict
        test_results = predict(test_set,
                               train_set_metadata,
                               model,
                               model_definition,
                               model_definition['training']['batch_size'],
                               only_predictions=False,
                               gpus=gpus,
                               gpu_fraction=gpu_fraction,
                               debug=debug)
        # postprocess
        postprocessed_output = postprocess(
            test_results, model_definition['output_features'],
            train_set_metadata, experiment_dir_name,
            skip_save_unprocessed_output or not is_on_master())

        if is_on_master():
            print_prediction_results(test_results)

            save_prediction_outputs(postprocessed_output, experiment_dir_name)
            save_prediction_statistics(test_results, experiment_dir_name)

    model.close_session()

    if is_on_master():
        logging.info('\nFinished: {0}_{1}'.format(experiment_name, model_name))
        logging.info('Saved to: {}'.format(experiment_dir_name))

    return experiment_dir_name
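
A minimal usage sketch for the example above, assuming experiment() is importable from ludwig.experiment as in older Ludwig releases. The model definition dictionary and the CSV path are hypothetical placeholders; only the keyword arguments come from the signature shown.

from ludwig.experiment import experiment  # assumed import path (older Ludwig API)

# Hypothetical model definition; feature names and types must match the CSV columns.
model_definition = {
    'input_features': [{'name': 'review', 'type': 'text'}],
    'output_features': [{'name': 'sentiment', 'type': 'category'}],
}

# Trains on the train/validation splits, evaluates on the test split and
# returns the directory where results were written.
experiment_dir = experiment(
    model_definition,
    data_csv='reviews.csv',            # hypothetical CSV, optionally with a 'split' column
    experiment_name='sentiment_experiment',
    model_name='run_1',
    output_directory='results',
    skip_save_processed_input=True,    # do not cache the preprocessed hdf5/json files
)
print(experiment_dir)
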
Example #2
    def _predict(
        self,
        data_df=None,
        data_csv=None,
        data_dict=None,
        return_type=pd.DataFrame,
        batch_size=128,
        evaluate_performance=False,
        skip_save_unprocessed_output=False,
        gpus=None,
        gpu_fraction=1,
    ):

        if (self.model is None or self.model_definition is None
                or self.train_set_metadata is None):
            raise ValueError('Model has not been trained or loaded')

        if data_df is None:
            data_df = self._read_data(data_csv, data_dict)

        logger.debug('Preprocessing {} datapoints'.format(len(data_df)))
        # Use [:] to copy the list: appending the output features below
        # would otherwise modify the model definition's input feature list,
        # which we definitely do not want.
        features_to_load = self.model_definition['input_features'][:]
        if evaluate_performance:
            output_features = self.model_definition['output_features']
        else:
            output_features = []
        features_to_load += output_features

        num_overrides = override_in_memory_flag(
            self.model_definition['input_features'], True)
        if num_overrides > 0:
            logger.warning(
                'Using in_memory = False is not supported for Ludwig API.')

        preprocessed_data = build_data(data_df, features_to_load,
                                       self.train_set_metadata,
                                       self.model_definition['preprocessing'])
        replace_text_feature_level(features_to_load, [preprocessed_data])
        dataset = Dataset(preprocessed_data,
                          self.model_definition['input_features'],
                          output_features, None)

        logger.debug('Predicting')
        predict_results = self.model.predict(
            dataset,
            batch_size,
            evaluate_performance=evaluate_performance,
            gpus=gpus,
            gpu_fraction=gpu_fraction,
            session=getattr(self.model, 'session', None))

        if evaluate_performance:
            calculate_overall_stats(predict_results,
                                    self.model_definition['output_features'],
                                    dataset, self.train_set_metadata)

        logger.debug('Postprocessing')
        if (return_type == 'dict' or return_type == 'dictionary'
                or return_type == dict):
            postprocessed_predictions = postprocess(
                predict_results,
                self.model_definition['output_features'],
                self.train_set_metadata,
                experiment_dir_name=self.exp_dir_name,
                skip_save_unprocessed_output=skip_save_unprocessed_output,
            )
        elif (return_type == 'dataframe' or return_type == 'df'
              or return_type == pd.DataFrame):
            postprocessed_predictions = postprocess_df(
                predict_results,
                self.model_definition['output_features'],
                self.train_set_metadata,
                experiment_dir_name=self.exp_dir_name,
                skip_save_unprocessed_output=skip_save_unprocessed_output,
            )
        else:
            logger.warning('Unrecognized return_type: {}. '
                           'Returning DataFrame.'.format(return_type))
            postprocessed_predictions = postprocess(
                predict_results,
                self.model_definition['output_features'],
                self.train_set_metadata,
                experiment_dir_name=self.exp_dir_name,
                skip_save_unprocessed_output=skip_save_unprocessed_output,
            )

        return postprocessed_predictions, predict_results
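
_predict() appears to be a private method of Ludwig's LudwigModel class, so in normal use it is reached through the public predict()/test() wrappers; the sketch below shows the kind of call those wrappers make. The import path, the saved-model directory and the CSV file are assumptions; the keyword arguments and return values come from the method shown above.

import pandas as pd
from ludwig.api import LudwigModel  # assumed import path

model = LudwigModel.load('results/experiment_run/model')  # hypothetical saved model dir

# Predictions only: no ground-truth output columns are needed in the data.
predictions_df, raw_results = model._predict(
    data_csv='new_reviews.csv',     # hypothetical file
    return_type=pd.DataFrame,
    batch_size=128,
    evaluate_performance=False,     # True also computes overall stats (needs output columns)
)
print(predictions_df.head())
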
Example #3
    def _predict(
        self,
        data_df=None,
        data_csv=None,
        data_dict=None,
        return_type=pd.DataFrame,
        batch_size=128,
        gpus=None,
        gpu_fraction=1,
        only_predictions=True,
        logging_level=logging.ERROR,
    ):
        logging.getLogger().setLevel(logging_level)
        if logging_level in {logging.WARNING, logging.ERROR, logging.CRITICAL}:
            set_disable_progressbar(True)

        if (self.model is None or self.model_definition is None
                or self.train_set_metadata is None):
            raise ValueError('Model has not been trained or loaded')

        if data_df is None:
            data_df = self._read_data(data_csv, data_dict)

        logging.debug('Preprocessing {} datapoints'.format(len(data_df)))
        features_to_load = self.model_definition['input_features']
        if not only_predictions:
            features_to_load += self.model_definition['output_features']
        preprocessed_data = build_data(data_df, features_to_load,
                                       self.train_set_metadata,
                                       self.model_definition['preprocessing'])
        replace_text_feature_level(
            self.model_definition['input_features'] +
            ([] if only_predictions else
             self.model_definition['output_features']), [preprocessed_data])
        dataset = Dataset(preprocessed_data,
                          self.model_definition['input_features'],
                          [] if only_predictions else
                          self.model_definition['output_features'], None)

        logging.debug('Predicting')
        predict_results = self.model.predict(dataset,
                                             batch_size,
                                             only_predictions=only_predictions,
                                             gpus=gpus,
                                             gpu_fraction=gpu_fraction,
                                             session=getattr(
                                                 self.model, 'session', None))

        if not only_predictions:
            calculate_overall_stats(predict_results,
                                    self.model_definition['output_features'],
                                    dataset, self.train_set_metadata)

        logging.debug('Postprocessing')
        if (return_type == 'dict' or return_type == 'dictionary'
                or return_type == dict):
            postprocessed_predictions = postprocess(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)
        elif (return_type == 'dataframe' or return_type == 'df'
              or return_type == pd.DataFrame):
            postprocessed_predictions = postprocess_df(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)
        else:
            logging.warning('Unrecognized return_type: {}. '
                            'Returning DataFrame.'.format(return_type))
            postprocessed_predictions = postprocess(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)

        return postprocessed_predictions, predict_results
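
In this older variant the switches differ: only_predictions replaces evaluate_performance, and a logging_level argument controls verbosity, disabling the progress bar at WARNING or above. A hedged sketch with hypothetical paths, again going through a LudwigModel instance:

import logging
from ludwig.api import LudwigModel  # assumed import path for this older API

model = LudwigModel.load('results/experiment_run/model')  # hypothetical model directory

predictions_df, raw_results = model._predict(
    data_csv='new_reviews.csv',      # hypothetical file
    return_type='df',
    batch_size=128,
    only_predictions=False,          # False also computes stats; needs output columns
    logging_level=logging.WARNING,   # WARNING or above also disables the progress bar
)
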
Example #4
def full_experiment(model_definition,
                    model_definition_file=None,
                    data_df=None,
                    data_train_df=None,
                    data_validation_df=None,
                    data_test_df=None,
                    data_csv=None,
                    data_train_csv=None,
                    data_validation_csv=None,
                    data_test_csv=None,
                    data_hdf5=None,
                    data_train_hdf5=None,
                    data_validation_hdf5=None,
                    data_test_hdf5=None,
                    train_set_metadata_json=None,
                    experiment_name='experiment',
                    model_name='run',
                    model_load_path=None,
                    model_resume_path=None,
                    skip_save_training_description=False,
                    skip_save_training_statistics=False,
                    skip_save_model=False,
                    skip_save_progress=False,
                    skip_save_log=False,
                    skip_save_processed_input=False,
                    skip_save_unprocessed_output=False,
                    skip_save_test_predictions=False,
                    skip_save_test_statistics=False,
                    output_directory='results',
                    gpus=None,
                    gpu_memory_limit=None,
                    allow_parallel_threads=True,
                    use_horovod=None,
                    random_seed=default_random_seed,
                    debug=False,
                    **kwargs):
    """Trains a model on a dataset's training and validation splits and
    uses it to predict on the test split.
    It saves the trained model and the statistics of training and testing.
    :param model_definition: Model definition which defines the different
           parameters of the model, features, preprocessing and training.
    :type model_definition: Dictionary
    :param model_definition_file: The file that specifies the model definition.
           It is a yaml file.
    :type model_definition_file: filepath (str)
    :param data_csv: A CSV file containing the input data which is used to
           train, validate and test a model. The CSV either contains a
           split column or will be split.
    :type data_csv: filepath (str)
    :param data_train_csv: A CSV file containing the input data which is used
           to train a model.
    :type data_train_csv: filepath (str)
    :param data_validation_csv: A CSV file containing the input data which is used
           to validate a model.
    :type data_validation_csv: filepath (str)
    :param data_test_csv: A CSV file containing the input data which is used
           to test a model.
    :type data_test_csv: filepath (str)
    :param data_hdf5: If the dataset is in the hdf5 format, this is used instead
           of the csv file.
    :type data_hdf5: filepath (str)
    :param data_train_hdf5: If the training set is in the hdf5 format, this is
           used instead of the csv file.
    :type data_train_hdf5: filepath (str)
    :param data_validation_hdf5: If the validation set is in the hdf5 format,
           this is used instead of the csv file.
    :type data_validation_hdf5: filepath (str)
    :param data_test_hdf5: If the test set is in the hdf5 format, this is
           used instead of the csv file.
    :type data_test_hdf5: filepath (str)
    :param train_set_metadata_json: If the dataset is in hdf5 format, this is
           the associated json file containing metadata.
    :type train_set_metadata_json: filepath (str)
    :param experiment_name: The name for the experiment.
    :type experiment_name: Str
    :param model_name: Name of the model that is being used.
    :type model_name: Str
    :param model_load_path: If this is specified the loaded model will be used
           as initialization (useful for transfer learning).
    :type model_load_path: filepath (str)
    :param model_resume_path: Resumes training of the model from the path
           specified. The difference with model_load_path is that training
           statistics like the current epoch and the loss and performance so
           far are also resumed, effectively continuing a previously
           interrupted training process.
    :type model_resume_path: filepath (str)
    :param skip_save_training_description: Disables saving
           the description JSON file.
    :type skip_save_training_description: Boolean
    :param skip_save_training_statistics: Disables saving
           training statistics JSON file.
    :type skip_save_training_statistics: Boolean
    :param skip_save_model: Disables saving model weights and hyperparameters
           each time the model improves. By default Ludwig saves model weights
           after each epoch in which the validation metric improves, but if
           the model is really big that can be time consuming. If you do not
           want to keep the weights and just want to find out what performance
           a model can get with a set of hyperparameters, use this parameter
           to skip saving, but the model will not be loadable later on.
    :type skip_save_model: Boolean
    :param skip_save_progress: Disables saving progress each epoch. By default
           Ludwig saves weights and stats after each epoch to enable resuming
           training, but if the model is really big that can be time consuming
           and will use twice as much storage. Use this parameter to skip it,
           but training cannot be resumed later on.
    :type skip_save_progress: Boolean
    :param skip_save_log: Disables saving TensorBoard logs. By default Ludwig
           saves logs for TensorBoard, but if they are not needed turning
           them off can slightly increase the overall speed.
    :type skip_save_log: Boolean
    :param skip_save_processed_input: If a CSV dataset is provided it is
           preprocessed and then saved as an hdf5 and json to avoid running
           the preprocessing again. If this parameter is True,
           the hdf5 and json files are not saved.
    :type skip_save_processed_input: Boolean
    :param skip_save_unprocessed_output: By default predictions and
           their probabilities are saved in both raw unprocessed numpy files
           containing tensors and as postprocessed CSV files
           (one for each output feature). If this parameter is True,
           only the CSV ones are saved and the numpy ones are skipped.
    :type skip_save_unprocessed_output: Boolean
    :param skip_save_test_predictions: skips saving test predictions CSV files
    :type skip_save_test_predictions: Boolean
    :param skip_save_test_statistics: skips saving test statistics JSON file
    :type skip_save_test_statistics: Boolean
    :param output_directory: The directory that will contain the training
           statistics, the saved model and the training progress files.
    :type output_directory: filepath (str)
    :param gpus: List of GPUs that are available for training.
    :type gpus: List
    :param gpu_memory_limit: maximum memory in MB to allocate per GPU device.
    :type gpu_memory_limit: Integer
    :param allow_parallel_threads: allow TensorFlow to use multithreading parallelism
           to improve performance at the cost of determinism.
    :type allow_parallel_threads: Boolean
    :param use_horovod: Flag for using horovod
    :type use_horovod: Boolean
    :param random_seed: Random seed used for weights initialization,
           splits and any other random function.
    :type random_seed: Integer
    :param debug: If true turns on tfdbg with inf_or_nan checks.
    :type debug: Boolean
    """
    set_on_master(use_horovod)

    (
        model,
        preprocessed_data,
        experiment_dir_name,
        _,  # train_stats
        model_definition,
        test_results) = experiment(
            model_definition,
            model_definition_file=model_definition_file,
            data_df=data_df,
            data_train_df=data_train_df,
            data_validation_df=data_validation_df,
            data_test_df=data_test_df,
            data_csv=data_csv,
            data_train_csv=data_train_csv,
            data_validation_csv=data_validation_csv,
            data_test_csv=data_test_csv,
            data_hdf5=data_hdf5,
            data_train_hdf5=data_train_hdf5,
            data_validation_hdf5=data_validation_hdf5,
            data_test_hdf5=data_test_hdf5,
            train_set_metadata_json=train_set_metadata_json,
            experiment_name=experiment_name,
            model_name=model_name,
            model_load_path=model_load_path,
            model_resume_path=model_resume_path,
            skip_save_training_description=skip_save_training_description,
            skip_save_training_statistics=skip_save_training_statistics,
            skip_save_model=skip_save_model,
            skip_save_progress=skip_save_progress,
            skip_save_log=skip_save_log,
            skip_save_processed_input=skip_save_processed_input,
            output_directory=output_directory,
            gpus=gpus,
            gpu_memory_limit=gpu_memory_limit,
            allow_parallel_threads=allow_parallel_threads,
            use_horovod=use_horovod,
            random_seed=random_seed,
            debug=debug,
            **kwargs)

    (training_set, validation_set, test_set,
     train_set_metadata) = preprocessed_data

    if test_set is not None:
        # check if we need to create the output dir
        if is_on_master():
            if not (skip_save_unprocessed_output and skip_save_test_predictions
                    and skip_save_test_statistics):
                if not os.path.exists(experiment_dir_name):
                    os.makedirs(experiment_dir_name)

        # postprocess
        postprocessed_output = postprocess(
            test_results, model_definition['output_features'],
            train_set_metadata, experiment_dir_name,
            skip_save_unprocessed_output or not is_on_master())

        if is_on_master():
            print_test_results(test_results)
            if not skip_save_test_predictions:
                save_prediction_outputs(postprocessed_output,
                                        experiment_dir_name)
            if not skip_save_test_statistics:
                save_test_statistics(test_results, experiment_dir_name)

    if is_on_master():
        logger.info('\nFinished: {0}_{1}'.format(experiment_name, model_name))
        logger.info('Saved to: {}'.format(experiment_dir_name))

    contrib_command("experiment_save", experiment_dir_name)
    return experiment_dir_name
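
A usage sketch for full_experiment(), assuming it is importable from ludwig.experiment. It accepts either an in-memory model definition or a YAML file; the paths below are placeholders, and gpu_memory_limit is in MB as stated in the docstring.

from ludwig.experiment import full_experiment  # assumed import path

model_definition = {
    'input_features': [{'name': 'review', 'type': 'text'}],
    'output_features': [{'name': 'sentiment', 'type': 'category'}],
}

experiment_dir = full_experiment(
    model_definition,
    # model_definition_file='model_definition.yaml',  # alternative to the dict above
    data_train_csv='train.csv',                       # hypothetical files
    data_test_csv='test.csv',
    experiment_name='sentiment_experiment',
    model_name='run_1',
    gpu_memory_limit=4096,             # maximum MB to allocate per GPU
    allow_parallel_threads=True,
    skip_save_unprocessed_output=True,
)
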
Example #5
def kfold_cross_validate(num_folds,
                         model_definition=None,
                         model_definition_file=None,
                         data_csv=None,
                         output_directory='results',
                         random_seed=default_random_seed,
                         **kwargs):
    # check for k_fold
    if num_folds is None:
        raise ValueError('k_fold parameter must be specified')

    # check for model_definition and model_definition_file
    if model_definition is None and model_definition_file is None:
        raise ValueError(
            'Either model_definition or model_definition_file has to be '
            'not None to initialize a LudwigModel')
    if model_definition is not None and model_definition_file is not None:
        raise ValueError('Only one of model_definition and '
                         'model_definition_file can be provided')

    logger.info('starting {:d}-fold cross validation'.format(num_folds))

    # extract out model definition for use
    if model_definition_file is not None:
        with open(model_definition_file, 'r') as def_file:
            model_definition = \
                merge_with_defaults(yaml.safe_load(def_file))

    # create output_directory if not available
    if not os.path.isdir(output_directory):
        os.mkdir(output_directory)

    # read in data to split for the folds
    data_df = pd.read_csv(data_csv)

    # place each fold in a separate directory
    data_dir = os.path.dirname(data_csv)

    kfold_cv_stats = {}
    kfold_split_indices = {}

    for train_indices, test_indices, fold_num in \
            generate_kfold_splits(data_df, num_folds, random_seed):
        with tempfile.TemporaryDirectory(dir=data_dir) as temp_dir_name:
            curr_train_df = data_df.iloc[train_indices]
            curr_test_df = data_df.iloc[test_indices]

            kfold_split_indices['fold_' + str(fold_num)] = {
                'training_indices': train_indices,
                'test_indices': test_indices
            }

            # train and validate model on this fold
            logger.info("training on fold {:d}".format(fold_num))
            (
                _,  # model
                preprocessed_data,  # preprocessed_data
                experiment_dir_name,  # experiment_dir_name
                train_stats,
                model_definition,
                test_results) = experiment(model_definition,
                                           data_train_df=curr_train_df,
                                           data_test_df=curr_test_df,
                                           experiment_name='cross_validation',
                                           model_name='fold_' + str(fold_num),
                                           output_directory=os.path.join(
                                               temp_dir_name, 'results'))

            # todo: this works for obtaining the postprocessed predictions
            #  and replacing the raw ones, but some refactoring is needed
            #  to avoid having to do it
            postprocessed_output = postprocess(
                test_results,
                model_definition['output_features'],
                metadata=preprocessed_data[3],
                experiment_dir_name=experiment_dir_name,
                skip_save_unprocessed_output=True)
            # todo if we want to save the csv of predictions uncomment block
            # if is_on_master():
            #     print_test_results(test_results)
            #     if not skip_save_test_predictions:
            #         save_prediction_outputs(
            #             postprocessed_output,
            #             experiment_dir_name
            #         )
            #     if not skip_save_test_statistics:
            #         save_test_statistics(test_results, experiment_dir_name)

            # augment the training statistics with scoring metric from
            # the hold out fold
            train_stats['fold_test_results'] = test_results

            # collect training statistics for this fold
            kfold_cv_stats['fold_' + str(fold_num)] = train_stats

    # consolidate raw fold metrics across all folds
    raw_kfold_stats = {}
    for fold_name in kfold_cv_stats:
        curr_fold_test_results = kfold_cv_stats[fold_name]['fold_test_results']
        for of_name in curr_fold_test_results:
            if of_name not in raw_kfold_stats:
                raw_kfold_stats[of_name] = {}
            fold_test_results_of = curr_fold_test_results[of_name]

            for metric in fold_test_results_of:
                if metric not in {
                        'predictions', 'probabilities', 'confusion_matrix',
                        'overall_stats', 'per_class_stats', 'roc_curve',
                        'precision_recall_curve'
                }:
                    if metric not in raw_kfold_stats[of_name]:
                        raw_kfold_stats[of_name][metric] = []
                    raw_kfold_stats[of_name][metric].append(
                        fold_test_results_of[metric])

    # calculate overall kfold statistics
    overall_kfold_stats = {}
    for of_name in raw_kfold_stats:
        overall_kfold_stats[of_name] = {}
        for metric in raw_kfold_stats[of_name]:
            mean = np.mean(raw_kfold_stats[of_name][metric])
            std = np.std(raw_kfold_stats[of_name][metric])
            overall_kfold_stats[of_name][metric + '_mean'] = mean
            overall_kfold_stats[of_name][metric + '_std'] = std

    kfold_cv_stats['overall'] = overall_kfold_stats

    logger.info('completed {:d}-fold cross validation'.format(num_folds))

    return kfold_cv_stats, kfold_split_indices
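
A sketch of a k-fold run, assuming kfold_cross_validate() is importable from ludwig.experiment. The returned kfold_cv_stats dictionary holds per-fold training statistics plus an 'overall' entry with the mean and standard deviation of each test metric, as built at the end of the function; the dataset path and feature names are placeholders.

from ludwig.experiment import kfold_cross_validate  # assumed import path

model_definition = {
    'input_features': [{'name': 'review', 'type': 'text'}],
    'output_features': [{'name': 'sentiment', 'type': 'category'}],
}

kfold_cv_stats, kfold_split_indices = kfold_cross_validate(
    5,                                 # number of folds
    model_definition=model_definition, # or model_definition_file='model_definition.yaml'
    data_csv='reviews.csv',            # hypothetical CSV containing the full dataset
    output_directory='results',
)

# Aggregated metrics per output feature, e.g. {'sentiment': {'accuracy_mean': ..., ...}}
print(kfold_cv_stats['overall'])
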
Example #6
def full_predict(model_path,
                 data_csv=None,
                 data_hdf5=None,
                 split='test',
                 batch_size=128,
                 skip_save_unprocessed_output=False,
                 output_directory='results',
                 evaluate_performance=True,
                 gpus=None,
                 gpu_fraction=1.0,
                 use_horovod=False,
                 debug=False,
                 **kwargs):
    # setup directories and file names
    experiment_dir_name = output_directory
    suffix = 0
    while os.path.exists(experiment_dir_name):
        experiment_dir_name = output_directory + '_' + str(suffix)
        suffix += 1

    if is_on_master():
        logging.info('Dataset path: {}'.format(
            data_csv if data_csv is not None else data_hdf5))
        logging.info('Model path: {}'.format(model_path))
        logging.info('Output path: {}'.format(experiment_dir_name))
        logging.info('')

    train_set_metadata_json_fp = os.path.join(model_path,
                                              TRAIN_SET_METADATA_FILE_NAME)

    # preprocessing
    dataset, train_set_metadata = preprocess_for_prediction(
        model_path, split, data_csv, data_hdf5, train_set_metadata_json_fp,
        evaluate_performance)

    # run the prediction
    if is_on_master():
        print_boxed('LOADING MODEL')
    model, model_definition = load_model_and_definition(
        model_path, use_horovod=use_horovod)

    prediction_results = predict(dataset, train_set_metadata, model,
                                 model_definition, batch_size,
                                 evaluate_performance, gpus, gpu_fraction,
                                 debug)
    model.close_session()

    if is_on_master():
        os.mkdir(experiment_dir_name)

        # postprocess
        postprocessed_output = postprocess(
            prediction_results, model_definition['output_features'],
            train_set_metadata, experiment_dir_name,
            skip_save_unprocessed_output or not is_on_master())

        save_prediction_outputs(postprocessed_output, experiment_dir_name)

        if evaluate_performance:
            print_prediction_results(prediction_results)
            save_prediction_statistics(prediction_results, experiment_dir_name)

        logging.info('Saved to: {0}'.format(experiment_dir_name))
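
full_predict() loads a saved model directory and predicts on a CSV or HDF5 file; it writes its outputs under output_directory (suffixed if the directory already exists) instead of returning them. A hedged sketch with placeholder paths, assuming the function is importable from ludwig.predict:

from ludwig.predict import full_predict  # assumed import path

full_predict(
    'results/experiment_run/model',   # hypothetical path to a trained model directory
    data_csv='new_reviews.csv',       # hypothetical data; needs ground truth when evaluating
    split='test',                     # which split of the data to predict on
    batch_size=128,
    evaluate_performance=True,        # also computes and saves prediction statistics
    output_directory='predictions',
)
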
Example #7
def full_predict(model_path,
                 data_csv=None,
                 data_hdf5=None,
                 split=TEST,
                 batch_size=128,
                 skip_save_unprocessed_output=False,
                 skip_save_test_predictions=False,
                 skip_save_test_statistics=False,
                 output_directory='results',
                 evaluate_performance=True,
                 gpus=None,
                 gpu_fraction=1.0,
                 use_horovod=False,
                 debug=False,
                 **kwargs):
    if is_on_master():
        logger.info('Dataset path: {}'.format(
            data_csv if data_csv is not None else data_hdf5))
        logger.info('Model path: {}'.format(model_path))
        logger.info('')

    train_set_metadata_json_fp = os.path.join(model_path,
                                              TRAIN_SET_METADATA_FILE_NAME)

    # preprocessing
    dataset, train_set_metadata = preprocess_for_prediction(
        model_path, split, data_csv, data_hdf5, train_set_metadata_json_fp,
        evaluate_performance)

    # run the prediction
    if is_on_master():
        print_boxed('LOADING MODEL')
    model, model_definition = load_model_and_definition(
        model_path, use_horovod=use_horovod)

    prediction_results = predict(dataset, train_set_metadata, model,
                                 model_definition, batch_size,
                                 evaluate_performance, gpus, gpu_fraction,
                                 debug)
    model.close_session()

    if is_on_master():
        # setup directories and file names
        experiment_dir_name = find_non_existing_dir_by_adding_suffix(
            output_directory)

        # if we are skipping all saving,
        # there is no need to create a directory that will remain empty
        should_create_exp_dir = not (skip_save_unprocessed_output
                                     and skip_save_test_predictions
                                     and skip_save_test_statistics)
        if should_create_exp_dir:
            os.makedirs(experiment_dir_name)

        # postprocess
        postprocessed_output = postprocess(
            prediction_results, model_definition['output_features'],
            train_set_metadata, experiment_dir_name,
            skip_save_unprocessed_output or not is_on_master())

        if not skip_save_test_predictions:
            save_prediction_outputs(postprocessed_output, experiment_dir_name)

        if evaluate_performance:
            print_test_results(prediction_results)
            if not skip_save_test_statistics:
                save_test_statistics(prediction_results, experiment_dir_name)

        logger.info('Saved to: {0}'.format(experiment_dir_name))
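
The newer variant above adds three skip flags, and when all of them are set nothing needs to be written, so the output directory is not even created (see the should_create_exp_dir check). A sketch of such a print-only evaluation run, with placeholder paths:

from ludwig.predict import full_predict  # assumed import path

# Evaluate a saved model and only print the test results: with all three
# skip flags set, no experiment directory is created on disk.
full_predict(
    'results/experiment_run/model',   # hypothetical trained model directory
    data_csv='holdout.csv',           # hypothetical data with ground-truth columns
    evaluate_performance=True,
    skip_save_unprocessed_output=True,
    skip_save_test_predictions=True,
    skip_save_test_statistics=True,
)
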
Example #8
def train_and_eval_on_split(
        model_definition,
        eval_split=VALIDATION,
        data_df=None,
        data_train_df=None,
        data_validation_df=None,
        data_test_df=None,
        data_csv=None,
        data_train_csv=None,
        data_validation_csv=None,
        data_test_csv=None,
        data_hdf5=None,
        data_train_hdf5=None,
        data_validation_hdf5=None,
        data_test_hdf5=None,
        train_set_metadata_json=None,
        experiment_name="hyperopt",
        model_name="run",
        # model_load_path=None,
        # model_resume_path=None,
        skip_save_training_description=False,
        skip_save_training_statistics=False,
        skip_save_model=False,
        skip_save_progress=False,
        skip_save_log=False,
        skip_save_processed_input=False,
        skip_save_unprocessed_output=False,
        skip_save_test_predictions=False,
        skip_save_test_statistics=False,
        output_directory="results",
        gpus=None,
        gpu_memory_limit=None,
        allow_parallel_threads=True,
        use_horovod=False,
        random_seed=default_random_seed,
        debug=False,
        **kwargs
):
    # Collect training and validation losses and metrics
    # & append it to `results`
    # ludwig_model = LudwigModel(modified_model_definition)
    (model, preprocessed_data, experiment_dir_name, train_stats,
     model_definition) = full_train(
        model_definition=model_definition,
        data_df=data_df,
        data_train_df=data_train_df,
        data_validation_df=data_validation_df,
        data_test_df=data_test_df,
        data_csv=data_csv,
        data_train_csv=data_train_csv,
        data_validation_csv=data_validation_csv,
        data_test_csv=data_test_csv,
        data_hdf5=data_hdf5,
        data_train_hdf5=data_train_hdf5,
        data_validation_hdf5=data_validation_hdf5,
        data_test_hdf5=data_test_hdf5,
        train_set_metadata_json=train_set_metadata_json,
        experiment_name=experiment_name,
        model_name=model_name,
        # model_load_path=model_load_path,
        # model_resume_path=model_resume_path,
        skip_save_training_description=skip_save_training_description,
        skip_save_training_statistics=skip_save_training_statistics,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        skip_save_processed_input=skip_save_processed_input,
        output_directory=output_directory,
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
        use_horovod=use_horovod,
        random_seed=random_seed,
        debug=debug,
    )
    (training_set, validation_set, test_set,
     train_set_metadata) = preprocessed_data
    if model_definition[TRAINING]["eval_batch_size"] > 0:
        batch_size = model_definition[TRAINING]["eval_batch_size"]
    else:
        batch_size = model_definition[TRAINING]["batch_size"]

    eval_set = validation_set
    if eval_split == TRAINING:
        eval_set = training_set
    elif eval_split == VALIDATION:
        eval_set = validation_set
    elif eval_split == TEST:
        eval_set = test_set

    test_results = predict(
        eval_set,
        train_set_metadata,
        model,
        model_definition,
        batch_size,
        evaluate_performance=True,
        debug=debug
    )
    if not (skip_save_unprocessed_output and skip_save_test_predictions
            and skip_save_test_statistics):
        if not os.path.exists(experiment_dir_name):
            os.makedirs(experiment_dir_name)

    # postprocess
    postprocessed_output = postprocess(
        test_results,
        model_definition["output_features"],
        train_set_metadata,
        experiment_dir_name,
        skip_save_unprocessed_output,
    )

    print_test_results(test_results)
    if not skip_save_test_predictions:
        save_prediction_outputs(postprocessed_output, experiment_dir_name)
    if not skip_save_test_statistics:
        save_test_statistics(test_results, experiment_dir_name)
    return train_stats, test_results
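
train_and_eval_on_split() is the helper hyperopt uses to score one configuration: it trains with full_train() and then evaluates on the split named by eval_split (the TRAINING, VALIDATION or TEST constants). A hedged sketch, assuming the constants come from ludwig.constants, that train_and_eval_on_split is in scope (it lives in Ludwig's hyperopt execution code), and using placeholder data:

from ludwig.constants import VALIDATION  # assumed location of the split constants

model_definition = {
    'input_features': [{'name': 'review', 'type': 'text'}],
    'output_features': [{'name': 'sentiment', 'type': 'category'}],
    'training': {'epochs': 5, 'batch_size': 64, 'eval_batch_size': 0},
}

train_stats, eval_stats = train_and_eval_on_split(
    model_definition,
    eval_split=VALIDATION,             # evaluate on the validation split
    data_csv='reviews.csv',            # hypothetical dataset
    experiment_name='hyperopt',
    model_name='trial_0',
    output_directory='results',
)
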
Example #9
    def evaluate(
            self,
            dataset=None,
            data_format=None,
            batch_size=128,
            skip_save_unprocessed_output=True,
            skip_save_predictions=True,
            skip_save_eval_stats=True,
            collect_predictions=False,
            collect_overall_stats=False,
            output_directory='results',
            return_type=pd.DataFrame,
            debug=False,
            **kwargs
    ):
        self._check_initialization()

        logger.debug('Preprocessing')

        # preprocessing
        dataset, training_set_metadata = preprocess_for_prediction(
            self.model_definition,
            dataset=dataset,
            data_format=data_format,
            training_set_metadata=self.training_set_metadata,
            include_outputs=True,
        )

        logger.debug('Predicting')
        predictor = Predictor(
            batch_size=batch_size, horovod=self._horovod, debug=debug
        )
        stats, predictions = predictor.batch_evaluation(
            self.model,
            dataset,
            collect_predictions=collect_predictions or collect_overall_stats,
        )

        # calculate the overall metrics
        if collect_overall_stats:
            overall_stats = calculate_overall_stats(
                self.model.output_features,
                predictions,
                dataset,
                training_set_metadata
            )
            stats = {
                of_name: {**stats[of_name], **overall_stats[of_name]}
                # account for presence of 'combined' key
                if of_name in overall_stats else {**stats[of_name]}
                for of_name in stats
            }

        if is_on_master():
            # if we are skipping all saving,
            # there is no need to create a directory that will remain empty
            should_create_exp_dir = not (
                    skip_save_unprocessed_output and
                    skip_save_predictions and
                    skip_save_eval_stats
            )
            if should_create_exp_dir:
                os.makedirs(output_directory, exist_ok=True)

        if collect_predictions:
            logger.debug('Postprocessing')
            postproc_predictions = postprocess(
                predictions,
                self.model.output_features,
                self.training_set_metadata,
                output_directory=output_directory,
                skip_save_unprocessed_output=skip_save_unprocessed_output
                                             or not is_on_master(),
            )
        else:
            postproc_predictions = predictions  # = {}

        if is_on_master():
            if postproc_predictions is not None and not skip_save_predictions:
                save_prediction_outputs(postproc_predictions,
                                        output_directory)

            print_evaluation_stats(stats)
            if not skip_save_eval_stats:
                save_evaluation_stats(stats, output_directory)

            if not skip_save_predictions or not skip_save_eval_stats:
                logger.info('Saved to: {0}'.format(output_directory))

        if collect_predictions:
            postproc_predictions = convert_predictions(
                postproc_predictions,
                self.model.output_features,
                self.training_set_metadata,
                return_type=return_type)

        return stats, postproc_predictions, output_directory
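
evaluate() here appears to be a method of the newer LudwigModel API. A hedged usage sketch with a hypothetical saved model and dataset; collect_predictions and collect_overall_stats control whether postprocessed predictions and dataset-level metrics are computed in addition to the per-batch metrics:

import pandas as pd
from ludwig.api import LudwigModel   # assumed import path

model = LudwigModel.load('results/experiment_run/model')  # hypothetical model directory

eval_stats, predictions, output_dir = model.evaluate(
    dataset='holdout.csv',            # hypothetical file with ground-truth output columns
    batch_size=128,
    collect_predictions=True,         # also return postprocessed predictions
    collect_overall_stats=True,       # add dataset-level metrics on top of per-batch ones
    skip_save_eval_stats=True,        # keep everything in memory, nothing written to disk
    output_directory='results',
    return_type=pd.DataFrame,
)
print(eval_stats)
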
Example #10
    def predict(
            self,
            dataset=None,
            data_format=None,
            batch_size=128,
            skip_save_unprocessed_output=True,
            skip_save_predictions=True,
            output_directory='results',
            return_type=pd.DataFrame,
            debug=False,
            **kwargs
    ):
        self._check_initialization()

        logger.debug('Preprocessing')
        # Copy the list with [:] so that the model definition's input
        # feature list cannot be modified through features_to_load.
        features_to_load = self.model_definition['input_features'][:]

        # preprocessing
        dataset, training_set_metadata = preprocess_for_prediction(
            self.model_definition,
            dataset=dataset,
            data_format=data_format,
            training_set_metadata=self.training_set_metadata,
            include_outputs=False,
        )

        logger.debug('Predicting')
        predictor = Predictor(
            batch_size=batch_size, horovod=self._horovod, debug=debug
        )
        predictions = predictor.batch_predict(
            self.model,
            dataset,
        )

        if is_on_master():
            # if we are skipping all saving,
            # there is no need to create a directory that will remain empty
            should_create_exp_dir = not (
                    skip_save_unprocessed_output and skip_save_predictions
            )
            if should_create_exp_dir:
                os.makedirs(output_directory, exist_ok=True)

        logger.debug('Postprocessing')
        postproc_predictions = convert_predictions(
            postprocess(
                predictions,
                self.model.output_features,
                self.training_set_metadata,
                output_directory=output_directory,
                skip_save_unprocessed_output=skip_save_unprocessed_output
                                             or not is_on_master(),
            ),
            self.model.output_features,
            self.training_set_metadata,
            return_type=return_type
        )

        if is_on_master():
            if not skip_save_predictions:
                save_prediction_outputs(postproc_predictions,
                                        output_directory)

                logger.info('Saved to: {0}'.format(output_directory))

        return postproc_predictions, output_directory
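
predict() is the inference-only counterpart of evaluate(): no ground-truth columns are required and only the postprocessed predictions (plus the output directory) are returned. A hedged sketch with placeholder paths:

import pandas as pd
from ludwig.api import LudwigModel   # assumed import path

model = LudwigModel.load('results/experiment_run/model')  # hypothetical model directory

predictions, output_dir = model.predict(
    dataset='unlabeled.csv',          # hypothetical file without output columns
    batch_size=128,
    skip_save_predictions=True,       # keep results in memory only
    return_type=pd.DataFrame,
)
print(predictions.head())
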
Example #11
    def evaluate(self,
                 dataset=None,
                 data_format=None,
                 batch_size=128,
                 skip_save_unprocessed_output=True,
                 skip_save_predictions=True,
                 skip_save_eval_stats=True,
                 collect_predictions=False,
                 collect_overall_stats=False,
                 output_directory='results',
                 return_type=pd.DataFrame,
                 debug=False,
                 **kwargs):
        self._check_initialization()

        logger.debug('Preprocessing')
        # Concatenating creates a new list, so the model definition's input
        # feature list is not modified when the output features are added.
        features_to_load = self.model_definition['input_features'] + \
                           self.model_definition['output_features']

        # preprocessing
        # todo refactoring: maybe replace the self.model_definition parameter
        #  here with features_to_load
        dataset, training_set_metadata = preprocess_for_prediction(
            self.model_definition,
            dataset=dataset,
            data_format=data_format,
            training_set_metadata=self.training_set_metadata,
            include_outputs=True,
        )

        logger.debug('Predicting')
        predictor = Predictor(batch_size=batch_size,
                              horovod=self._horovod,
                              debug=debug)
        stats, predictions = predictor.batch_evaluation(
            self.model,
            dataset,
            collect_predictions=collect_predictions or collect_overall_stats,
        )

        # calculate the overall metrics
        if collect_overall_stats:
            overall_stats = calculate_overall_stats(self.model.output_features,
                                                    predictions, dataset,
                                                    training_set_metadata)
            stats = {
                of_name: {
                    **stats[of_name],
                    **overall_stats[of_name]
                }
                # account for presence of 'combined' key
                if of_name in overall_stats else {
                    **stats[of_name]
                }
                for of_name in stats
            }

        if is_on_master():
            # if we are skipping all saving,
            # there is no need to create a directory that will remain empty
            should_create_exp_dir = not (skip_save_unprocessed_output
                                         and skip_save_predictions
                                         and skip_save_eval_stats)
            if should_create_exp_dir:
                os.makedirs(output_directory, exist_ok=True)

        if collect_predictions:
            logger.debug('Postprocessing')
            postproc_predictions = postprocess(
                predictions,
                self.model.output_features,
                self.training_set_metadata,
                output_directory=output_directory,
                skip_save_unprocessed_output=skip_save_unprocessed_output
                or not is_on_master(),
            )
        else:
            postproc_predictions = predictions  # = {}

        if is_on_master():
            if postproc_predictions is not None and not skip_save_predictions:
                save_prediction_outputs(postproc_predictions, output_directory)

            print_evaluation_stats(stats)
            if not skip_save_eval_stats:
                save_evaluation_stats(stats, output_directory)

            if not skip_save_predictions or not skip_save_eval_stats:
                logger.info('Saved to: {0}'.format(output_directory))

        if collect_predictions:
            postproc_predictions = convert_predictions(
                postproc_predictions,
                self.model.output_features,
                self.training_set_metadata,
                return_type=return_type)

        return stats, postproc_predictions, output_directory
Example #12
    def _predict(
        self,
        data_df=None,
        data_csv=None,
        data_dict=None,
        return_type=pd.DataFrame,
        batch_size=128,
        gpus=None,
        gpu_fraction=1,
        evaluate_performance=False,
        logging_level=logging.ERROR,
    ):
        logging.getLogger('ludwig').setLevel(logging_level)
        if logging_level in {logging.WARNING, logging.ERROR, logging.CRITICAL}:
            set_disable_progressbar(True)

        if (self.model is None or self.model_definition is None
                or self.train_set_metadata is None):
            raise ValueError('Model has not been trained or loaded')

        if data_df is None:
            data_df = self._read_data(data_csv, data_dict)

        logger.debug('Preprocessing {} datapoints'.format(len(data_df)))
        # Use [:] to copy the list: appending the output features below
        # would otherwise modify the model definition's input feature list,
        # which we definitely do not want.
        features_to_load = self.model_definition['input_features'][:]
        if evaluate_performance:
            output_features = self.model_definition['output_features']
        else:
            output_features = []
        features_to_load += output_features

        preprocessed_data = build_data(data_df, features_to_load,
                                       self.train_set_metadata,
                                       self.model_definition['preprocessing'])
        replace_text_feature_level(features_to_load, [preprocessed_data])
        dataset = Dataset(preprocessed_data,
                          self.model_definition['input_features'],
                          output_features, None)

        logger.debug('Predicting')
        predict_results = self.model.predict(
            dataset,
            batch_size,
            evaluate_performance=evaluate_performance,
            gpus=gpus,
            gpu_fraction=gpu_fraction,
            session=getattr(self.model, 'session', None))

        if evaluate_performance:
            calculate_overall_stats(predict_results,
                                    self.model_definition['output_features'],
                                    dataset, self.train_set_metadata)

        logger.debug('Postprocessing')
        if (return_type == 'dict' or return_type == 'dictionary'
                or return_type == dict):
            postprocessed_predictions = postprocess(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)
        elif (return_type == 'dataframe' or return_type == 'df'
              or return_type == pd.DataFrame):
            postprocessed_predictions = postprocess_df(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)
        else:
            logger.warning('Unrecognized return_type: {}. '
                           'Returning DataFrame.'.format(return_type))
            postprocessed_predictions = postprocess(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)

        return postprocessed_predictions, predict_results