Example #1
    def put(self, training_set, test_set, validation_set,
            training_set_metadata):
        logger.info("Writing preprocessed training set cache")
        training_set = self.dataset_manager.save(
            self.cache_map[TRAINING],
            training_set,
            self.config,
            training_set_metadata,
            TRAINING,
        )

        if test_set is not None:
            logger.info("Writing preprocessed test set cache")
            test_set = self.dataset_manager.save(
                self.cache_map[TEST],
                test_set,
                self.config,
                training_set_metadata,
                TEST,
            )

        if validation_set is not None:
            logger.info("Writing preprocessed validation set cache")
            validation_set = self.dataset_manager.save(
                self.cache_map[VALIDATION],
                validation_set,
                self.config,
                training_set_metadata,
                VALIDATION,
            )

        logger.info("Writing train set metadata")
        data_utils.save_json(self.cache_map[META], training_set_metadata)

        return training_set, test_set, validation_set, training_set_metadata
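
The common thread in these examples is the `save_json` helper. Its implementation is not shown on this page; below is a minimal sketch, assuming it simply serializes the given object to a JSON file (the real Ludwig helper may additionally handle NumPy types and file-opening details):

```python
import json


def save_json(data_fp, data, sort_keys=True, indent=4):
    """Hypothetical minimal version of the helper used throughout these examples."""
    with open(data_fp, "w") as output_file:
        json.dump(data, output_file, sort_keys=sort_keys, indent=indent)
```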
Example #2
def create_metrics_report(experiment_name: str) -> Tuple[Dict[str, Any], str]:
    """Compiles performance and non-performance metrics.

    `experiment_name`: name referring to the experiment.
    Returns a full report and the path where it's saved.
    """
    full_report = dict()
    os.makedirs(os.path.join(os.getcwd(), experiment_name, "metrics_report"), exist_ok=True)
    for tag in [TRAIN_TAG, EVAL_TAG]:
        if tag == TRAIN_TAG:
            resource_usage_path = os.path.join(os.getcwd(), experiment_name, CACHE, "train_resource_usage_metrics.json")
            performance_path = os.path.join(os.getcwd(), experiment_name, EXPERIMENT_RUN, "training_statistics.json")
        elif tag == EVAL_TAG:
            resource_usage_path = os.path.join(
                os.getcwd(), experiment_name, CACHE, "evaluate_resource_usage_metrics.json"
            )
            performance_path = os.path.join(os.getcwd(), experiment_name, EXPERIMENT_RUN, "test_statistics.json")
        else:
            raise ValueError("Tag unrecognized. Please choose 'train' or 'evaluate'.")

        resource_usage_metrics = load_json(resource_usage_path)
        performance_metrics = load_json(performance_path)
        full_report[tag] = merge_dict(performance_metrics, resource_usage_metrics)

    merged_file_path = os.path.join(os.getcwd(), experiment_name, "metrics_report", "{}.json".format("full_report"))
    save_json(merged_file_path, full_report)
    return full_report, merged_file_path
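
A hypothetical invocation, assuming an experiment directory named `my_experiment` with the expected cache and run subdirectories already exists under the current working directory (`my_experiment` is a placeholder name, not taken from the example above):

```python
# Illustrative only: "my_experiment" is a placeholder experiment name.
full_report, report_path = create_metrics_report("my_experiment")
print(report_path)  # .../my_experiment/metrics_report/full_report.json
```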
Example #3
    def save(self, save_path):
        """This function allows to save models on disk

        # Inputs

        :param  save_path: (string) path to the directory where the model is
                going to be saved. Both a JSON file containing the model
                architecture hyperparameters and checkpoint files containing
                model weights will be saved.


        # Example usage

        ```python
        ludwig_model.save(save_path)
        ```

        """
        self._check_initialization()

        # save model definition
        self.save_model_definition(save_path)

        # save model weights
        model_weights_path = os.path.join(save_path, MODEL_WEIGHTS_FILE_NAME)
        self.model.save_weights(model_weights_path)

        # save training set metadata
        training_set_metadata_path = os.path.join(
            save_path,
            TRAIN_SET_METADATA_FILE_NAME
        )
        save_json(training_set_metadata_path, self.training_set_metadata)
Example #4
    def save_model_definition(self, save_path):
        os.makedirs(save_path, exist_ok=True)
        model_hyperparameters_path = os.path.join(
            save_path,
            MODEL_HYPERPARAMETERS_FILE_NAME
        )
        save_json(model_hyperparameters_path, self.model_definition)
Example #5
    def save(self, save_path):
        """This function allows to save models on disk

        # Inputs

        :param  save_path: (string) path to the directory where the model is
                going to be saved. Both a JSON file containing the model
                architecture hyperparameters and checkpoint files containing
                model weights will be saved.


        # Example usage

        ```python
        ludwig_model.save(save_path)
        ```

        """
        if self.model is None or self.model_definition is None or self.train_set_metadata is None:
            raise ValueError('Model has not been initialized or loaded')

        model_weights_path = os.path.join(save_path, MODEL_WEIGHTS_FILE_NAME)

        model_hyperparameters_path = os.path.join(
            save_path, MODEL_HYPERPARAMETERS_FILE_NAME)

        self.model.model.save_weights(model_weights_path)

        train_set_metadata_path = os.path.join(save_path,
                                               TRAIN_SET_METADATA_FILE_NAME)
        save_json(train_set_metadata_path, self.train_set_metadata)

        self.model.save_hyperparameters(self.model._hyperparameters,
                                        model_hyperparameters_path)
Example #6
def kfold_cross_validate_cli(
        k_fold,
        config=None,
        config_file=None,
        dataset=None,
        data_format=None,
        output_directory='results',
        random_seed=default_random_seed,
        skip_save_k_fold_split_indices=False,
        **kwargs
):
    """Wrapper function to performs k-fold cross validation.

    # Inputs
    :param k_fold: (int) number of folds to create for the cross-validation
    :param config: (dict, default: None) a dictionary containing
            information needed to build a model. Refer to the [User Guide]
           (http://ludwig.ai/user_guide/#model-config) for details.
    :param config_file: (string, optional, default: `None`) path to
           a YAML file containing the config. If available it will be
           used instead of the config dict.
    :param dataset: (string, default: None) source of the dataset used for
           the k-fold splits.
    :param data_format: (string, default: None) format of the dataset.
    :param output_directory: (string, default: 'results')
    :param random_seed: (int) Random seed used for k-fold splits.
    :param skip_save_k_fold_split_indices: (boolean, default: False) Disables
            saving k-fold split indices

    :return: None
    """

    if config is None and config_file is None:
        raise ValueError(
            "No config is provided 'config' or "
            "'config_file' must be provided."
        )
    elif config is not None and config_file is not None:
        raise ValueError(
            "Cannot specify both 'config' and 'config_file'"
            ", proivde only one of the parameters."
        )

    (kfold_cv_stats,
     kfold_split_indices) = kfold_cross_validate(
        k_fold,
        config=config if config is not None else
        config_file,
        dataset=dataset,
        data_format=data_format,
        output_directory=output_directory,
        random_seed=random_seed
    )

    # save k-fold cv statistics
    save_json(os.path.join(output_directory, 'kfold_training_statistics.json'),
              kfold_cv_stats)

    # save k-fold split indices
    if not skip_save_k_fold_split_indices:
        save_json(os.path.join(output_directory, 'kfold_split_indices.json'),
                  kfold_split_indices)
Example #7
def save_prediction_outputs(
    postprocessed_output,
    output_directory,
    backend,
):
    postprocessed_output, column_shapes = flatten_df(postprocessed_output, backend)
    postprocessed_output.to_parquet(os.path.join(output_directory, "predictions.parquet"))
    save_json(os.path.join(output_directory, "predictions.shapes.json"), column_shapes)
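
The shapes file written here can later be read back with the `load_json` counterpart that appears in other examples on this page; a small hedged sketch:

```python
# Assumes `output_directory` is the same directory passed to
# save_prediction_outputs above; load_json is the complementary reader
# used elsewhere in these examples.
column_shapes = load_json(os.path.join(output_directory, "predictions.shapes.json"))
```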
Example #8
    def on_trainer_train_setup(self, trainer, save_path, is_coordinator):
        if not is_coordinator:
            return

        # When running on a remote worker, the model metadata files will only have been
        # saved to the driver process, so re-save it here before uploading.
        training_set_metadata_path = os.path.join(
            save_path, TRAIN_SET_METADATA_FILE_NAME)
        if not os.path.exists(training_set_metadata_path):
            save_json(training_set_metadata_path, self.training_set_metadata)

        model_hyperparameters_path = os.path.join(
            save_path, MODEL_HYPERPARAMETERS_FILE_NAME)
        if not os.path.exists(model_hyperparameters_path):
            save_json(model_hyperparameters_path, self.config)
Example #9
def save_prediction_outputs(
    postprocessed_output,
    output_features,
    output_directory,
    backend,
):
    postprocessed_output, column_shapes = flatten_df(postprocessed_output, backend)
    postprocessed_output.to_parquet(os.path.join(output_directory, PREDICTIONS_PARQUET_FILE_NAME))
    save_json(os.path.join(output_directory, PREDICTIONS_SHAPES_FILE_NAME), column_shapes)
    if not backend.df_engine.partitioned:
        # csv can only be written out for unpartitioned df format (i.e., pandas)
        postprocessed_dict = convert_to_dict(postprocessed_output, output_features)
        csv_filename = os.path.join(output_directory, "{}_{}.csv")
        for output_field, outputs in postprocessed_dict.items():
            for output_type, values in outputs.items():
                save_csv(csv_filename.format(output_field, output_type), values)
Example #10
    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Waits for monitoring process to exit.

        Computes and postprocesses more metrics. Saves report.
        """
        self.queue.put(STOP_MESSAGE)
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        self.p.join()

        self.info = load_json(
            os.path.join(self.output_dir, self.info["tag"] + "_temp.json"))
        os.remove(
            os.path.join(self.output_dir, self.info["tag"] + "_temp.json"))

        self.info["end_time"] = time.time()
        self.info[f"{self.tag}_total_duration"] = self.info[
            "end_time"] - self.info["start_time"]

        if self.num_examples:
            self.info["examples_per_second"] = self.num_examples / self.info[
                f"{self.tag}_total_duration"]
        self.info["end_disk_usage"] = shutil.disk_usage(
            os.path.expanduser("~")).used
        self.info["disk_footprint"] = self.info["end_disk_usage"] - self.info[
            "start_disk_usage"]

        for key in self.info["system"]:
            if "gpu_" in key:
                self.info["system"][key]["max_memory_used"] = max(
                    self.info["system"][key]["memory_used"])
        self.info["system"]["max_cpu_utilization"] = max(
            self.info["system"]["cpu_utilization"], default=None)
        self.info["system"]["max_ram_utilization"] = max(
            self.info["system"]["ram_utilization"], default=None)

        if self.info["system"]["cpu_utilization"]:
            self.info["system"]["average_cpu_utilization"] = mean(
                self.info["system"]["cpu_utilization"])
        if self.info["system"]["ram_utilization"]:
            self.info["system"]["average_ram_utilization"] = mean(
                self.info["system"]["ram_utilization"])

        save_json(
            os.path.join(self.output_dir,
                         self.info["tag"] + "_resource_usage_metrics.json"),
            self.info)
Example #11
    def put_dataset(self, input_fname, config, processed,
                    skip_save_processed_input):
        if not self.can_cache(input_fname, config, skip_save_processed_input):
            return processed

        training_set, test_set, validation_set, training_set_metadata = processed
        key = training_set_metadata.get(CHECKSUM)
        if not key:
            key = self.get_cache_key(input_fname, config)
            training_set_metadata[CHECKSUM] = key

        logger.info('Writing preprocessed training set cache')
        training_set = self.save(
            self.get_cache_path(input_fname, key, TRAINING),
            training_set,
            config,
            training_set_metadata,
            TRAINING,
        )

        if test_set is not None:
            logger.info('Writing preprocessed test set cache')
            test_set = self.save(
                self.get_cache_path(input_fname, key, TEST),
                test_set,
                config,
                training_set_metadata,
                TEST,
            )

        if validation_set is not None:
            logger.info('Writing preprocessed validation set cache')
            validation_set = self.save(
                self.get_cache_path(input_fname, key, VALIDATION),
                validation_set,
                config,
                training_set_metadata,
                VALIDATION,
            )

        logger.info('Writing train set metadata')
        data_utils.save_json(
            self.get_cache_path(input_fname, key, 'meta', 'json'),
            training_set_metadata)

        return training_set, test_set, validation_set, training_set_metadata
Example #12
def kfold_cross_validate_cli(k_fold,
                             model_definition=None,
                             model_definition_file=None,
                             data_csv=None,
                             output_directory='results',
                             random_seed=default_random_seed,
                             skip_save_k_fold_split_indices=False,
                             **kwargs):
    """Wrapper function to performs k-fold cross validation.

    # Inputs
    :param k_fold: (int) number of folds to create for the cross-validation
    :param model_definition: (dict, default: None) a dictionary containing
            information needed to build a model. Refer to the [User Guide]
           (http://ludwig.ai/user_guide/#model-definition) for details.
    :param model_definition_file: (string, optional, default: `None`) path to
           a YAML file containing the model definition. If available it will be
           used instead of the model_definition dict.
    :param data_csv: (string, default: None)
    :param output_directory: (string, default: 'results')
    :param random_seed: (int) Random seed used for k-fold splits.
    :param skip_save_k_fold_split_indices: (boolean, default: False) Disables
            saving k-fold split indices

    :return: None
    """
    model_definition = check_which_model_definition(model_definition,
                                                    model_definition_file)

    (kfold_cv_stats, kfold_split_indices) = kfold_cross_validate(
        k_fold,
        model_definition=model_definition,
        data_csv=data_csv,
        output_directory=output_directory,
        random_seed=random_seed)

    # save k-fold cv statistics
    save_json(os.path.join(output_directory, 'kfold_training_statistics.json'),
              kfold_cv_stats)

    # save k-fold split indices
    if not skip_save_k_fold_split_indices:
        save_json(os.path.join(output_directory, 'kfold_split_indices.json'),
                  kfold_split_indices)
Example #13
def kfold_cross_validate_cli(
    k_fold,
    config=None,
    dataset=None,
    data_format=None,
    output_directory="results",
    random_seed=default_random_seed,
    skip_save_k_fold_split_indices=False,
    **kwargs,
):
    """Wrapper function to performs k-fold cross validation.

    # Inputs
    :param k_fold: (int) number of folds to create for the cross-validation
    :param config: (Union[str, dict], default: None) a dictionary or file path
            containing model configuration. Refer to the [User Guide]
           (http://ludwig.ai/user_guide/#model-config) for details.
    :param dataset: (string, default: None)
    :param output_directory: (string, default: 'results')
    :param random_seed: (int) Random seed used for k-fold splits.
    :param skip_save_k_fold_split_indices: (boolean, default: False) Disables
            saving k-fold split indices

    :return: None
    """

    (kfold_cv_stats, kfold_split_indices) = kfold_cross_validate(
        k_fold,
        config=config,
        dataset=dataset,
        data_format=data_format,
        output_directory=output_directory,
        random_seed=random_seed,
    )

    # save k-fold cv statistics
    save_json(os.path.join(output_directory, "kfold_training_statistics.json"),
              kfold_cv_stats)

    # save k-fold split indices
    if not skip_save_k_fold_split_indices:
        save_json(os.path.join(output_directory, "kfold_split_indices.json"),
                  kfold_split_indices)
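
A hedged usage sketch for the CLI wrapper above, assuming a local CSV file `data.csv` and a deliberately tiny illustrative config (the feature names and types are placeholders, not taken from the original examples):

```python
# Placeholder config and dataset path for illustration only.
config = {
    "input_features": [{"name": "text", "type": "text"}],
    "output_features": [{"name": "label", "type": "category"}],
}

kfold_cross_validate_cli(
    k_fold=5,
    config=config,
    dataset="data.csv",
    data_format="csv",
    output_directory="results",
)
```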
Example #14
def monitor(queue: multiprocessing.Queue, info: Dict[str, Any],
            output_dir: str, logging_interval: int) -> None:
    """Monitors hardware resource use as part of a separate process.

    Populates `info` with system-specific metrics (GPU, CPU, RAM) every `logging_interval` seconds and saves the output
    in `output_dir`.

    Args:
        queue: queue used to push and retrieve messages sent to the child process.
        info: dictionary containing system resource usage information about the parent process.
        output_dir: directory where the contents of `info` will be saved.
        logging_interval: time interval at which we will poll the system for usage metrics.
    """
    for key in info["system"]:
        if "gpu_" in key:
            info["system"][key]["memory_used"] = []
    info["system"]["cpu_utilization"] = []
    info["system"]["ram_utilization"] = []

    while True:
        try:
            message = queue.get(block=False)
            if isinstance(message, str):
                if message == STOP_MESSAGE:
                    save_json(
                        os.path.join(output_dir, info["tag"] + "_temp.json"),
                        info)
                    return
            else:
                queue.put(message)
        except EmptyQueueException:
            pass
        if torch.cuda.is_available():
            gpu_infos = GPUStatCollection.new_query()
            for i, gpu_info in enumerate(gpu_infos):
                gpu_key = f"gpu_{i}"
                info["system"][gpu_key]["memory_used"].append(
                    gpu_info.memory_used)
        info["system"]["cpu_utilization"].append(psutil.cpu_percent())
        info["system"]["ram_utilization"].append(
            psutil.virtual_memory().percent)
        time.sleep(logging_interval)
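
A hedged sketch of how `monitor` might be wired up as a child process and later stopped, mirroring the `__exit__` logic shown in Example #10 (the `info` dictionary contents and the output directory are placeholders; the directory is assumed to exist):

```python
import multiprocessing

if __name__ == "__main__":
    # Placeholder info dictionary; the real one is populated by the caller.
    info = {"tag": "train", "system": {}, "start_time": 0.0}

    queue = multiprocessing.Queue()
    p = multiprocessing.Process(target=monitor, args=(queue, info, "output", 1.0))
    p.start()

    # ... run training here ...

    queue.put(STOP_MESSAGE)  # tells the monitor loop to save its temp JSON and exit
    p.join()
```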
Example #15
    def on_trainer_train_setup(self, trainer, save_path, is_coordinator):
        if not is_coordinator:
            return

        # When running on a remote worker, the model metadata files will only have been
        # saved to the driver process, so re-save it here before uploading.
        training_set_metadata_path = os.path.join(save_path, TRAIN_SET_METADATA_FILE_NAME)
        if not os.path.exists(training_set_metadata_path):
            save_json(training_set_metadata_path, self.training_set_metadata)

        model_hyperparameters_path = os.path.join(save_path, MODEL_HYPERPARAMETERS_FILE_NAME)
        if not os.path.exists(model_hyperparameters_path):
            save_json(model_hyperparameters_path, self.config)

        if self.save_in_background:
            save_queue = queue.Queue()
            self.save_fn = lambda args: save_queue.put(args)
            self.save_thread = threading.Thread(target=_log_mlflow_loop, args=(save_queue,))
            self.save_thread.start()
        else:
            self.save_fn = lambda args: _log_mlflow(*args)
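
The background-saving branch above pushes argument tuples onto a queue that is consumed by `_log_mlflow_loop`. That consumer is not shown on this page; below is a hedged sketch of what such a loop could look like, assuming a `None` sentinel shuts it down. This is only an illustration, not the actual Ludwig implementation:

```python
def _log_mlflow_loop(save_queue):
    # Hypothetical consumer: drain argument tuples and forward them to
    # _log_mlflow until a None sentinel is received.
    while True:
        args = save_queue.get()
        if args is None:
            break
        _log_mlflow(*args)
```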
Example #16
def full_train(model_definition,
               model_definition_file=None,
               data_df=None,
               data_train_df=None,
               data_validation_df=None,
               data_test_df=None,
               data_csv=None,
               data_train_csv=None,
               data_validation_csv=None,
               data_test_csv=None,
               data_hdf5=None,
               data_train_hdf5=None,
               data_validation_hdf5=None,
               data_test_hdf5=None,
               train_set_metadata_json=None,
               experiment_name='experiment',
               model_name='run',
               model_load_path=None,
               model_resume_path=None,
               skip_save_model=False,
               skip_save_progress=False,
               skip_save_log=False,
               skip_save_processed_input=False,
               output_directory='results',
               should_close_session=True,
               gpus=None,
               gpu_fraction=1.0,
               use_horovod=False,
               random_seed=42,
               debug=False,
               **kwargs):
    """*full_train* defines the entire training procedure used by Ludwig's
    internals. Requires most of the parameters that are taken into the model.
    Builds a full ludwig model and performs the training.
    :param data_test_df:
    :param data_df:
    :param data_train_df:
    :param data_validation_df:
    :param model_definition: Model definition which defines the different
           parameters of the model, features, preprocessing and training.
    :type model_definition: Dictionary
    :param model_definition_file: The file that specifies the model definition.
           It is a yaml file.
    :type model_definition_file: filepath (str)
    :param data_csv: A CSV file containing the input data which is used to
           train, validate and test a model. The CSV either contains a
           split column or will be split.
    :type data_csv: filepath (str)
    :param data_train_csv: A CSV file containing the input data which is used
           to train a model.
    :type data_train_csv: filepath (str)
    :param data_validation_csv: A CSV file containing the input data which is used
           to validate a model.
    :type data_validation_csv: filepath (str)
    :param data_test_csv: A CSV file containing the input data which is used
           to test a model.
    :type data_test_csv: filepath (str)
    :param data_hdf5: If the dataset is in the hdf5 format, this is used instead
           of the csv file.
    :type data_hdf5: filepath (str)
    :param data_train_hdf5: If the training set is in the hdf5 format, this is
           used instead of the csv file.
    :type data_train_hdf5: filepath (str)
    :param data_validation_hdf5: If the validation set is in the hdf5 format,
           this is used instead of the csv file.
    :type data_validation_hdf5: filepath (str)
    :param data_test_hdf5: If the test set is in the hdf5 format, this is
           used instead of the csv file.
    :type data_test_hdf5: filepath (str)
    :param train_set_metadata_json: If the dataset is in hdf5 format, this is
           the associated json file containing metadata.
    :type train_set_metadata_json: filepath (str)
    :param experiment_name: The name for the experiment.
    :type experiment_name: Str
    :param model_name: Name of the model that is being used.
    :type model_name: Str
    :param model_load_path: If this is specified the loaded model will be used
           as initialization (useful for transfer learning).
    :type model_load_path: filepath (str)
    :param model_resume_path: Resumes training of the model from the path
           specified. The difference with model_load_path is that also training
           statistics like the current epoch and the loss and performance so
           far are also resumed, effectively continuing a previously interrupted
           training process.
    :type model_resume_path: filepath (str)
    :param skip_save_model: Disables
           saving model weights and hyperparameters each time the model
           improves. By default Ludwig saves model weights after each epoch
           in which the validation measure improves. If the model is really big,
           saving can be time consuming; if you do not want to keep the weights
           and just want to find out what performance a model can get with a
           set of hyperparameters, use this parameter to skip saving, but the
           model will not be loadable later on.
    :type skip_save_model: Boolean
    :param skip_save_progress: Disables saving
           progress each epoch. By default Ludwig saves weights and stats
           after each epoch to enable resuming of training, but if
           the model is really big that can be time consuming and will use
           twice as much space; use this parameter to skip it, but training
           cannot be resumed later on.
    :type skip_save_progress: Boolean
    :param skip_save_processed_input: If a CSV dataset is provided it is
           preprocessed and then saved as an hdf5 and json to avoid running
           the preprocessing again. If this parameter is False,
           the hdf5 and json file are not saved.
    :type skip_save_processed_input: Boolean
    :param skip_save_log: Disables saving TensorBoard
           logs. By default Ludwig saves TensorBoard logs, but if they
           are not needed, turning this off can slightly increase the
           overall speed.
    :type skip_save_log: Boolean
    :param output_directory: The directory that will contain the training
           statistics, the saved model and the training progress files.
    :type output_directory: filepath (str)
    :param gpus: List of GPUs that are available for training.
    :type gpus: List
    :param gpu_fraction: Fraction of the memory of each GPU to use at
           the beginning of the training. The memory may grow elastically.
    :type gpu_fraction: Float
    :param random_seed: Random seed used for weights initialization,
           splits and any other random function.
    :type random_seed: Integer
    :param debug: If true turns on tfdbg with inf_or_nan checks.
    :type debug: Boolean
    :returns: None
    """
    # set input features defaults
    if model_definition_file is not None:
        with open(model_definition_file, 'r') as def_file:
            model_definition = merge_with_defaults(yaml.safe_load(def_file))
    else:
        model_definition = merge_with_defaults(model_definition)

    # setup directories and file names
    experiment_dir_name = None
    if model_resume_path is not None:
        if os.path.exists(model_resume_path):
            experiment_dir_name = model_resume_path
        else:
            if is_on_master():
                logger.info('Model resume path does not exist, '
                            'starting training from scratch')
            model_resume_path = None

    if model_resume_path is None:
        if is_on_master():
            experiment_dir_name = get_experiment_dir_name(
                output_directory, experiment_name, model_name)
        else:
            experiment_dir_name = '.'

    # if model_load_path is not None, load its train_set_metadata
    if model_load_path is not None:
        train_set_metadata_json = os.path.join(model_load_path,
                                               TRAIN_SET_METADATA_FILE_NAME)

    description_fn, training_stats_fn, model_dir = get_file_names(
        experiment_dir_name)

    # save description
    description = get_experiment_description(
        model_definition,
        data_csv=data_csv,
        data_train_csv=data_train_csv,
        data_validation_csv=data_validation_csv,
        data_test_csv=data_test_csv,
        data_hdf5=data_hdf5,
        data_train_hdf5=data_train_hdf5,
        data_validation_hdf5=data_validation_hdf5,
        data_test_hdf5=data_test_hdf5,
        metadata_json=train_set_metadata_json,
        random_seed=random_seed)
    if is_on_master():
        save_json(description_fn, description)
        # print description
        logger.info('Experiment name: {}'.format(experiment_name))
        logger.info('Model name: {}'.format(model_name))
        logger.info('Output path: {}'.format(experiment_dir_name))
        logger.info('\n')
        for key, value in description.items():
            logger.info('{}: {}'.format(key, pformat(value, indent=4)))
        logger.info('\n')

    # preprocess
    preprocessed_data = preprocess_for_training(
        model_definition,
        data_df=data_df,
        data_train_df=data_train_df,
        data_validation_df=data_validation_df,
        data_test_df=data_test_df,
        data_csv=data_csv,
        data_train_csv=data_train_csv,
        data_validation_csv=data_validation_csv,
        data_test_csv=data_test_csv,
        data_hdf5=data_hdf5,
        data_train_hdf5=data_train_hdf5,
        data_validation_hdf5=data_validation_hdf5,
        data_test_hdf5=data_test_hdf5,
        train_set_metadata_json=train_set_metadata_json,
        skip_save_processed_input=skip_save_processed_input,
        preprocessing_params=model_definition['preprocessing'],
        random_seed=random_seed)

    (training_set, validation_set, test_set,
     train_set_metadata) = preprocessed_data

    if is_on_master():
        logger.info('Training set: {0}'.format(training_set.size))
        if validation_set is not None:
            logger.info('Validation set: {0}'.format(validation_set.size))
        if test_set is not None:
            logger.info('Test set: {0}'.format(test_set.size))

    # update model definition with metadata properties
    update_model_definition_with_metadata(model_definition, train_set_metadata)

    if is_on_master():
        if not skip_save_model:
            # save train set metadata
            os.makedirs(model_dir, exist_ok=True)
            save_json(os.path.join(model_dir, TRAIN_SET_METADATA_FILE_NAME),
                      train_set_metadata)

    # run the experiment
    model, result = train(training_set=training_set,
                          validation_set=validation_set,
                          test_set=test_set,
                          model_definition=model_definition,
                          save_path=model_dir,
                          model_load_path=model_load_path,
                          resume=model_resume_path is not None,
                          skip_save_model=skip_save_model,
                          skip_save_progress=skip_save_progress,
                          skip_save_log=skip_save_log,
                          gpus=gpus,
                          gpu_fraction=gpu_fraction,
                          use_horovod=use_horovod,
                          random_seed=random_seed,
                          debug=debug)

    train_trainset_stats, train_valisest_stats, train_testset_stats = result
    train_stats = {
        'train': train_trainset_stats,
        'validation': train_valisest_stats,
        'test': train_testset_stats
    }

    if should_close_session:
        model.close_session()

    if is_on_master():
        # save training and test statistics
        save_json(training_stats_fn, train_stats)

    # grab the results of the model with highest validation test performance
    validation_field = model_definition['training']['validation_field']
    validation_measure = model_definition['training']['validation_measure']
    validation_field_result = train_valisest_stats[validation_field]

    best_function = get_best_function(validation_measure)
    # results of the model with highest validation test performance
    if is_on_master() and validation_set is not None:
        epoch_best_vali_measure, best_vali_measure = best_function(
            enumerate(validation_field_result[validation_measure]),
            key=lambda pair: pair[1])
        logger.info(
            'Best validation model epoch: {0}'.format(epoch_best_vali_measure +
                                                      1))
        logger.info(
            'Best validation model {0} on validation set {1}: {2}'.format(
                validation_measure, validation_field, best_vali_measure))
        if test_set is not None:
            best_vali_measure_epoch_test_measure = train_testset_stats[
                validation_field][validation_measure][epoch_best_vali_measure]

            logger.info(
                'Best validation model {0} on test set {1}: {2}'.format(
                    validation_measure, validation_field,
                    best_vali_measure_epoch_test_measure))
        logger.info('\nFinished: {0}_{1}'.format(experiment_name, model_name))
        logger.info('Saved to: {0}'.format(experiment_dir_name))

    contrib_command("train_save", experiment_dir_name)

    return (model, preprocessed_data, experiment_dir_name, train_stats,
            model_definition)
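
A hedged example of calling `full_train` directly, assuming a CSV file `data.csv` on disk; the model definition below is a minimal placeholder, not one taken from the examples above:

```python
# Placeholder model definition and dataset path for illustration only.
model_definition = {
    "input_features": [{"name": "text", "type": "text"}],
    "output_features": [{"name": "label", "type": "category"}],
}

model, preprocessed_data, output_dir, train_stats, model_definition = full_train(
    model_definition,
    data_csv="data.csv",
    experiment_name="experiment",
    model_name="run",
)
```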
Example #17
    def train(self,
              data_df=None,
              data_train_df=None,
              data_validation_df=None,
              data_test_df=None,
              data_csv=None,
              data_train_csv=None,
              data_validation_csv=None,
              data_test_csv=None,
              data_hdf5=None,
              data_train_hdf5=None,
              data_validation_hdf5=None,
              data_test_hdf5=None,
              train_set_metadata_json=None,
              dataset_type='generic',
              model_name='run',
              model_load_path=None,
              model_resume_path=None,
              skip_save_model=False,
              skip_save_progress=False,
              skip_save_log=False,
              skip_save_processed_input=False,
              output_directory='results',
              gpus=None,
              gpu_fraction=1.0,
              random_seed=42,
              logging_level=logging.ERROR,
              debug=False,
              **kwargs):
        """This function is used to perform a full training of the model on the 
           specified dataset.

        # Inputs

        :param data_df: (DataFrame) dataframe containing data. If it has a split
               column, it will be used for splitting (0: train, 1: validation,
               2: test), otherwise the dataset will be randomly split
        :param data_train_df: (DataFrame) dataframe containing training data
        :param data_validation_df: (DataFrame) dataframe containing validation
               data
        :param data_test_df: (DataFrame) dataframe containing test data
        :param data_csv: (string) input data CSV file. If it has a split column,
               it will be used for splitting (0: train, 1: validation, 2: test),
               otherwise the dataset will be randomly split
        :param data_train_csv: (string) input train data CSV file
        :param data_validation_csv: (string) input validation data CSV file
        :param data_test_csv: (string) input test data CSV file
        :param data_hdf5: (string) input data HDF5 file. It is an intermediate
               preprocessed version of the input CSV created the first time a CSV
               file is used in the same directory with the same name and an hdf5
               extension
        :param data_train_hdf5: (string) input train data HDF5 file. It is an
               intermediate preprocessed version of the input CSV created the
               first time a CSV file is used in the same directory with the same
               name and an hdf5 extension
        :param data_validation_hdf5: (string) input validation data HDF5 file.
               It is an intermediate preprocessed version of the input CSV created
               the first time a CSV file is used in the same directory with the
               same name and an hdf5 extension
        :param data_test_hdf5: (string) input test data HDF5 file. It is an
               intermediate preprocessed version of the input CSV created the
               first time a CSV file is used in the same directory with the same
               name and an hdf5 extension
        :param train_set_metadata_json: (string) input metadata JSON file. It is an
               intermediate preprocess file containing the mappings of the input
               CSV created the first time a CSV file is used in the same
               directory with the same name and a json extension
        :param dataset_type: (string, default: `'generic'`) determines the type
               of preprocessing that will be applied to the data. Only `generic` is
               available at the moment
        :param model_name: (string) a name for the model, used for the save
               directory
        :param model_load_path: (string) path of a pretrained model to load as
               initialization
        :param model_resume_path: (string) path of the model directory to
               resume training from
        :param skip_save_model: (bool, default: `False`) disables
               saving model weights and hyperparameters each time the model
               improves. By default Ludwig saves model weights after each epoch
               in which the validation measure improves. If the model is really big,
               saving can be time consuming; if you do not want to keep the weights
               and just want to find out what performance a model can get with a
               set of hyperparameters, use this parameter to skip saving, but the
               model will not be loadable later on.
        :param skip_save_progress: (bool, default: `False`) disables saving
               progress each epoch. By default Ludwig saves weights and stats
               after each epoch to enable resuming of training, but if
               the model is really big that can be time consuming and will use
               twice as much space; use this parameter to skip it, but training
               cannot be resumed later on.
        :param skip_save_log: (bool, default: `False`) disables saving TensorBoard
               logs. By default Ludwig saves logs for the TensorBoard, but if it
               is not needed turning it off can slightly increase the
               overall speed.
        :param skip_save_processed_input: (bool, default: `False`) skips saving
               intermediate HDF5 and JSON files
        :param output_directory: (string, default: `'results'`) directory that
               contains the results
        :param gpus: (string, default: `None`) list of GPUs to use (it uses the
               same syntax of CUDA_VISIBLE_DEVICES)
        :param gpu_fraction: (float, default `1.0`) fraction of gpu memory to
               initialize the process with
        :param random_seed: (int, default: `42`) a random seed that is going to be
               used anywhere there is a call to a random number generator: data
               splitting, parameter initialization and training set shuffling
        :param debug: (bool, default: `False`) enables debugging mode
        :param logging_level: (int, default: `logging.ERROR`) logging level to
               use for logging. Use logging constants like `logging.DEBUG`,
               `logging.INFO` and `logging.ERROR`. By default only errors will
               be printed.

        There are three ways to provide data: by dataframes using the `_df`
        parameters, by CSV using the `_csv` parameters and by HDF5 and JSON,
        using `_hdf5` and `_json` parameters.
        The DataFrame approach uses data previously obtained and put in a
        dataframe, the CSV approach loads data from a CSV file, while HDF5 and
        JSON load previously preprocessed HDF5 and JSON files (they are saved in
        the same directory of the CSV they are obtained from).
        For all three approaches either a full dataset can be provided (which
        will be split randomly according to the split probabilities defined in
        the model definition, by default 70% training, 10% validation and 20%
        test) or, if it contains a split column, it will be split according to
        that column (interpreting 0 as training, 1 as validation and 2 as test).
        Alternatively, separate dataframes / CSV / HDF5 files can be provided
        for each split.

        During training the model and statistics will be saved in a directory
        `[output_dir]/[experiment_name]_[model_name]_n` where all variables are
        resolved to user-specified ones and `n` is an increasing number
        starting from 0 used to differentiate different runs.


        # Return

        :return: (dict) a dictionary containing training statistics for each
        output feature containing loss and measures values for each epoch.

        """
        logging.getLogger().setLevel(logging_level)
        if logging_level in {logging.WARNING, logging.ERROR, logging.CRITICAL}:
            set_disable_progressbar(True)

        # setup directories and file names
        experiment_dir_name = None
        if model_resume_path is not None:
            if os.path.exists(model_resume_path):
                experiment_dir_name = model_resume_path
            else:
                logging.info('Model resume path does not exist,'
                             ' starting training from scratch')
                model_resume_path = None
        if model_resume_path is None:
            experiment_dir_name = get_experiment_dir_name(
                output_directory, '', model_name)
        description_fn, training_stats_fn, model_dir = get_file_names(
            experiment_dir_name)

        # save description
        description = get_experiment_description(
            self.model_definition,
            dataset_type,
            data_csv=data_csv,
            data_train_csv=data_train_csv,
            data_validation_csv=data_validation_csv,
            data_test_csv=data_test_csv,
            data_hdf5=data_hdf5,
            data_train_hdf5=data_train_hdf5,
            data_validation_hdf5=data_validation_hdf5,
            data_test_hdf5=data_test_hdf5,
            metadata_json=train_set_metadata_json,
            random_seed=random_seed)

        save_json(description_fn, description)

        # print description
        logging.info('Model name: {}'.format(model_name))
        logging.info('Output path: {}'.format(experiment_dir_name))
        logging.info('\n')
        for key, value in description.items():
            logging.info('{0}: {1}'.format(key, pformat(value, indent=4)))
        logging.info('\n')

        # preprocess
        if data_df is not None or data_train_df is not None:
            (training_set, validation_set, test_set,
             train_set_metadata) = preprocess_for_training(
                 self.model_definition,
                 dataset_type,
                 data_df=data_df,
                 data_train_df=data_train_df,
                 data_validation_df=data_validation_df,
                 data_test_df=data_test_df,
                 train_set_metadata_json=train_set_metadata_json,
                 skip_save_processed_input=True,
                 preprocessing_params=self.model_definition['preprocessing'],
                 random_seed=random_seed)
        else:
            (training_set, validation_set, test_set,
             train_set_metadata) = preprocess_for_training(
                 self.model_definition,
                 dataset_type,
                 data_csv=data_csv,
                 data_train_csv=data_train_csv,
                 data_validation_csv=data_validation_csv,
                 data_test_csv=data_test_csv,
                 data_hdf5=data_hdf5,
                 data_train_hdf5=data_train_hdf5,
                 data_validation_hdf5=data_validation_hdf5,
                 data_test_hdf5=data_test_hdf5,
                 train_set_metadata_json=train_set_metadata_json,
                 skip_save_processed_input=skip_save_processed_input,
                 preprocessing_params=self.model_definition['preprocessing'],
                 random_seed=random_seed)

        logging.info('Training set: {0}'.format(training_set.size))
        if validation_set is not None:
            logging.info('Validation set: {0}'.format(validation_set.size))
        if test_set is not None:
            logging.info('Test set: {0}'.format(test_set.size))

        # update model definition with metadata properties
        update_model_definition_with_metadata(self.model_definition,
                                              train_set_metadata)

        if not skip_save_model:
            os.makedirs(model_dir, exist_ok=True)
            train_set_metadata_path = os.path.join(
                model_dir, TRAIN_SET_METADATA_FILE_NAME)
            save_json(train_set_metadata_path, train_set_metadata)

        # run the experiment
        model, result = train(training_set=training_set,
                              validation_set=validation_set,
                              test_set=test_set,
                              model_definition=self.model_definition,
                              save_path=model_dir,
                              model_load_path=model_load_path,
                              resume=model_resume_path is not None,
                              skip_save_model=skip_save_model,
                              skip_save_progress=skip_save_progress,
                              skip_save_log=skip_save_log,
                              gpus=gpus,
                              gpu_fraction=gpu_fraction,
                              random_seed=random_seed,
                              debug=debug)

        train_trainset_stats, train_valisest_stats, train_testset_stats = result
        train_stats = {
            'train': train_trainset_stats,
            'validation': train_valisest_stats,
            'test': train_testset_stats
        }

        # save training and test statistics
        save_json(training_stats_fn, train_stats)

        # grab the results of the model with highest validation test performance
        md_training = self.model_definition['training']
        validation_field = md_training['validation_field']
        validation_measure = md_training['validation_measure']
        validation_field_result = train_valisest_stats[validation_field]

        best_function = get_best_function(validation_measure)

        # print results of the model with highest validation test performance
        if validation_set is not None:
            # max or min depending on the measure
            epoch_best_vali_measure, best_vali_measure = best_function(
                enumerate(validation_field_result[validation_measure]),
                key=lambda pair: pair[1])
            logging.info('Best validation model epoch: {0}'.format(
                epoch_best_vali_measure + 1))
            logging.info(
                'Best validation model {0} on validation set {1}: {2}'.format(
                    validation_measure, validation_field, best_vali_measure))

            if test_set is not None:
                best_vali_measure_epoch_test_measure = train_testset_stats[
                    validation_field][validation_measure][
                        epoch_best_vali_measure]
                logging.info(
                    'Best validation model {0} on test set {1}: {2}'.format(
                        validation_measure, validation_field,
                        best_vali_measure_epoch_test_measure))

        logging.info('Finished: {0}'.format(model_name))
        logging.info('Saved to: {0}'.format(experiment_dir_name))

        # set parameters
        self.model = model
        self.train_set_metadata = train_set_metadata

        return train_stats
Example #18
def save_hyperopt_stats(hyperopt_stats, hyperopt_dir_name):
    hyperopt_stats_fn = os.path.join(hyperopt_dir_name,
                                     'hyperopt_statistics.json')
    save_json(hyperopt_stats_fn, hyperopt_stats)
Example #19
def save_hyperopt_stats(hyperopt_stats, hyperopt_dir_name):
    hyperopt_stats_fn = os.path.join(hyperopt_dir_name,
                                     HYPEROPT_STATISTICS_FILE_NAME)
    save_json(hyperopt_stats_fn, hyperopt_stats)
Example #20
def write_meta(df, path, compression_type):
    meta = {'size': len(df.index), 'compression_type': compression_type or ''}
    save_json(os.path.join(path, "meta.json"), meta)
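
A small hedged usage sketch for `write_meta`, assuming a pandas DataFrame and a cache directory that already exists (the directory name is a placeholder):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
write_meta(df, "cache_dir", "gzip")  # writes cache_dir/meta.json with size and compression info
```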
Example #21
def kfold_cross_validate(k_fold,
                         model_definition=None,
                         model_definition_file=None,
                         data_csv=None,
                         output_directory='results',
                         random_seed=default_random_seed,
                         skip_save_k_fold_split_indices=False,
                         **kwargs):
    """Performs k-fold cross validation.

    # Inputs
    :param k_fold: (int) number of folds to create for the cross-validation
    :param model_definition: (dict, default: None) a dictionary containing
            information needed to build a model. Refer to the [User Guide]
           (http://ludwig.ai/user_guide/#model-definition) for details.
    :param model_definition_file: (string, optional, default: `None`) path to
           a YAML file containing the model definition. If available it will be
           used instead of the model_definition dict.
    :param data_csv: (string, default: None)
    :param output_directory: (string, default: 'results')
    :param random_seed: (int) Random seed used for k-fold splits.
    :param skip_save_k_fold_split_indices: (boolean, default: False) Disables
            saving k-fold split indices

    :return: None
    """

    # check for model_definition and model_definition_file
    if model_definition is None and model_definition_file is None:
        raise ValueError(
            'Either model_definition or model_definition_file has to be '
            'not None to initialize a LudwigModel')
    if model_definition is not None and model_definition_file is not None:
        raise ValueError('Only one between model_definition and '
                         'model_definition_file can be provided')

    # check for k_fold
    if k_fold is None:
        raise ValueError('k_fold parameter must be specified')

    logger.info('starting {:d}-fold cross validation'.format(k_fold))

    # create output_directory if not available
    if not os.path.isdir(output_directory):
        os.mkdir(output_directory)

    # read in data to split for the folds
    data_df = pd.read_csv(data_csv)

    # place each fold in a separate directory
    data_dir = os.path.dirname(data_csv)
    kfold_training_stats = {}
    kfold_split_indices = {}
    for train_indices, test_indices, fold_num in \
            generate_kfold_splits(data_df, k_fold, random_seed):
        with tempfile.TemporaryDirectory(dir=data_dir) as temp_dir_name:
            curr_train_df = data_df.iloc[train_indices]
            curr_test_df = data_df.iloc[test_indices]

            if not skip_save_k_fold_split_indices:
                kfold_split_indices['fold_' + str(fold_num)] = {
                    'training_indices': train_indices,
                    'test_indices': test_indices
                }

            # train and validate model on this fold
            if model_definition_file is not None:
                with open(model_definition_file, 'r') as def_file:
                    model_definition = \
                        merge_with_defaults(yaml.safe_load(def_file))
            logger.info("training on fold {:d}".format(fold_num))
            (model, preprocessed_data, _, train_stats,
             model_definition) = full_train(model_definition,
                                            data_train_df=curr_train_df,
                                            data_test_df=curr_test_df,
                                            experiment_name='cross_validation',
                                            model_name='fold_' + str(fold_num),
                                            output_directory=os.path.join(
                                                temp_dir_name, 'results'))

            # score on hold out fold
            eval_batch_size = model_definition['training']['eval_batch_size']
            batch_size = model_definition['training']['batch_size']
            preds = model.predict(
                preprocessed_data[2],
                eval_batch_size if eval_batch_size != 0 else batch_size)

            # augment the training statistics with scoring metrics from
            # the hold out fold
            train_stats['fold_metric'] = {}
            for metric_category in preds:
                train_stats['fold_metric'][metric_category] = {}
                for metric in preds[metric_category]:
                    train_stats['fold_metric'][metric_category][metric] = \
                        preds[metric_category][metric]

            # collect training statistics for this fold
            kfold_training_stats['fold_' + str(fold_num)] = train_stats

    # consolidate raw fold metrics across all folds
    raw_kfold_stats = {}
    for fold_name in kfold_training_stats:
        for category in kfold_training_stats[fold_name]['fold_metric']:
            if category not in raw_kfold_stats:
                raw_kfold_stats[category] = {}
            category_stats = \
                kfold_training_stats[fold_name]['fold_metric'][category]
            for metric in category_stats:
                if metric not in {'predictions', 'probabilities'}:
                    if metric not in raw_kfold_stats[category]:
                        raw_kfold_stats[category][metric] = []
                    raw_kfold_stats[category][metric] \
                        .append(category_stats[metric])

    # calculate overall kfold statistics
    overall_kfold_stats = {}
    for category in raw_kfold_stats:
        overall_kfold_stats[category] = {}
        for metric in raw_kfold_stats[category]:
            mean = np.mean(raw_kfold_stats[category][metric])
            std = np.std(raw_kfold_stats[category][metric])
            overall_kfold_stats[category][metric + '_mean'] = mean
            overall_kfold_stats[category][metric + '_std'] = std

    kfold_training_stats['overall'] = overall_kfold_stats

    # save k-fold cv statistics
    save_json(os.path.join(output_directory, 'kfold_training_statistics.json'),
              kfold_training_stats)

    # save k-fold split indices
    if not skip_save_k_fold_split_indices:
        save_json(os.path.join(output_directory, 'kfold_split_indices.json'),
                  kfold_split_indices)

    logger.info('completed {:d}-fold cross validation'.format(k_fold))
Example #22
def experiment(
        model_definition,
        model_definition_file=None,
        data_csv=None,
        data_train_csv=None,
        data_validation_csv=None,
        data_test_csv=None,
        data_hdf5=None,
        data_train_hdf5=None,
        data_validation_hdf5=None,
        data_test_hdf5=None,
        train_set_metadata_json=None,
        experiment_name='experiment',
        model_name='run',
        model_load_path=None,
        model_resume_path=None,
        skip_save_progress_weights=False,
        skip_save_processed_input=False,
        skip_save_unprocessed_output=False,
        output_directory='results',
        gpus=None,
        gpu_fraction=1.0,
        use_horovod=False,
        random_seed=default_random_seed,
        debug=False,
        **kwargs
):
    """Trains a model on a dataset's training and validation splits and
    uses it to predict on the test split.
    It saves the trained model and the statistics of training and testing.
    :param model_definition: Model definition which defines the different
           parameters of the model, features, preprocessing and training.
    :type model_definition: Dictionary
    :param model_definition_file: The file that specifies the model definition.
           It is a yaml file.
    :type model_definition_file: filepath (str)
    :param data_csv: A CSV file containing the input data which is used to
           train, validate and test a model. The CSV either contains a
           split column or will be split.
    :type data_csv: filepath (str)
    :param data_train_csv: A CSV file containing the input data which is used
           to train a model.
    :type data_train_csv: filepath (str)
    :param data_validation_csv: A CSV file containing the input data which is used
           to validate a model.
    :type data_validation_csv: filepath (str)
    :param data_test_csv: A CSV file containing the input data which is used
           to test a model.
    :type data_test_csv: filepath (str)
    :param data_hdf5: If the dataset is in the hdf5 format, this is used instead
           of the csv file.
    :type data_hdf5: filepath (str)
    :param data_train_hdf5: If the training set is in the hdf5 format, this is
           used instead of the csv file.
    :type data_train_hdf5: filepath (str)
    :param data_validation_hdf5: If the validation set is in the hdf5 format,
           this is used instead of the csv file.
    :type data_validation_hdf5: filepath (str)
    :param data_test_hdf5: If the test set is in the hdf5 format, this is
           used instead of the csv file.
    :type data_test_hdf5: filepath (str)
    :param train_set_metadata_json: If the dataset is in hdf5 format, this is
           the associated json file containing metadata.
    :type train_set_metadata_json: filepath (str)
    :param experiment_name: The name for the experiment.
    :type experiment_name: Str
    :param model_name: Name of the model that is being used.
    :type model_name: Str
    :param model_load_path: If this is specified the loaded model will be used
           as initialization (useful for transfer learning).
    :type model_load_path: filepath (str)
    :param model_resume_path: Resumes training of the model from the path
           specified. The difference with model_load_path is that training
           statistics like the current epoch and the loss and performance so
           far are also resumed, effectively continuing a previously
           interrupted training process.
    :type model_resume_path: filepath (str)
    :param skip_save_progress_weights: Skips saving the weights at the end of
           each epoch. If this is true, training cannot be resumed from
           exactly the state at the end of the previous epoch.
    :type skip_save_progress_weights: Boolean
    :param skip_save_processed_input: If a CSV dataset is provided it is
           preprocessed and then saved as an hdf5 and json to avoid running
           the preprocessing again. If this parameter is False,
           the hdf5 and json file are not saved.
    :type skip_save_processed_input: Boolean
    :param skip_save_unprocessed_output: By default predictions and
           their probabilities are saved in both raw unprocessed numpy files
           containing tensors and as postprocessed CSV files
           (one for each output feature). If this parameter is True,
           only the CSV ones are saved and the numpy ones are skipped.
    :type skip_save_unprocessed_output: Boolean
    :param output_directory: The directory that will contain the training
           statistics, the saved model and the training progress files.
    :type output_directory: filepath (str)
    :param gpus: List of GPUs that are available for training.
    :type gpus: List
    :param gpu_fraction: Fraction of the memory of each GPU to use at
           the beginning of the training. The memory may grow elastically.
    :type gpu_fraction: Float
    :param random_seed: Random seed used for weights initialization,
           splits and any other random function.
    :type random_seed: Integer
    :param debug: If true turns on tfdbg with inf_or_nan checks.
    :type debug: Boolean
    """
    # set input features defaults
    if model_definition_file is not None:
        with open(model_definition_file, 'r') as def_file:
            model_definition = merge_with_defaults(yaml.safe_load(def_file))
    else:
        model_definition = merge_with_defaults(model_definition)

    # setup directories and file names
    experiment_dir_name = None
    if model_resume_path is not None:
        if os.path.exists(model_resume_path):
            experiment_dir_name = model_resume_path
        else:
            if is_on_master():
                logging.info(
                    'Model resume path does not exist, '
                    'starting training from scratch'
                )
            model_resume_path = None

    if model_resume_path is None:
        if is_on_master():
            experiment_dir_name = get_experiment_dir_name(
                output_directory,
                experiment_name,
                model_name
            )
        else:
            experiment_dir_name = '/'
    description_fn, training_stats_fn, model_dir = get_file_names(
        experiment_dir_name
    )

    # save description
    description = get_experiment_description(
        model_definition,
        data_csv,
        data_train_csv,
        data_validation_csv,
        data_test_csv,
        data_hdf5,
        data_train_hdf5,
        data_validation_hdf5,
        data_test_hdf5,
        train_set_metadata_json,
        random_seed
    )
    if is_on_master():
        save_json(description_fn, description)
        # print description
        logging.info('Experiment name: {}'.format(experiment_name))
        logging.info('Model name: {}'.format(model_name))
        logging.info('Output path: {}'.format(experiment_dir_name))
        logging.info('')
        for key, value in description.items():
            logging.info('{}: {}'.format(key, pformat(value, indent=4)))
        logging.info('')

    # preprocess
    (
        training_set,
        validation_set,
        test_set,
        train_set_metadata
    ) = preprocess_for_training(
        model_definition,
        data_csv=data_csv,
        data_train_csv=data_train_csv,
        data_validation_csv=data_validation_csv,
        data_test_csv=data_test_csv,
        data_hdf5=data_hdf5,
        data_train_hdf5=data_train_hdf5,
        data_validation_hdf5=data_validation_hdf5,
        data_test_hdf5=data_test_hdf5,
        train_set_metadata_json=train_set_metadata_json,
        skip_save_processed_input=skip_save_processed_input,
        preprocessing_params=model_definition[
            'preprocessing'],
        random_seed=random_seed
    )
    if is_on_master():
        logging.info('Training set: {0}'.format(training_set.size))
        if validation_set is not None:
            logging.info('Validation set: {0}'.format(validation_set.size))
        if test_set is not None:
            logging.info('Test set: {0}'.format(test_set.size))

    # update model definition with metadata properties
    update_model_definition_with_metadata(model_definition, train_set_metadata)

    # run the experiment
    model, training_results = train(
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        model_definition=model_definition,
        save_path=model_dir,
        model_load_path=model_load_path,
        resume=model_resume_path is not None,
        skip_save_progress_weights=skip_save_progress_weights,
        gpus=gpus,
        gpu_fraction=gpu_fraction,
        use_horovod=use_horovod,
        random_seed=random_seed,
        debug=debug
    )
    (
        train_trainset_stats,
        train_valiset_stats,
        train_testset_stats
    ) = training_results

    if is_on_master():
        # save train set metadata
        save_json(
            os.path.join(
                model_dir,
                TRAIN_SET_METADATA_FILE_NAME
            ),
            train_set_metadata
        )

    # grab the results of the model with highest validation performance
    validation_field = model_definition['training']['validation_field']
    validation_measure = model_definition['training']['validation_measure']
    validation_field_result = train_valiset_stats[validation_field]

    best_function = get_best_function(validation_measure)

    # print results of the model with highest validation performance
    if is_on_master():
        if validation_set is not None:
            # max or min depending on the measure
            epoch_best_vali_measure, best_vali_measure = best_function(
                enumerate(validation_field_result[validation_measure]),
                key=lambda pair: pair[1]
            )
            logging.info('Best validation model epoch: {0}'.format(
                epoch_best_vali_measure + 1)
            )
            logging.info(
                'Best validation model {0} on validation set {1}: {2}'.format(
                    validation_measure,
                    validation_field,
                    best_vali_measure)
            )
        
            if test_set is not None:
                best_vali_measure_epoch_test_measure = train_testset_stats[
                    validation_field
                ][validation_measure][epoch_best_vali_measure]
                logging.info(
                    'Best validation model {0} on test set {1}: {2}'.format(
                        validation_measure,
                        validation_field,
                        best_vali_measure_epoch_test_measure
                    )
                )

    # save training statistics
    if is_on_master():
        save_json(
            training_stats_fn,
            {
                'train': train_trainset_stats,
                'validation': train_valiset_stats,
                'test': train_testset_stats
            }
        )

    
    if test_set is not None:
        # predict
        test_results = predict(
            test_set,
            train_set_metadata,
            model,
            model_definition,
            model_definition['training']['batch_size'],
            only_predictions=False,
            gpus=gpus,
            gpu_fraction=gpu_fraction,
            debug=debug
        )
        # postprocess
        postprocessed_output = postprocess(
            test_results,
            model_definition['output_features'],
            train_set_metadata,
            experiment_dir_name,
            skip_save_unprocessed_output or not is_on_master()
        )

        if is_on_master():
            print_prediction_results(test_results)

            save_prediction_outputs(postprocessed_output, experiment_dir_name)
            save_prediction_statistics(test_results, experiment_dir_name)
    
    model.close_session()

    if is_on_master():
        logging.info('\nFinished: {0}_{1}'.format(
            experiment_name, model_name))
        logging.info('Saved to: {}'.format(experiment_dir_name))

    return experiment_dir_name
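A minimal usage sketch for the `experiment` function above. The model definition and the CSV path are hypothetical placeholders; feature names and types must match your own data, and omitted sections are filled in by `merge_with_defaults`.

```python
# Hypothetical model definition: one text input, one category output.
model_definition = {
    'input_features': [{'name': 'doc_text', 'type': 'text'}],
    'output_features': [{'name': 'label', 'type': 'category'}],
}

# 'my_dataset.csv' is a placeholder path; the CSV may contain a 'split' column.
experiment_dir = experiment(
    model_definition,
    data_csv='my_dataset.csv',
    experiment_name='example_experiment',
    model_name='run_0',
)
```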
Example #23
0
def save_test_statistics(test_stats, experiment_dir_name):
    test_stats_fn = os.path.join(experiment_dir_name, 'test_statistics.json')
    save_json(test_stats_fn, test_stats)
Example #24
0
def preprocess_for_training(
        model_definition,
        dataset_type='generic',
        data_df=None,
        data_train_df=None,
        data_validation_df=None,
        data_test_df=None,
        data_csv=None,
        data_train_csv=None,
        data_validation_csv=None,
        data_test_csv=None,
        data_hdf5=None,
        data_train_hdf5=None,
        data_validation_hdf5=None,
        data_test_hdf5=None,
        train_set_metadata_json=None,
        skip_save_processed_input=False,
        preprocessing_params=default_preprocessing_parameters,
        random_seed=default_random_seed):
    # Check if hdf5 and json already exist
    data_hdf5_fp = None
    data_train_hdf5_fp = None
    data_validation_hdf5_fp = None
    data_test_hdf5_fp = None
    train_set_metadata_json_fp = 'metadata.json'
    if data_csv is not None:
        data_hdf5_fp = os.path.splitext(data_csv)[0] + '.hdf5'
        train_set_metadata_json_fp = os.path.splitext(data_csv)[0] + '.json'
        if (os.path.isfile(data_hdf5_fp)
                and os.path.isfile(train_set_metadata_json_fp)):
            logging.info('Found hdf5 and json with the same filename '
                         'of the csv, using them instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp
            train_set_metadata_json = train_set_metadata_json_fp

    if data_train_csv is not None:
        data_train_hdf5_fp = os.path.splitext(data_train_csv)[0] + '.hdf5'
        train_set_metadata_json_fp = os.path.splitext(
            data_train_csv)[0] + '.json'
        if (os.path.isfile(data_train_hdf5_fp)
                and os.path.isfile(train_set_metadata_json_fp)):
            logging.info('Found hdf5 and json with the same filename of '
                         'the train csv, using them instead')
            data_train_csv = None
            data_train_hdf5 = data_train_hdf5_fp
            train_set_metadata_json = train_set_metadata_json_fp

    if data_validation_csv is not None:
        data_validation_hdf5_fp = os.path.splitext(
            data_validation_csv)[0] + '.hdf5'
        if os.path.isfile(data_validation_hdf5_fp):
            logging.info('Found hdf5 with the same filename of '
                         'the validation csv, using it instead')
            data_validation_csv = None
            data_validation_hdf5 = data_validation_hdf5_fp

    if data_test_csv is not None:
        data_test_hdf5_fp = os.path.splitext(data_test_csv)[0] + '.hdf5'
        if os.path.isfile(data_test_hdf5_fp):
            logging.info('Found hdf5 with the same filename of '
                         'the test csv, using it instead')
            data_test_csv = None
            data_test_hdf5 = data_test_hdf5_fp

    model_definition['data_hdf5_fp'] = data_hdf5_fp

    # Decide if to preprocess or just load
    features = (model_definition['input_features'] +
                model_definition['output_features'])
    (concatenate_csv, concatenate_df, build_dataset,
     build_dataset_df) = get_dataset_fun(dataset_type)

    if data_df is not None:
        # needs preprocessing
        logging.info('Using full dataframe')
        logging.info('Building dataset (it may take a while)')
        data, train_set_metadata = build_dataset_df(data_df,
                                                    features,
                                                    preprocessing_params,
                                                    random_seed=random_seed)
        if not skip_save_processed_input:
            logging.info('Writing dataset')
            data_utils.save_hdf5(data_hdf5_fp, data, train_set_metadata)
        logging.info('Writing train set metadata with vocabulary')
        data_utils.save_json(train_set_metadata_json_fp, train_set_metadata)
        training_set, test_set, validation_set = split_dataset_tvt(
            data, data['split'])

    elif data_train_df is not None:
        # needs preprocessing
        logging.info('Using training dataframe')
        logging.info('Building dataset (it may take a while)')
        concatenated_df = concatenate_df(data_train_df, data_validation_df,
                                         data_test_df)
        data, train_set_metadata = build_dataset_df(concatenated_df,
                                                    features,
                                                    preprocessing_params,
                                                    random_seed=random_seed)
        training_set, test_set, validation_set = split_dataset_tvt(
            data, data['split'])
        if not skip_save_processed_input:
            logging.info('Writing dataset')
            data_utils.save_hdf5(data_train_hdf5_fp, training_set,
                                 train_set_metadata)
            if validation_set is not None:
                data_utils.save_hdf5(data_validation_hdf5_fp, validation_set,
                                     train_set_metadata)
            if test_set is not None:
                data_utils.save_hdf5(data_test_hdf5_fp, test_set,
                                     train_set_metadata)
        logging.info('Writing train set metadata with vocabulary')
        data_utils.save_json(train_set_metadata_json_fp, train_set_metadata)

    elif data_csv is not None:
        # Use data_csv and ignore _train, _validation and _test.
        # Also ignore hdf5 data and train set metadata; needs preprocessing
        logging.info('Using full raw csv, no hdf5 and json file '
                     'with the same name have been found')
        logging.info('Building dataset (it may take a while)')
        data, train_set_metadata = build_dataset(data_csv,
                                                 features,
                                                 preprocessing_params,
                                                 random_seed=random_seed)
        if not skip_save_processed_input:
            logging.info('Writing dataset')
            data_utils.save_hdf5(data_hdf5_fp, data, train_set_metadata)
            logging.info('Writing train set metadata with vocabulary')
            data_utils.save_json(train_set_metadata_json_fp,
                                 train_set_metadata)
        training_set, test_set, validation_set = split_dataset_tvt(
            data, data['split'])

    elif data_train_csv is not None:
        # use data_train (including _validation and _test if they are present)
        # and ignore data and train set metadata
        # needs preprocessing
        logging.info('Using training raw csv, no hdf5 and json '
                     'file with the same name have been found')
        logging.info('Building dataset (it may take a while)')
        concatenated_df = concatenate_csv(data_train_csv, data_validation_csv,
                                          data_test_csv)
        concatenated_df.csv = data_train_csv
        data, train_set_metadata = build_dataset_df(concatenated_df,
                                                    features,
                                                    preprocessing_params,
                                                    random_seed=random_seed)
        training_set, test_set, validation_set = split_dataset_tvt(
            data, data['split'])
        if not skip_save_processed_input:
            logging.info('Writing dataset')
            data_utils.save_hdf5(data_train_hdf5_fp, training_set,
                                 train_set_metadata)
            if validation_set is not None:
                data_utils.save_hdf5(data_validation_hdf5_fp, validation_set,
                                     train_set_metadata)
            if test_set is not None:
                data_utils.save_hdf5(data_test_hdf5_fp, test_set,
                                     train_set_metadata)
            logging.info('Writing train set metadata with vocabulary')
            data_utils.save_json(train_set_metadata_json_fp,
                                 train_set_metadata)

    elif data_hdf5 is not None and train_set_metadata_json is not None:
        # use data and train set metadata
        # doesn't need preprocessing, just load
        logging.info('Using full hdf5 and json')
        training_set, test_set, validation_set = load_data(
            data_hdf5,
            model_definition['input_features'],
            model_definition['output_features'],
            shuffle_training=True)
        train_set_metadata = load_metadata(train_set_metadata_json)

    elif data_train_hdf5 is not None and train_set_metadata_json is not None:
        # use data and train set metadata
        # doesn't need preprocessing, just load
        logging.info('Using hdf5 and json')
        training_set = load_data(data_train_hdf5,
                                 model_definition['input_features'],
                                 model_definition['output_features'],
                                 split_data=False)
        train_set_metadata = load_metadata(train_set_metadata_json)
        if data_validation_hdf5 is not None:
            validation_set = load_data(data_validation_hdf5,
                                       model_definition['input_features'],
                                       model_definition['output_features'],
                                       split_data=False)
        else:
            validation_set = None
        if data_test_hdf5 is not None:
            test_set = load_data(data_test_hdf5,
                                 model_definition['input_features'],
                                 model_definition['output_features'],
                                 split_data=False)
        else:
            test_set = None

    else:
        raise RuntimeError('Insufficient input parameters')

    replace_text_feature_level(model_definition,
                               [training_set, validation_set, test_set])

    training_dataset = Dataset(training_set,
                               model_definition['input_features'],
                               model_definition['output_features'],
                               data_hdf5_fp)

    validation_dataset = None
    if validation_set is not None:
        validation_dataset = Dataset(validation_set,
                                     model_definition['input_features'],
                                     model_definition['output_features'],
                                     data_hdf5_fp)

    test_dataset = None
    if test_set is not None:
        test_dataset = Dataset(test_set, model_definition['input_features'],
                               model_definition['output_features'],
                               data_hdf5_fp)

    return (training_dataset, validation_dataset, test_dataset,
            train_set_metadata)
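The cache lookup at the top of this function can be summarized as follows; `_cached_paths` is a hypothetical helper written only to illustrate the check, not part of the codebase.

```python
import os

def _cached_paths(data_csv):
    # A CSV is considered already preprocessed when an .hdf5 file and a .json
    # metadata file with the same basename exist next to it.
    base = os.path.splitext(data_csv)[0]
    hdf5_fp, json_fp = base + '.hdf5', base + '.json'
    if os.path.isfile(hdf5_fp) and os.path.isfile(json_fp):
        return hdf5_fp, json_fp
    return None, None

# e.g. _cached_paths('reuters.csv') returns ('reuters.hdf5', 'reuters.json')
# only if both files exist, mirroring the branch that sets data_csv = None above.
```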
Example #25
0
    def save(self, filepath):
        save_json(filepath, self.__dict__)
Example #26
0
def save_prediction_statistics(prediction_stats, experiment_dir_name):
    test_stats_fn = os.path.join(experiment_dir_name,
                                 'prediction_statistics.json')
    save_json(test_stats_fn, prediction_stats)
Example #27
0
    def train(
            self,
            dataset=None,
            training_set=None,
            validation_set=None,
            test_set=None,
            training_set_metadata=None,
            data_format=None,
            experiment_name='api_experiment',
            model_name='run',
            model_resume_path=None,
            skip_save_training_description=False,
            skip_save_training_statistics=False,
            skip_save_model=False,
            skip_save_progress=False,
            skip_save_log=False,
            skip_save_processed_input=False,
            output_directory='results',
            random_seed=default_random_seed,
            debug=False,
            **kwargs
    ):
        """This function is used to perform a full training of the model on the
           specified dataset.

        # Inputs

        :param dataset: (string, dict, DataFrame) source containing the entire dataset.
               If it has a split column, it will be used for splitting (0: train,
               1: validation, 2: test), otherwise the dataset will be randomly split.
        :param training_set: (string, dict, DataFrame) source containing training data.
        :param validation_set: (string, dict, DataFrame) source containing validation data.
        :param test_set: (string, dict, DataFrame) source containing test data.
        :param training_set_metadata: (string, dict) metadata JSON file or loaded metadata.
               Intermediate preprocessed structure containing the mappings of the
               input dataset, created the first time a CSV file is used and saved
               in the same directory with the same name and a '.json' extension.
        :param data_format: (string) format to interpret data sources. Will be inferred
               automatically if not specified.
        :param experiment_name: (string) a name for the experiment, used for the save
               directory
        :param model_name: (string) a name for the model, used for the save
               directory
        :param model_resume_path: (string) path of the model directory to
               resume training from
        :param skip_save_training_description: (bool, default: `False`) disables
               saving the description JSON file.
        :param skip_save_training_statistics: (bool, default: `False`) disables
               saving training statistics JSON file.
        :param skip_save_model: (bool, default: `False`) disables
               saving model weights and hyperparameters each time the model
               improves. By default Ludwig saves model weights after each epoch
               the validation metric improves, but if the model is really big
               that can be time consuming. If you do not want to keep the
               weights and just want to find out what performance a model can
               get with a set of hyperparameters, use this parameter to skip
               it, but the model will not be loadable later on.
        :param skip_save_progress: (bool, default: `False`) disables saving
               progress each epoch. By default Ludwig saves weights and stats
               after each epoch for enabling resuming of training, but if
               the model is really big that can be time consuming and will use
               twice as much space, use this parameter to skip it, but training
               cannot be resumed later on.
        :param skip_save_log: (bool, default: `False`) disables saving TensorBoard
               logs. By default Ludwig saves logs for the TensorBoard, but if it
               is not needed turning it off can slightly increase the
               overall speed.
        :param skip_save_processed_input: (bool, default: `False`) skips saving
               intermediate HDF5 and JSON files
        :param output_directory: (string, default: `'results'`) directory that
               contains the results
        :param random_seed: (int, default`42`) a random seed that is going to be
               used anywhere there is a call to a random number generator: data
               splitting, parameter initialization and training set shuffling
        :param debug: (bool, default: `False`) enables debugging mode

        There are three ways to provide data: by dataframes using the `_df`
        parameters, by CSV using the `_csv` parameters and by HDF5 and JSON,
        using `_hdf5` and `_json` parameters.
        The DataFrame approach uses data previously obtained and put in a
        dataframe, the CSV approach loads data from a CSV file, while HDF5 and
        JSON load previously preprocessed HDF5 and JSON files (they are saved in
        the same directory of the CSV they are obtained from).
        For all three approaches either a full dataset can be provided (which
        will be split randomly according to the split probabilities defined in
        the model definition, by default 70% training, 10% validation and 20%
        test) or, if it contains a split column, it will be split according to
        that column (interpreting 0 as training, 1 as validation and 2 as test).
        Alternatively, separate dataframes / CSV / HDF5 files can be provided
        for each split.

        During training the model and statistics will be saved in a directory
        `[output_dir]/[experiment_name]_[model_name]_n` where all variables are
        resolved to user-specified ones and `n` is an increasing number
        starting from 0 used to differentiate different runs.


        # Return

        :return: (tuple) tuple containing:
            - A dictionary of training statistics for each output feature,
              with loss and metrics values for each epoch.
            - The preprocessed data (training set, validation set, test set,
              training set metadata).
            - The path of the output directory.
        """
        # setup directories and file names
        if model_resume_path is not None:
            if os.path.exists(model_resume_path):
                output_directory = model_resume_path
            else:
                if is_on_master():
                    logger.info(
                        'Model resume path does not exist, '
                        'starting training from scratch'
                    )
                model_resume_path = None

        if model_resume_path is None:
            if is_on_master():
                output_directory = get_output_directory(
                    output_directory,
                    experiment_name,
                    model_name
                )
            else:
                output_directory = None

        # if we are skipping all saving,
        # there is no need to create a directory that will remain empty
        should_create_output_directory = not (
                skip_save_training_description and
                skip_save_training_statistics and
                skip_save_model and
                skip_save_progress and
                skip_save_log and
                skip_save_processed_input
        )

        description_fn = training_stats_fn = model_dir = None
        if is_on_master():
            if should_create_output_directory:
                if not os.path.exists(output_directory):
                    os.makedirs(output_directory, exist_ok=True)
            description_fn, training_stats_fn, model_dir = get_file_names(
                output_directory)

        # save description
        if is_on_master():
            description = get_experiment_description(
                self.model_definition,
                dataset=dataset,
                training_set=training_set,
                validation_set=validation_set,
                test_set=test_set,
                training_set_metadata=training_set_metadata,
                data_format=data_format,
                random_seed=random_seed
            )
            if not skip_save_training_description:
                save_json(description_fn, description)
            # print description
            logger.info('Experiment name: {}'.format(experiment_name))
            logger.info('Model name: {}'.format(model_name))
            logger.info('Output directory: {}'.format(output_directory))
            logger.info('\n')
            for key, value in description.items():
                logger.info('{}: {}'.format(key, pformat(value, indent=4)))
            logger.info('\n')

        # preprocess
        preprocessed_data = preprocess_for_training(
            self.model_definition,
            dataset=dataset,
            training_set=training_set,
            validation_set=validation_set,
            test_set=test_set,
            training_set_metadata=training_set_metadata,
            data_format=data_format,
            skip_save_processed_input=skip_save_processed_input,
            preprocessing_params=self.model_definition[PREPROCESSING],
            random_seed=random_seed
        )

        (training_set,
         validation_set,
         test_set,
         training_set_metadata) = preprocessed_data
        self.training_set_metadata = training_set_metadata

        if is_on_master():
            logger.info('Training set: {0}'.format(training_set.size))
            if validation_set is not None:
                logger.info('Validation set: {0}'.format(validation_set.size))
            if test_set is not None:
                logger.info('Test set: {0}'.format(test_set.size))

        if is_on_master():
            if not skip_save_model:
                # save train set metadata
                os.makedirs(model_dir, exist_ok=True)
                save_json(
                    os.path.join(
                        model_dir,
                        TRAIN_SET_METADATA_FILE_NAME
                    ),
                    training_set_metadata
                )

        contrib_command("train_init", experiment_directory=output_directory,
                        experiment_name=experiment_name, model_name=model_name,
                        output_directory=output_directory,
                        resume=model_resume_path is not None)

        # Build model if not provided
        # if it was provided it means it was already loaded
        if not self.model:
            if is_on_master():
                print_boxed('MODEL', print_fun=logger.debug)
            # update model definition with metadata properties
            update_model_definition_with_metadata(
                self.model_definition,
                training_set_metadata
            )
            self.model = LudwigModel.create_model(self.model_definition,
                                                  random_seed=random_seed)

        # init trainer
        trainer = Trainer(
            **self.model_definition[TRAINING],
            resume=model_resume_path is not None,
            skip_save_model=skip_save_model,
            skip_save_progress=skip_save_progress,
            skip_save_log=skip_save_log,
            random_seed=random_seed,
            horovod=self._horovod,
            debug=debug
        )

        contrib_command("train_model", self.model, self.model_definition,
                        self.model_definition_fp)

        # train model
        if is_on_master():
            print_boxed('TRAINING')
            if not skip_save_model:
                self.save_model_definition(model_dir)

        train_stats = trainer.train(
            self.model,
            training_set,
            validation_set=validation_set,
            test_set=test_set,
            save_path=model_dir,
        )

        train_trainset_stats, train_valiset_stats, train_testset_stats = train_stats
        train_stats = {
            TRAINING: train_trainset_stats,
            VALIDATION: train_valiset_stats,
            TEST: train_testset_stats
        }

        # save training statistics
        if is_on_master():
            if not skip_save_training_statistics:
                save_json(training_stats_fn, train_stats)

        # grab the results of the model with highest validation performance
        validation_field = trainer.validation_field
        validation_metric = trainer.validation_metric
        validation_field_result = train_valiset_stats[validation_field]

        best_function = get_best_function(validation_metric)
        # results of the model with highest validation performance
        if is_on_master() and validation_set is not None:
            epoch_best_vali_metric, best_vali_metric = best_function(
                enumerate(validation_field_result[validation_metric]),
                key=lambda pair: pair[1]
            )
            logger.info(
                'Best validation model epoch: {0}'.format(
                    epoch_best_vali_metric + 1)
            )
            logger.info(
                'Best validation model {0} on validation set {1}: {2}'.format(
                    validation_metric, validation_field, best_vali_metric
                ))
            if test_set is not None:
                best_vali_metric_epoch_test_metric = train_testset_stats[
                    validation_field][validation_metric][
                    epoch_best_vali_metric]

                logger.info(
                    'Best validation model {0} on test set {1}: {2}'.format(
                        validation_metric,
                        validation_field,
                        best_vali_metric_epoch_test_metric
                    )
                )
            logger.info(
                '\nFinished: {0}_{1}'.format(experiment_name, model_name))
            logger.info('Saved to: {0}'.format(output_directory))

        contrib_command("train_save", output_directory)

        self.training_set_metadata = training_set_metadata

        if not skip_save_model:
            # Load the best weights from saved checkpoint
            self.load_weights(model_dir)

        return train_stats, preprocessed_data, output_directory
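A usage sketch for the `train` method above; the model definition and the dataset path are hypothetical, and the constructor call assumes `LudwigModel` accepts the model definition directly.

```python
# Hypothetical model definition and dataset; column names must match the CSV.
model_definition = {
    'input_features': [{'name': 'review', 'type': 'text'}],
    'output_features': [{'name': 'sentiment', 'type': 'category'}],
}

model = LudwigModel(model_definition)
train_stats, preprocessed_data, output_directory = model.train(
    dataset='reviews.csv',              # placeholder CSV path
    experiment_name='api_experiment',
    model_name='run',
)
# train_stats holds per-epoch metrics for 'training', 'validation' and 'test'.
```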
Example #28
0
        type=yaml.safe_load,
        default='{}',
        help='the parameters for preprocessing the different features'
    )

    parser.add_argument(
        '-rs',
        '--random_seed',
        type=int,
        default=42,
        help='a random seed that is going to be used anywhere there is a call '
             'to a random number generator: data splitting, parameter '
             'initialization and training set shuffling'
    )

    args = parser.parse_args()

    data, train_set_metadata = build_dataset(
        args.dataset_csv,
        args.train_set_metadata_json,
        args.features,
        args.preprocessing_parameters,
        args.random_seed
    )

    # write train set metadata, dataset
    logger.info('Writing train set metadata with vocabulary')
    data_utils.save_json(args.output_metadata_json, train_set_metadata)
    logger.info('Writing dataset')
    data_utils.save_hdf5(args.output_dataset_h5, data, train_set_metadata)
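Note that `type=yaml.safe_load` above makes argparse hand back an already-parsed dict instead of a raw string. A standalone sketch (the flag name and the keys in the example string are hypothetical):

```python
import argparse
import yaml

parser = argparse.ArgumentParser()
parser.add_argument(
    '-pp',
    '--preprocessing_parameters',
    type=yaml.safe_load,  # parse the command-line string into a Python dict
    default='{}',         # string defaults are also passed through `type`
)

args = parser.parse_args(['--preprocessing_parameters', '{text: {lowercase: true}}'])
print(args.preprocessing_parameters)  # {'text': {'lowercase': True}}
```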
Example #29
0
def _preprocess_csv_for_training(
        features,
        data_csv=None,
        data_train_csv=None,
        data_validation_csv=None,
        data_test_csv=None,
        train_set_metadata_json=None,
        skip_save_processed_input=False,
        preprocessing_params=default_preprocessing_parameters,
        random_seed=default_random_seed
):
    """
    Method to pre-process csv data
    :param features: list of all features (input + output)
    :param data_csv: path to the csv data
    :param data_train_csv: training csv data
    :param data_validation_csv: validation csv data
    :param data_test_csv: test csv data
    :param train_set_metadata_json: train set metadata json
    :param skip_save_processed_input: if False, the pre-processed data is saved
    as .hdf5 files in the same location as the csvs with the same names.
    :param preprocessing_params: preprocessing parameters
    :param random_seed: random seed
    :return: training, test, validation datasets, training metadata
    """
    train_set_metadata = None
    if train_set_metadata_json is not None:
        train_set_metadata = load_metadata(train_set_metadata_json)

    if data_csv is not None:
        # Use data_csv and ignore _train, _validation and _test.
        # Also ignore hdf5 data and train set metadata; needs preprocessing
        logger.info(
            'Using full raw csv, no hdf5 and json file '
            'with the same name have been found'
        )
        logger.info('Building dataset (it may take a while)')
        data, train_set_metadata = build_dataset(
            data_csv,
            features,
            preprocessing_params,
            train_set_metadata=train_set_metadata,
            random_seed=random_seed
        )
        if not skip_save_processed_input:
            logger.info('Writing dataset')
            data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
            data_utils.save_hdf5(data_hdf5_fp, data, train_set_metadata)
            logger.info('Writing train set metadata with vocabulary')

            train_set_metadata_json_fp = replace_file_extension(
                data_csv,
                'json'
            )
            data_utils.save_json(
                train_set_metadata_json_fp, train_set_metadata)

        training_set, test_set, validation_set = split_dataset_tvt(
            data,
            data['split']
        )

    elif data_train_csv is not None:
        # use data_train (including _validation and _test if they are present)
        # and ignore data and train set metadata
        # needs preprocessing
        logger.info(
            'Using training raw csv, no hdf5 and json '
            'file with the same name have been found'
        )
        logger.info('Building dataset (it may take a while)')
        concatenated_df = concatenate_csv(
            data_train_csv,
            data_validation_csv,
            data_test_csv
        )
        concatenated_df.csv = data_train_csv
        data, train_set_metadata = build_dataset_df(
            concatenated_df,
            features,
            preprocessing_params,
            train_set_metadata=train_set_metadata,
            random_seed=random_seed
        )
        training_set, test_set, validation_set = split_dataset_tvt(
            data,
            data['split']
        )
        if not skip_save_processed_input:
            logger.info('Writing dataset')
            data_train_hdf5_fp = replace_file_extension(data_train_csv, 'hdf5')
            data_utils.save_hdf5(
                data_train_hdf5_fp,
                training_set,
                train_set_metadata
            )
            if validation_set is not None:
                data_validation_hdf5_fp = replace_file_extension(
                    data_validation_csv,
                    'hdf5'
                )
                data_utils.save_hdf5(
                    data_validation_hdf5_fp,
                    validation_set,
                    train_set_metadata
                )
            if test_set is not None:
                data_test_hdf5_fp = replace_file_extension(data_test_csv,
                                                           'hdf5')
                data_utils.save_hdf5(
                    data_test_hdf5_fp,
                    test_set,
                    train_set_metadata
                )
            logger.info('Writing train set metadata with vocabulary')
            train_set_metadata_json_fp = replace_file_extension(data_train_csv,
                                                                'json')
            data_utils.save_json(train_set_metadata_json_fp, train_set_metadata)

    return training_set, test_set, validation_set, train_set_metadata
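The `replace_file_extension` helper used throughout this example is not shown here; a plausible sketch of its behavior, assuming it simply swaps the file extension, would be:

```python
import os

def replace_file_extension(file_path, extension):
    # Assumed behavior: 'data/train.csv' + 'hdf5' -> 'data/train.hdf5'
    return os.path.splitext(file_path)[0] + '.' + extension.lstrip('.')

# The saved companions then sit next to the original CSVs:
# replace_file_extension('train.csv', 'hdf5') -> 'train.hdf5'
# replace_file_extension('train.csv', 'json') -> 'train.json'
```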
Example #30
0
def save_evaluation_stats(test_stats, output_directory):
    test_stats_fn = os.path.join(output_directory, "test_statistics.json")
    save_json(test_stats_fn, test_stats)