Ejemplo n.º 1
0
def collect_weights(model_path,
                    tensors,
                    output_directory='results',
                    debug=False,
                    **kwargs):
    """Load a saved model, collect the requested weights and save them.

    The tensors are written into the directory whose name is returned by
    `find_non_existing_dir_by_adding_suffix(output_directory)`.

    :param model_path: path handed to `load_model_and_definition`.
    :param tensors: tensor names handed to `model.collect_weights`.
    :param output_directory: base name for the directory the tensors are
           saved in.
    :param debug: accepted for interface compatibility; not used by this
           function.
    :param kwargs: extra keyword arguments, accepted and ignored.
    :returns: None
    """
    # setup directories and file names
    experiment_dir_name = find_non_existing_dir_by_adding_suffix(
        output_directory)

    logger.info('Model path: {}'.format(model_path))
    logger.info('Output path: {}'.format(experiment_dir_name))
    logger.info('\n')

    model, _ = load_model_and_definition(model_path)

    # collect weights
    print_boxed('COLLECT WEIGHTS')
    try:
        collected_tensors = model.collect_weights(tensors)
    finally:
        # release the session even when collecting the weights fails
        model.close_session()

    # saving
    os.makedirs(experiment_dir_name)
    save_tensors(collected_tensors, experiment_dir_name)

    logger.info('Saved to: {0}'.format(experiment_dir_name))
Ejemplo n.º 2
0
def predict(dataset,
            train_set_metadata,
            model,
            model_definition,
            batch_size=128,
            evaluate_performance=True,
            debug=False):
    """Run the model on a dataset and merge predictions with statistics.

        :param dataset: Dataset containing the data to calculate
               the predictions from.
        :type dataset: Dataset
        :param train_set_metadata: Metadata forwarded to
               `calculate_overall_stats` when performance is evaluated.
        :param model: The trained model used to produce the predictions.
        :type model: Model
        :param model_definition: The model definition; its
               'output_features' entry is used for the overall statistics.
        :type model_definition: Dictionary
        :param batch_size: The size of batches when computing the predictions.
        :type batch_size: Integer
        :param evaluate_performance: If False, only the predictions are
               returned; if True, overall performance statistics are
               computed as well. That requires the data to also contain
               ground truth for the output features.
        :type evaluate_performance: Bool
        :param debug: Accepted for interface compatibility; not used by
               this function.
        :type debug: Boolean

        :returns: A dictionary keyed by output feature name holding the
                  predictions of each feature merged with its statistics
                  (logits are removed).
        """
    if is_on_master():
        print_boxed('PREDICT')

    stats, predictions = model.predict(
        dataset, batch_size, evaluate_performance=evaluate_performance)
    stats = stats or {}

    # fold every feature's predictions into its statistics entry
    for feature_name in predictions:
        feature_predictions = predictions[feature_name]

        # logits are not needed for the overall stats
        if LOGITS in feature_predictions:
            del feature_predictions[LOGITS]

        stats[feature_name] = {
            **stats.get(feature_name, {}),
            **feature_predictions
        }

    if evaluate_performance:
        calculate_overall_stats(stats,
                                model_definition['output_features'],
                                dataset,
                                train_set_metadata)

    return stats
Ejemplo n.º 3
0
def collect_weights(model_path,
                    tensors,
                    output_directory='results',
                    debug=False,
                    **kwargs):
    """Load a saved model, collect the requested weights and save them.

    If `output_directory` already exists, `_0`, `_1`, ... is appended until
    an unused path is found; the tensors are saved in that new directory.

    :param model_path: path handed to `load_model_and_definition`.
    :param tensors: tensor names handed to `model.collect_weights`.
    :param output_directory: base name for the directory the tensors are
           saved in.
    :param debug: accepted for interface compatibility; not used by this
           function.
    :param kwargs: extra keyword arguments, accepted and ignored.
    :returns: None
    """
    # setup directories and file names
    experiment_dir_name = output_directory
    suffix = 0
    while os.path.exists(experiment_dir_name):
        experiment_dir_name = output_directory + '_' + str(suffix)
        suffix += 1

    logger.info('Model path: {}'.format(model_path))
    logger.info('Output path: {}'.format(experiment_dir_name))
    logger.info('\n')

    model, _ = load_model_and_definition(model_path)

    # collect weights
    print_boxed('COLLECT WEIGHTS')
    try:
        collected_tensors = model.collect_weights(tensors)
    finally:
        # release the session even when collecting the weights fails
        model.close_session()

    # saving; makedirs also creates missing parent directories, which a
    # plain mkdir would reject with FileNotFoundError
    os.makedirs(experiment_dir_name)
    save_tensors(collected_tensors, experiment_dir_name)

    logger.info('Saved to: {0}'.format(experiment_dir_name))
Ejemplo n.º 4
0
def collect_weights(model_path: str, tensors: List[str], output_directory: str = "results", **kwargs) -> List[str]:
    """Load a pretrained model, collect the requested weights and save them.

    # Inputs
    :param model_path: (str) filepath to pre-trained model.
    :param tensors: (List[str]) names of the tensors whose weights are
        collected.
    :param output_directory: (str, default: `'results'`) the directory where
        collected weights will be stored; created if missing.

    # Return

    :return: (List[str]) list of filepath to `*.npy` files containing
        the weights, as returned by `save_tensors`.
    """
    logger.info(f"Model path: {model_path}")
    logger.info(f"Output path: {output_directory}")
    logger.info("\n")

    ludwig_model = LudwigModel.load(model_path)

    # gather the weights
    print_boxed("COLLECT WEIGHTS")
    weights = ludwig_model.collect_weights(tensors)

    # persist them
    os.makedirs(output_directory, exist_ok=True)
    written_files = save_tensors(weights, output_directory)

    logger.info(f"Saved to: {output_directory}")
    return written_files
Ejemplo n.º 5
0
def print_hyperopt_results(hyperopt_results: HyperoptResults):
    """Log a boxed header, then one score/parameters line per ordered trial."""
    emit = logger.info
    print_boxed("HYPEROPT RESULTS", print_fun=emit)
    for trial_results in hyperopt_results.ordered_trials:
        summary = f"score: {trial_results.metric_score:.6f} | parameters: {trial_results.parameters}"
        emit(summary)
    # trailing empty record separates the table from later output
    emit("")
Ejemplo n.º 6
0
def collect_activations(model_path,
                        layers,
                        dataset,
                        data_format=None,
                        batch_size=128,
                        output_directory='results',
                        gpus=None,
                        gpu_memory_limit=None,
                        allow_parallel_threads=True,
                        use_horovod=None,
                        debug=False,
                        **kwargs):
    """Uses the pretrained model to collect the activations of the given
    layers for the datapoints in the dataset and saves them to the output
    directory.

    :param model_path: Is the model from which the tensors will be collected
    :param layers: List of layer names we wish to collect the output from
    :param dataset: The data source handed to `model.collect_activations`
    :param data_format: Format of the data source, forwarded to
           `model.collect_activations`
    :param batch_size: Batch size
    :param output_directory: Output directory; created if missing
    :param gpus: Forwarded to `LudwigModel.load`
    :param gpu_memory_limit: (int: default: `None`) maximum memory in MB to allocate
           per GPU device.
    :param allow_parallel_threads: (bool, default: `True`) allow TensorFlow to use
           multithreading parallelism to improve performance at the cost of
           determinism.
    :param use_horovod: Forwarded to `LudwigModel.load`
    :param debug: Forwarded to `model.collect_activations`
    :returns: The value returned by `save_tensors` for the saved tensors

    """
    for message in ('Dataset path: {}'.format(dataset),
                    'Model path: {}'.format(model_path),
                    'Output path: {}'.format(output_directory),
                    '\n'):
        logger.info(message)

    load_options = dict(
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
        use_horovod=use_horovod,
    )
    ludwig_model = LudwigModel.load(model_path, **load_options)

    # run the model and gather the activations
    print_boxed('COLLECT ACTIVATIONS')
    activations = ludwig_model.collect_activations(
        layers,
        dataset,
        data_format=data_format,
        batch_size=batch_size,
        debug=debug,
    )

    # persist them
    os.makedirs(output_directory, exist_ok=True)
    written_files = save_tensors(activations, output_directory)

    logger.info('Saved to: {0}'.format(output_directory))
    return written_files
Ejemplo n.º 7
0
def predict(
        dataset,
        train_set_metadata,
        model,
        model_definition,
        batch_size=128,
        only_predictions=False,
        gpus=None,
        gpu_fraction=1.0,
        debug=False
):
    """Computes predictions based on the computed model.
        :param dataset: Dataset containing the data to calculate
               the predictions from.
        :type dataset: Dataset
        :param train_set_metadata: Metadata forwarded to
               `calculate_overall_stats` when statistics are computed.
        :param model: The trained model used to produce the predictions.
        :type model: Model
        :param model_definition: The model definition; its
               'output_features' entry is used for the overall statistics.
        :type model_definition: Dictionary
        :param batch_size: The size of batches when computing the predictions.
        :type batch_size: Integer
        :param only_predictions: If this parameter is True, only the predictions
               will be returned, if it is False, also performance metrics
               will be calculated on the predictions. It requires the data
               to contain also ground truth for the output features, otherwise
               the metrics cannot be computed.
        :type only_predictions: Bool
        :param gpus: Forwarded to `model.predict`.
        :type gpus: List
        :param gpu_fraction: Forwarded to `model.predict`.
        :type gpu_fraction: Integer
        :param debug: Accepted for interface compatibility; not used by
               this function.
        :type debug: Boolean

        :returns: A dictionary containing the predictions of each output feature,
                  alongside with statistics on the quality of those predictions
                  (if only_predictions is False).
        """
    if is_on_master():
        print_boxed('PREDICT')

    predict_options = dict(
        only_predictions=only_predictions,
        gpus=gpus,
        gpu_fraction=gpu_fraction,
    )
    results = model.predict(dataset, batch_size, **predict_options)

    # nothing more to do when the caller only wants raw predictions
    if only_predictions:
        return results

    output_features = model_definition['output_features']
    calculate_overall_stats(
        results, output_features, dataset, train_set_metadata)
    return results
Ejemplo n.º 8
0
def load(cache_dir=DEFAULT_CACHE_LOCATION,
         kaggle_username=None,
         kaggle_key=None):
    """Build the `WMT15` dataset object and return its unsplit data.

    :param cache_dir: cache location forwarded to `WMT15`.
    :param kaggle_username: Kaggle username forwarded to `WMT15`.
    :param kaggle_key: Kaggle key forwarded to `WMT15`.
    :returns: the result of `WMT15(...).load(split=False)`.
    """
    print_boxed("LOADING DATA")
    # the message previously read "WNMT15"; the dataset is WMT15
    logging.info("Loading WMT15 data from Kaggle (this may take a while).")
    dataset = WMT15(cache_dir=cache_dir,
                    kaggle_username=kaggle_username,
                    kaggle_key=kaggle_key)
    loaded_dataset = dataset.load(split=False)
    logging.info("Finished loading.")
    return loaded_dataset
Ejemplo n.º 9
0
def collect_weights(model_path,
                    tensors,
                    output_directory='results',
                    debug=False,
                    **kwargs):
    """Load a model, collect the requested weights and save them.

    :param model_path: path handed to `LudwigModel.load`.
    :param tensors: tensor names handed to `model.collect_weights`.
    :param output_directory: directory the tensors are saved in; created
           if missing.
    :param debug: accepted for interface compatibility; not used by this
           function.
    :param kwargs: extra keyword arguments, accepted and ignored.
    :returns: the value returned by `save_tensors` for the saved tensors.
    """
    for message in ('Model path: {}'.format(model_path),
                    'Output path: {}'.format(output_directory),
                    '\n'):
        logger.info(message)

    ludwig_model = LudwigModel.load(model_path)

    # gather the weights
    print_boxed('COLLECT WEIGHTS')
    weights = ludwig_model.collect_weights(tensors)

    # persist them
    os.makedirs(output_directory, exist_ok=True)
    written_files = save_tensors(weights, output_directory)

    logger.info('Saved to: {0}'.format(output_directory))
    return written_files
Ejemplo n.º 10
0
def collect_weights(model_path: str,
                    tensors: List[str],
                    output_directory: str = 'results',
                    debug: bool = False,
                    **kwargs) -> List[str]:
    """
    Loads a pretrained model, collects the requested weights and saves them.

    # Inputs
    :param model_path: (str) filepath to pre-trained model.
    :param tensors: (List[str]) names of the tensors whose weights are
        collected.
    :param output_directory: (str, default: `'results'`) the directory where
        collected weights will be stored; created if missing.
    :param debug: (bool, default: `False`) accepted for interface
        compatibility; not used by this function.

    # Return

    :return: (List[str]) list of filepath to `*.npy` files containing
        the weights, as returned by `save_tensors`.
    """
    for message in ('Model path: {}'.format(model_path),
                    'Output path: {}'.format(output_directory),
                    '\n'):
        logger.info(message)

    ludwig_model = LudwigModel.load(model_path)

    # gather the weights
    print_boxed('COLLECT WEIGHTS')
    weights = ludwig_model.collect_weights(tensors)

    # persist them
    os.makedirs(output_directory, exist_ok=True)
    written_files = save_tensors(weights, output_directory)

    logger.info('Saved to: {0}'.format(output_directory))
    return written_files
Ejemplo n.º 11
0
def print_hyperopt_results(hyperopt_results):
    """Log a boxed header, then one score/parameters line per result.

    :param hyperopt_results: iterable of mappings with 'metric_score' and
           'parameters' entries.
    """
    print_boxed('HYPEROPT RESULTS', print_fun=logger.info)
    line_template = 'score: {:.6f} | parameters: {}'
    for entry in hyperopt_results:
        score = entry['metric_score']
        parameters = entry['parameters']
        logger.info(line_template.format(score, parameters))
    # trailing empty record separates the table from later output
    logger.info("")
Ejemplo n.º 12
0
def collect_activations(model_path,
                        tensors,
                        data_csv=None,
                        data_hdf5=None,
                        split='test',
                        batch_size=128,
                        output_directory='results',
                        gpus=None,
                        gpu_fraction=1.0,
                        debug=False,
                        **kwargs):
    """Uses the pretrained model to collect the tensors corresponding to a
    datapoint in the dataset. Saves the tensors to the experiment directory

    If `output_directory` already exists, `_0`, `_1`, ... is appended until
    an unused path is found; the tensors are saved in that new directory.

    :param model_path: Is the model from which the tensors will be collected
    :param tensors: List containing the names of the tensors to collect
    :param data_csv: The CSV filepath which contains the datapoints from which
           the tensors are collected
    :param data_hdf5: The HDF5 file path if the CSV file path does not exist,
           an alternative source of providing the data to the model
    :param split: Split type
    :param batch_size: Batch size
    :param output_directory: Output directory
    :param gpus: The total number of GPUs that the model intends to use
    :param gpu_fraction: The fraction of each GPU that the model intends on
           using
    :param debug: Accepted for interface compatibility; not used by this
           function
    :returns: None

    """
    # setup directories and file names
    experiment_dir_name = output_directory
    suffix = 0
    while os.path.exists(experiment_dir_name):
        experiment_dir_name = output_directory + '_' + str(suffix)
        suffix += 1

    logger.info('Dataset path: {}'.format(
        data_csv if data_csv is not None else data_hdf5))
    logger.info('Model path: {}'.format(model_path))
    logger.info('Output path: {}'.format(experiment_dir_name))
    logger.info('\n')

    train_set_metadata_fp = os.path.join(model_path,
                                         TRAIN_SET_METADATA_FILE_NAME)

    # preprocessing
    dataset, _ = preprocess_for_prediction(
        model_path, split, data_csv, data_hdf5, train_set_metadata_fp)

    model, _ = load_model_and_definition(model_path)

    # collect activations
    print_boxed('COLLECT ACTIVATIONS')
    try:
        collected_tensors = model.collect_activations(
            dataset,
            tensors,
            batch_size,
            gpus=gpus,
            gpu_fraction=gpu_fraction)
    finally:
        # release the session even when collecting the activations fails
        model.close_session()

    # saving; makedirs also creates missing parent directories, which a
    # plain mkdir would reject with FileNotFoundError
    os.makedirs(experiment_dir_name)
    save_tensors(collected_tensors, experiment_dir_name)

    logger.info('Saved to: {0}'.format(experiment_dir_name))
Ejemplo n.º 13
0
def print_hyperopt_results(hyperopt_results: HyperoptResults):
    """Log a boxed header, then one score/parameters line per ordered trial."""
    print_boxed('HYPEROPT RESULTS', print_fun=logger.info)
    line_template = 'score: {:.6f} | parameters: {}'
    for trial in hyperopt_results.ordered_trials:
        score = trial.metric_score
        parameters = trial.parameters
        logger.info(line_template.format(score, parameters))
    # trailing empty record separates the table from later output
    logger.info("")
Ejemplo n.º 14
0
def collect_activations(model_path: str,
                        layers: List[str],
                        dataset: str,
                        data_format: str = None,
                        split: str = FULL,
                        batch_size: int = 128,
                        output_directory: str = 'results',
                        gpus: List[str] = None,
                        gpu_memory_limit: int = None,
                        allow_parallel_threads: bool = True,
                        backend: Union[Backend, str] = None,
                        debug: bool = False,
                        **kwargs) -> List[str]:
    """
    Uses the pretrained model to collect the activations of the given layers
    for the datapoints in the dataset. Saves the tensors to the output
    directory.

    # Inputs

    :param model_path: (str) filepath to pre-trained model.
    :param layers: (List[str]) list of strings for layer names in the model
        to collect activations.
    :param dataset: (str) source
        containing the data to make predictions.
    :param data_format: (str, default: `None`) format to interpret data
        sources. Will be inferred automatically if not specified.  Valid
        formats are `'auto'`, `'csv'`, `'excel'`, `'feather'`,
        `'fwf'`, `'hdf5'` (cache file produced during previous training),
        `'html'` (file containing a single HTML `<table>`), `'json'`, `'jsonl'`,
        `'parquet'`, `'pickle'` (pickled Pandas DataFrame), `'sas'`, `'spss'`,
        `'stata'`, `'tsv'`.
    :param split: (str, default: `full`) split on which
        to perform predictions. Valid values are `'training'`, `'validation'`,
        `'test'` and `'full'`.
    :param batch_size: (int, default `128`) size of batches for processing.
    :param output_directory: (str, default: `'results'`) the directory where
        the collected activations are saved; created if missing.
    :param gpus: (list, default: `None`) list of GPUs that are available;
        forwarded to `LudwigModel.load`.
    :param gpu_memory_limit: (int, default: `None`) maximum memory in MB to
        allocate per GPU device.
    :param allow_parallel_threads: (bool, default: `True`) allow TensorFlow
        to use multithreading parallelism to improve performance at
        the cost of determinism.
    :param backend: (Union[Backend, str]) `Backend` or string name
        of backend, forwarded to `LudwigModel.load`.
    :param debug: (bool, default: `False`) if `True` turns on `tfdbg` with
        `inf_or_nan` checks.

    # Return

    :return: (List[str]) list of filepath to `*.npy` files containing
        the activations, as returned by `save_tensors`.
    """
    for message in ('Dataset path: {}'.format(dataset),
                    'Model path: {}'.format(model_path),
                    'Output path: {}'.format(output_directory),
                    '\n'):
        logger.info(message)

    load_options = dict(
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
        backend=backend,
    )
    ludwig_model = LudwigModel.load(model_path, **load_options)

    # run the model and gather the activations
    print_boxed('COLLECT ACTIVATIONS')
    activations = ludwig_model.collect_activations(
        layers,
        dataset,
        data_format=data_format,
        split=split,
        batch_size=batch_size,
        debug=debug,
    )

    # persist them
    os.makedirs(output_directory, exist_ok=True)
    written_files = save_tensors(activations, output_directory)

    logger.info('Saved to: {0}'.format(output_directory))
    return written_files
Ejemplo n.º 15
0
def collect_activations(
        model_path,
        tensors,
        data_csv=None,
        data_hdf5=None,
        split=TEST,
        batch_size=128,
        output_directory='results',
        gpus=None,
        gpu_memory_limit=None,
        allow_parallel_threads=True,
        debug=False,
        **kwargs
):
    """Uses the pretrained model to collect the tensors corresponding to a
    datapoint in the dataset. Saves the tensors to the experiment directory

    :param model_path: Is the model from which the tensors will be collected
    :param tensors: List containing the names of the tensors to collect
    :param data_csv: The CSV filepath which contains the datapoints from which
           the tensors are collected
    :param data_hdf5: The HDF5 file path if the CSV file path does not exist,
           an alternative source of providing the data to the model
    :param split: Split type
    :param batch_size: Batch size
    :param output_directory: Base name of the output directory; the actual
           directory name comes from `find_non_existing_dir_by_adding_suffix`
    :param gpus: The total number of GPUs that the model intends to use
    :param gpu_memory_limit: (int: default: `None`) maximum memory in MB to allocate
           per GPU device.
    :param allow_parallel_threads: (bool, default: `True`) allow TensorFlow to use
           multithreading parallelism to improve performance at the cost of
           determinism.
    :param debug: Accepted for interface compatibility; not used by this
           function
    :returns: None

    """
    # pick the directory the tensors will end up in
    target_dir = find_non_existing_dir_by_adding_suffix(output_directory)

    dataset_source = data_hdf5 if data_csv is None else data_csv
    logger.info('Dataset path: {}'.format(dataset_source))
    logger.info('Model path: {}'.format(model_path))
    logger.info('Output path: {}'.format(target_dir))
    logger.info('\n')

    metadata_path = os.path.join(model_path, TRAIN_SET_METADATA_FILE_NAME)

    # preprocessing; the returned metadata is not needed here
    dataset, _ = preprocess_for_prediction(
        model_path, split, data_csv, data_hdf5, metadata_path)

    load_options = dict(
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
    )
    model, _ = load_model_and_definition(model_path, **load_options)

    # run the model and gather the activations
    print_boxed('COLLECT ACTIVATIONS')
    activations = model.collect_activations(dataset, tensors, batch_size)

    # persist them
    os.makedirs(target_dir)
    save_tensors(activations, target_dir)

    logger.info('Saved to: {0}'.format(target_dir))
Ejemplo n.º 16
0
def full_predict(model_path,
                 data_csv=None,
                 data_hdf5=None,
                 split=TEST,
                 batch_size=128,
                 skip_save_unprocessed_output=False,
                 skip_save_test_predictions=False,
                 skip_save_test_statistics=False,
                 output_directory='results',
                 evaluate_performance=True,
                 gpus=None,
                 gpu_fraction=1.0,
                 use_horovod=False,
                 debug=False,
                 **kwargs):
    """Preprocess data, run prediction with a saved model and save results.

    Postprocessing, printing and saving happen only when `is_on_master()`
    is true.

    :param model_path: path of the saved model; also the directory the
           train set metadata file is read from.
    :param data_csv: CSV data source, forwarded to
           `preprocess_for_prediction`.
    :param data_hdf5: HDF5 data source, forwarded to
           `preprocess_for_prediction`.
    :param split: split forwarded to `preprocess_for_prediction`.
    :param batch_size: batch size forwarded to `predict`.
    :param skip_save_unprocessed_output: forwarded to `postprocess`.
    :param skip_save_test_predictions: if True, `save_prediction_outputs`
           is not called.
    :param skip_save_test_statistics: if True, `save_test_statistics`
           is not called.
    :param output_directory: base name of the results directory; the actual
           name comes from `find_non_existing_dir_by_adding_suffix`.
    :param evaluate_performance: forwarded to preprocessing and `predict`;
           when True the test results are printed and, unless skipped,
           saved.
    :param gpus: forwarded to `predict`.
    :param gpu_fraction: forwarded to `predict`.
    :param use_horovod: forwarded to `load_model_and_definition`.
    :param debug: forwarded to `predict`.
    :param kwargs: extra keyword arguments, accepted and ignored.
    :returns: None
    """
    if is_on_master():
        logger.info('Dataset path: {}'.format(
            data_csv if data_csv is not None else data_hdf5))
        logger.info('Model path: {}'.format(model_path))
        logger.info('')

    train_set_metadata_json_fp = os.path.join(model_path,
                                              TRAIN_SET_METADATA_FILE_NAME)

    # preprocessing
    dataset, train_set_metadata = preprocess_for_prediction(
        model_path, split, data_csv, data_hdf5, train_set_metadata_json_fp,
        evaluate_performance)

    # run the prediction
    if is_on_master():
        print_boxed('LOADING MODEL')
    model, model_definition = load_model_and_definition(
        model_path, use_horovod=use_horovod)

    try:
        prediction_results = predict(dataset, train_set_metadata, model,
                                     model_definition, batch_size,
                                     evaluate_performance, gpus, gpu_fraction,
                                     debug)
    finally:
        # release the session even when prediction fails
        model.close_session()

    if is_on_master():
        # setup directories and file names
        experiment_dir_name = find_non_existing_dir_by_adding_suffix(
            output_directory)

        # if we are skipping all saving,
        # there is no need to create a directory that will remain empty
        should_create_exp_dir = not (skip_save_unprocessed_output
                                     and skip_save_test_predictions
                                     and skip_save_test_statistics)
        if should_create_exp_dir:
            os.makedirs(experiment_dir_name)

        # postprocess; we are already on the master here, so the skip flag
        # alone decides whether the unprocessed output is saved
        postprocessed_output = postprocess(
            prediction_results, model_definition['output_features'],
            train_set_metadata, experiment_dir_name,
            skip_save_unprocessed_output)

        if not skip_save_test_predictions:
            save_prediction_outputs(postprocessed_output, experiment_dir_name)

        if evaluate_performance:
            print_test_results(prediction_results)
            if not skip_save_test_statistics:
                save_test_statistics(prediction_results, experiment_dir_name)

        # only announce a save location when something could be saved there
        if should_create_exp_dir:
            logger.info('Saved to: {0}'.format(experiment_dir_name))
Ejemplo n.º 17
0
    def train(
            self,
            dataset=None,
            training_set=None,
            validation_set=None,
            test_set=None,
            training_set_metadata=None,
            data_format=None,
            experiment_name='api_experiment',
            model_name='run',
            model_resume_path=None,
            skip_save_training_description=False,
            skip_save_training_statistics=False,
            skip_save_model=False,
            skip_save_progress=False,
            skip_save_log=False,
            skip_save_processed_input=False,
            output_directory='results',
            random_seed=default_random_seed,
            debug=False,
            **kwargs
    ):
        """This function is used to perform a full training of the model on the
           specified dataset.

        # Inputs

        :param dataset: (string, dict, DataFrame) source containing the entire dataset.
               If it has a split column, it will be used for splitting (0: train,
               1: validation, 2: test), otherwise the dataset will be randomly split.
        :param training_set: (string, dict, DataFrame) source containing training data.
        :param validation_set: (string, dict, DataFrame) source containing validation data.
        :param test_set: (string, dict, DataFrame) source containing test data.
        :param training_set_metadata: (string, dict) metadata JSON file or loaded metadata.
               Intermediate preprocess structure containing the mappings of the input
               CSV created the first time a CSV file is used in the same
               directory with the same name and a '.json' extension.
        :param data_format: (string) format to interpret data sources. Will be inferred
               automatically if not specified.
        :param experiment_name: (string) a name for the experiment, used for the save
               directory
        :param model_name: (string) a name for the model, used for the save
               directory
        :param model_resume_path: (string) path of a the model directory to
               resume training of
        :param skip_save_training_description: (bool, default: `False`) disables
               saving the description JSON file.
        :param skip_save_training_statistics: (bool, default: `False`) disables
               saving training statistics JSON file.
        :param skip_save_model: (bool, default: `False`) disables
               saving model weights and hyperparameters each time the model
               improves. By default Ludwig saves model weights after each epoch
               the validation metric imrpvoes, but if the model is really big
               that can be time consuming if you do not want to keep
               the weights and just find out what performance can a model get
               with a set of hyperparameters, use this parameter to skip it,
               but the model will not be loadable later on.
        :param skip_save_progress: (bool, default: `False`) disables saving
               progress each epoch. By default Ludwig saves weights and stats
               after each epoch for enabling resuming of training, but if
               the model is really big that can be time consuming and will uses
               twice as much space, use this parameter to skip it, but training
               cannot be resumed later on.
        :param skip_save_log: (bool, default: `False`) disables saving TensorBoard
               logs. By default Ludwig saves logs for the TensorBoard, but if it
               is not needed turning it off can slightly increase the
               overall speed.
        :param skip_save_processed_input: (bool, default: `False`) skips saving
               intermediate HDF5 and JSON files
        :param output_directory: (string, default: `'results'`) directory that
               contains the results
        :param random_seed: (int, default`42`) a random seed that is going to be
               used anywhere there is a call to a random number generator: data
               splitting, parameter initialization and training set shuffling
        :param debug: (bool, default: `False`) enables debugging mode

        There are three ways to provide data: by dataframes using the `_df`
        parameters, by CSV using the `_csv` parameters and by HDF5 and JSON,
        using `_hdf5` and `_json` parameters.
        The DataFrame approach uses data previously obtained and put in a
        dataframe, the CSV approach loads data from a CSV file, while HDF5 and
        JSON load previously preprocessed HDF5 and JSON files (they are saved in
        the same directory of the CSV they are obtained from).
        For all three approaches either a full dataset can be provided (which
        will be split randomly according to the split probabilities defined in
        the model definition, by default 70% training, 10% validation and 20%
        test) or, if it contanins a plit column, it will be plit according to
        that column (interpreting 0 as training, 1 as validation and 2 as test).
        Alternatively separated dataframes / CSV / HDF5 files can beprovided
        for each split.

        During training the model and statistics will be saved in a directory
        `[output_dir]/[experiment_name]_[model_name]_n` where all variables are
        resolved to user spiecified ones and `n` is an increasing number
        starting from 0 used to differentiate different runs.


        # Return

        :return: ((dict, DataFrame)) tuple containing:
            - A dictionary of training statistics for each output feature containing
              loss and metrics values for each epoch. The second return
            - A Pandas DataFrame of preprocessed training data.
        """
        # setup directories and file names
        if model_resume_path is not None:
            if os.path.exists(model_resume_path):
                output_directory = model_resume_path
            else:
                if is_on_master():
                    logger.info(
                        'Model resume path does not exists, '
                        'starting training from scratch'
                    )
                model_resume_path = None

        if model_resume_path is None:
            if is_on_master():
                output_directory = get_output_directory(
                    output_directory,
                    experiment_name,
                    model_name
                )
            else:
                output_directory = None

        # if we are skipping all saving,
        # there is no need to create a directory that will remain empty
        should_create_output_directory = not (
                skip_save_training_description and
                skip_save_training_statistics and
                skip_save_model and
                skip_save_progress and
                skip_save_log and
                skip_save_processed_input
        )

        description_fn = training_stats_fn = model_dir = None
        if is_on_master():
            if should_create_output_directory:
                if not os.path.exists(output_directory):
                    os.makedirs(output_directory, exist_ok=True)
            description_fn, training_stats_fn, model_dir = get_file_names(
                output_directory)

        # save description
        if is_on_master():
            description = get_experiment_description(
                self.model_definition,
                dataset=dataset,
                training_set=training_set,
                validation_set=validation_set,
                test_set=test_set,
                training_set_metadata=training_set_metadata,
                data_format=data_format,
                random_seed=random_seed
            )
            if not skip_save_training_description:
                save_json(description_fn, description)
            # print description
            logger.info('Experiment name: {}'.format(experiment_name))
            logger.info('Model name: {}'.format(model_name))
            logger.info('Output directory: {}'.format(output_directory))
            logger.info('\n')
            for key, value in description.items():
                logger.info('{}: {}'.format(key, pformat(value, indent=4)))
            logger.info('\n')

        # preprocess
        preprocessed_data = preprocess_for_training(
            self.model_definition,
            dataset=dataset,
            training_set=training_set,
            validation_set=validation_set,
            test_set=test_set,
            training_set_metadata=training_set_metadata,
            data_format=data_format,
            skip_save_processed_input=skip_save_processed_input,
            preprocessing_params=self.model_definition[PREPROCESSING],
            random_seed=random_seed
        )

        (training_set,
         validation_set,
         test_set,
         training_set_metadata) = preprocessed_data
        self.training_set_metadata = training_set_metadata

        if is_on_master():
            logger.info('Training set: {0}'.format(training_set.size))
            if validation_set is not None:
                logger.info('Validation set: {0}'.format(validation_set.size))
            if test_set is not None:
                logger.info('Test set: {0}'.format(test_set.size))

        if is_on_master():
            if not skip_save_model:
                # save train set metadata
                os.makedirs(model_dir, exist_ok=True)
                save_json(
                    os.path.join(
                        model_dir,
                        TRAIN_SET_METADATA_FILE_NAME
                    ),
                    training_set_metadata
                )

        contrib_command("train_init", experiment_directory=output_directory,
                        experiment_name=experiment_name, model_name=model_name,
                        output_directory=output_directory,
                        resume=model_resume_path is not None)

        # Build model if not provided
        # if it was provided it means it was already loaded
        if not self.model:
            if is_on_master():
                print_boxed('MODEL', print_fun=logger.debug)
            # update model definition with metadata properties
            update_model_definition_with_metadata(
                self.model_definition,
                training_set_metadata
            )
            self.model = LudwigModel.create_model(self.model_definition,
                                                  random_seed=random_seed)

        # init trainer
        trainer = Trainer(
            **self.model_definition[TRAINING],
            resume=model_resume_path is not None,
            skip_save_model=skip_save_model,
            skip_save_progress=skip_save_progress,
            skip_save_log=skip_save_log,
            random_seed=random_seed,
            horoovd=self._horovod,
            debug=debug
        )

        contrib_command("train_model", self.model, self.model_definition,
                        self.model_definition_fp)

        # train model
        if is_on_master():
            print_boxed('TRAINING')
            if not skip_save_model:
                self.save_model_definition(model_dir)

        train_stats = trainer.train(
            self.model,
            training_set,
            validation_set=validation_set,
            test_set=test_set,
            save_path=model_dir,
        )

        train_trainset_stats, train_valiset_stats, train_testset_stats = train_stats
        train_stats = {
            TRAINING: train_trainset_stats,
            VALIDATION: train_valiset_stats,
            TEST: train_testset_stats
        }

        # save training statistics
        if is_on_master():
            if not skip_save_training_statistics:
                save_json(training_stats_fn, train_stats)

        # grab the results of the model with highest validation test performance
        validation_field = trainer.validation_field
        validation_metric = trainer.validation_metric
        validation_field_result = train_valiset_stats[validation_field]

        best_function = get_best_function(validation_metric)
        # results of the model with highest validation test performance
        if is_on_master() and validation_set is not None:
            epoch_best_vali_metric, best_vali_metric = best_function(
                enumerate(validation_field_result[validation_metric]),
                key=lambda pair: pair[1]
            )
            logger.info(
                'Best validation model epoch: {0}'.format(
                    epoch_best_vali_metric + 1)
            )
            logger.info(
                'Best validation model {0} on validation set {1}: {2}'.format(
                    validation_metric, validation_field, best_vali_metric
                ))
            if test_set is not None:
                best_vali_metric_epoch_test_metric = train_testset_stats[
                    validation_field][validation_metric][
                    epoch_best_vali_metric]

                logger.info(
                    'Best validation model {0} on test set {1}: {2}'.format(
                        validation_metric,
                        validation_field,
                        best_vali_metric_epoch_test_metric
                    )
                )
            logger.info(
                '\nFinished: {0}_{1}'.format(experiment_name, model_name))
            logger.info('Saved to: {0}'.format(output_directory))

        contrib_command("train_save", output_directory)

        self.training_set_metadata = training_set_metadata

        if not skip_save_model:
            # Load the best weights from saved checkpoint
            self.load_weights(model_dir)

        return train_stats, preprocessed_data, output_directory
Ejemplo n.º 18
0
def train(
        training_set,
        validation_set,
        test_set,
        model_definition,
        save_path='model',
        model_load_path=None,
        resume=False,
        skip_save_progress_weights=False,
        gpus=None,
        gpu_fraction=1.0,
        use_horovod=False,
        random_seed=default_random_seed,
        debug=False
):
    """Load or build a model and train it on the given datasets.

    When `model_load_path` is provided the model is loaded from it,
    otherwise a new `Model` is built from `model_definition`. The model is
    then trained through its `train` method.

    :param training_set: Dataset containing training data.
    :type training_set: Dataset
    :param validation_set: Dataset containing validation data.
    :type validation_set: Dataset
    :param test_set: Dataset containing test data.
    :type test_set: Dataset
    :param model_definition: Model definition which defines the different
           parameters of the model, features, preprocessing and training.
    :type model_definition: Dictionary
    :param save_path: The path to save the model to.
    :type save_path: filepath (str)
    :param model_load_path: If this is specified the loaded model will be used
           as initialization (useful for transfer learning).
    :type model_load_path: filepath (str)
    :param resume: Forwarded unchanged to `model.train`.
    :type resume: Boolean
    :param skip_save_progress_weights: Skips saving the weights at the end of
           each epoch. If this is true, training cannot be resumed from
           exactly the state at the end of the previous epoch.
    :type skip_save_progress_weights: Boolean
    :param gpus: List of GPUs that are available for training.
    :type gpus: List
    :param gpu_fraction: Fraction of the memory of each GPU to use at
           the beginning of the training. The memory may grow elastically.
    :type gpu_fraction: Float
    :param use_horovod: Forwarded to the `Model` constructor; only used when
           a new model is built.
    :type use_horovod: Boolean
    :param random_seed: Random seed used for weights initialization,
           splits and any other random function.
    :type random_seed: Integer
    :param debug: If true turns on tfdbg with inf_or_nan checks. Only
           forwarded when a new model is built.
    :type debug: Boolean
    :returns: A tuple `(model, train_result)` where `train_result` is the
              value returned by `model.train`.
    """
    if model_load_path is None:
        # Nothing to load: build a fresh model from the definition
        if is_on_master():
            print_boxed('BUILDING MODEL', print_fun=logging.debug)
        model = Model(
            model_definition['input_features'],
            model_definition['output_features'],
            model_definition['combiner'],
            model_definition['training'],
            model_definition['preprocessing'],
            use_horovod=use_horovod,
            random_seed=random_seed,
            debug=debug
        )
    else:
        # Start from a previously saved model
        if is_on_master():
            print_boxed('LOADING MODEL')
            logging.info('Loading model: {}\n'.format(model_load_path))
        model, _loaded_definition = load_model_and_definition(model_load_path)

    # Train model
    if is_on_master():
        print_boxed('TRAINING')

    # The 'training' section of the definition is expanded into additional
    # keyword arguments of `model.train`.
    train_result = model.train(
        training_set,
        validation_set=validation_set,
        test_set=test_set,
        save_path=save_path,
        resume=resume,
        skip_save_progress_weights=skip_save_progress_weights,
        gpus=gpus,
        gpu_fraction=gpu_fraction,
        random_seed=random_seed,
        **model_definition['training']
    )
    return model, train_result
Ejemplo n.º 19
0
def train(
        training_set,
        validation_set,
        test_set,
        model_definition,
        save_path='model',
        model_load_path=None,
        resume=False,
        skip_save_model=False,
        skip_save_progress=False,
        skip_save_log=False,
        gpus=None,
        gpu_fraction=1.0,
        use_horovod=False,
        random_seed=default_random_seed,
        debug=False
):
    """Load or build a model and train it on the given datasets.

    When `model_load_path` is provided the model is loaded from it,
    otherwise a new `Model` is built from `model_definition`. The
    "train_model" contrib command is issued before the model is trained
    through its `train` method.

    :param training_set: Dataset containing training data.
    :type training_set: Dataset
    :param validation_set: Dataset containing validation data.
    :type validation_set: Dataset
    :param test_set: Dataset containing test data.
    :type test_set: Dataset
    :param model_definition: Model definition which defines the different
           parameters of the model, features, preprocessing and training.
    :type model_definition: Dictionary
    :param save_path: The path to save the model to.
    :type save_path: filepath (str)
    :param model_load_path: If this is specified the loaded model will be used
           as initialization (useful for transfer learning).
    :type model_load_path: filepath (str)
    :param resume: Forwarded unchanged to `model.train`.
    :type resume: Boolean
    :param skip_save_model: Disables saving model weights and
           hyperparameters each time the model improves. By default Ludwig
           saves model weights after each epoch the validation measure
           improves, but if the model is really big that can be time
           consuming. If you do not want to keep the weights and just want
           to find out what performance a model can get with a set of
           hyperparameters, use this parameter to skip it, but the model
           will not be loadable later on.
    :type skip_save_model: Boolean
    :param skip_save_progress: Disables saving progress each epoch. By
           default Ludwig saves weights and stats after each epoch for
           enabling resuming of training, but if the model is really big
           that can be time consuming and will use twice as much space.
           Use this parameter to skip it, but training cannot be resumed
           later on.
    :type skip_save_progress: Boolean
    :param skip_save_log: Disables saving TensorBoard logs. By default
           Ludwig saves logs for the TensorBoard, but if it is not needed
           turning it off can slightly increase the overall speed.
    :type skip_save_log: Boolean
    :param gpus: List of GPUs that are available for training.
    :type gpus: List
    :param gpu_fraction: Fraction of the memory of each GPU to use at
           the beginning of the training. The memory may grow elastically.
    :type gpu_fraction: Float
    :param use_horovod: Forwarded to the `Model` constructor; only used when
           a new model is built.
    :type use_horovod: Boolean
    :param random_seed: Random seed used for weights initialization,
           splits and any other random function.
    :type random_seed: Integer
    :param debug: If true turns on tfdbg with inf_or_nan checks. Only
           forwarded when a new model is built.
    :type debug: Boolean
    :returns: A tuple `(model, train_result)` where `train_result` is the
              value returned by `model.train`.
    """
    if model_load_path is None:
        # Nothing to load: build a fresh model from the definition
        if is_on_master():
            print_boxed('BUILDING MODEL', print_fun=logger.debug)

        model = Model(
            model_definition['input_features'],
            model_definition['output_features'],
            model_definition['combiner'],
            model_definition['training'],
            model_definition['preprocessing'],
            use_horovod=use_horovod,
            random_seed=random_seed,
            debug=debug
        )
    else:
        # Start from a previously saved model
        if is_on_master():
            print_boxed('LOADING MODEL')
            logger.info('Loading model: {}\n'.format(model_load_path))
        model, _loaded_definition = load_model_and_definition(model_load_path)

    # Notify contrib hooks once the model object is available
    contrib_command("train_model", model, model_definition, model_load_path)

    # Train model
    if is_on_master():
        print_boxed('TRAINING')

    # The 'training' section of the definition is expanded into additional
    # keyword arguments of `model.train`.
    train_result = model.train(
        training_set,
        validation_set=validation_set,
        test_set=test_set,
        save_path=save_path,
        resume=resume,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        gpus=gpus,
        gpu_fraction=gpu_fraction,
        random_seed=random_seed,
        **model_definition['training']
    )
    return model, train_result
Ejemplo n.º 20
0
def full_predict(model_path,
                 data_csv=None,
                 data_hdf5=None,
                 split='test',
                 batch_size=128,
                 skip_save_unprocessed_output=False,
                 output_directory='results',
                 evaluate_performance=True,
                 gpus=None,
                 gpu_fraction=1.0,
                 use_horovod=False,
                 debug=False,
                 **kwargs):
    """Load a saved model, run predictions on a dataset and save the results.

    The data is preprocessed with `preprocess_for_prediction`, the model is
    loaded from `model_path`, predictions are computed with `predict` and,
    on the master process only, the postprocessed outputs (and optionally
    the prediction statistics) are written to a newly created directory.

    :param model_path: Directory of the saved model; the train set metadata
           file is also looked up inside it.
    :type model_path: filepath (str)
    :param data_csv: CSV file with the data to predict on, forwarded to
           `preprocess_for_prediction`.
    :type data_csv: filepath (str)
    :param data_hdf5: HDF5 file with the data to predict on, forwarded to
           `preprocess_for_prediction`.
    :type data_hdf5: filepath (str)
    :param split: Forwarded to `preprocess_for_prediction`.
    :type split: str
    :param batch_size: Forwarded to `predict`.
    :type batch_size: Integer
    :param skip_save_unprocessed_output: Forwarded to `postprocess`.
    :type skip_save_unprocessed_output: Boolean
    :param output_directory: Base name of the directory to save results in.
           If it already exists, `_0`, `_1`, ... is appended until an unused
           name is found. Missing parent directories are created.
    :type output_directory: filepath (str)
    :param evaluate_performance: Forwarded to `preprocess_for_prediction`
           and `predict`; when true the prediction statistics are also
           printed and saved.
    :type evaluate_performance: Boolean
    :param gpus: Forwarded to `predict`.
    :param gpu_fraction: Forwarded to `predict`.
    :param use_horovod: Forwarded to `load_model_and_definition`.
    :type use_horovod: Boolean
    :param debug: Forwarded to `predict`.
    :type debug: Boolean
    :param kwargs: Accepted and ignored.
    :returns: None
    """
    # setup directories and file names: use the first name among
    # `output_directory`, `output_directory_0`, `output_directory_1`, ...
    # that does not exist yet
    experiment_dir_name = output_directory
    suffix = 0
    while os.path.exists(experiment_dir_name):
        experiment_dir_name = output_directory + '_' + str(suffix)
        suffix += 1

    if is_on_master():
        logging.info('Dataset path: {}'.format(
            data_csv if data_csv is not None else data_hdf5))
        logging.info('Model path: {}'.format(model_path))
        logging.info('Output path: {}'.format(experiment_dir_name))
        logging.info('')

    train_set_metadata_json_fp = os.path.join(model_path,
                                              TRAIN_SET_METADATA_FILE_NAME)

    # preprocessing
    dataset, train_set_metadata = preprocess_for_prediction(
        model_path, split, data_csv, data_hdf5, train_set_metadata_json_fp,
        evaluate_performance)

    # run the prediction
    if is_on_master():
        print_boxed('LOADING MODEL')
    model, model_definition = load_model_and_definition(
        model_path, use_horovod=use_horovod)

    try:
        prediction_results = predict(dataset, train_set_metadata, model,
                                     model_definition, batch_size,
                                     evaluate_performance, gpus, gpu_fraction,
                                     debug)
    finally:
        # always close the model's session, even if prediction raises,
        # so that a failed run does not leave it open
        model.close_session()

    if is_on_master():
        # makedirs (rather than mkdir) so that an `output_directory` with
        # missing parent directories can be created as well
        os.makedirs(experiment_dir_name)

        # postprocess
        # NOTE(review): `not is_on_master()` is always False inside this
        # branch; it is kept so the argument value is unchanged.
        postprocessed_output = postprocess(
            prediction_results, model_definition['output_features'],
            train_set_metadata, experiment_dir_name,
            skip_save_unprocessed_output or not is_on_master())

        save_prediction_outputs(postprocessed_output, experiment_dir_name)

        if evaluate_performance:
            print_prediction_results(prediction_results)
            save_prediction_statistics(prediction_results, experiment_dir_name)

        logging.info('Saved to: {0}'.format(experiment_dir_name))