Example #1
def test_remote_training_set(tmpdir, fs_protocol):
    with tempfile.TemporaryDirectory() as outdir:
        output_directory = f"{fs_protocol}://{outdir}"

        input_features = [sequence_feature(reduce_output="sum")]
        output_features = [category_feature(vocab_size=2, reduce_input="sum")]

        csv_filename = os.path.join(tmpdir, "training.csv")
        data_csv = generate_data(input_features, output_features, csv_filename)
        val_csv = shutil.copyfile(data_csv,
                                  os.path.join(tmpdir, "validation.csv"))
        test_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "test.csv"))

        data_csv = f"{fs_protocol}://{os.path.abspath(data_csv)}"
        val_csv = f"{fs_protocol}://{os.path.abspath(val_csv)}"
        test_csv = f"{fs_protocol}://{os.path.abspath(test_csv)}"

        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {
                "type": "concat",
                "fc_size": 14
            },
            "training": {
                "epochs": 2
            },
        }

        config_path = os.path.join(tmpdir, "config.yaml")
        with open(config_path, "w") as f:
            yaml.dump(config, f)
        config_path = f"{fs_protocol}://{config_path}"

        backend_config = {
            "type": "local",
        }
        backend = initialize_backend(backend_config)

        model = LudwigModel(config_path, backend=backend)
        _, _, output_directory = model.train(training_set=data_csv,
                                             validation_set=val_csv,
                                             test_set=test_csv,
                                             output_directory=output_directory)
        model.predict(dataset=test_csv, output_directory=output_directory)

        # Train again, this time the cache will be used
        # Resume from the remote output directory
        model.train(training_set=data_csv,
                    validation_set=val_csv,
                    test_set=test_csv,
                    model_resume_path=output_directory)
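
Stripped of the pytest scaffolding, the example reduces to a small pattern: build a local backend with initialize_backend, point LudwigModel at fsspec-style URLs, and train. A minimal sketch of that pattern, assuming a config.yaml and training.csv on the local filesystem and using the "file" protocol as one scheme the fs_protocol fixture might supply:

import os

from ludwig.api import LudwigModel
from ludwig.backend import initialize_backend

# Assumed inputs: config.yaml and training.csv exist on the local filesystem.
config_path = f"file://{os.path.abspath('config.yaml')}"
data_csv = f"file://{os.path.abspath('training.csv')}"

backend = initialize_backend({"type": "local"})
model = LudwigModel(config_path, backend=backend)
_, _, output_directory = model.train(training_set=data_csv)
model.predict(dataset=data_csv, output_directory=output_directory)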
Example #2
def test_remote_training_set(tmpdir, fs_protocol, cache_format):
    with tempfile.TemporaryDirectory() as outdir:
        output_directory = f'{fs_protocol}://{outdir}'

        input_features = [sequence_feature(reduce_output='sum')]
        output_features = [category_feature(vocab_size=2, reduce_input='sum')]

        csv_filename = os.path.join(tmpdir, 'training.csv')
        data_csv = generate_data(input_features, output_features, csv_filename)
        val_csv = shutil.copyfile(data_csv,
                                  os.path.join(tmpdir, 'validation.csv'))
        test_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, 'test.csv'))

        data_csv = f'{fs_protocol}://{os.path.abspath(data_csv)}'
        val_csv = f'{fs_protocol}://{os.path.abspath(val_csv)}'
        test_csv = f'{fs_protocol}://{os.path.abspath(test_csv)}'

        config = {
            'input_features': input_features,
            'output_features': output_features,
            'combiner': {'type': 'concat', 'fc_size': 14},
            'training': {'epochs': 2},
        }

        config_path = os.path.join(tmpdir, 'config.yaml')
        with open(config_path, 'w') as f:
            yaml.dump(config, f)
        config_path = f'{fs_protocol}://{config_path}'

        backend_config = {
            'type': 'local',
            'cache_format': cache_format
        }
        backend = initialize_backend(backend_config)

        model = LudwigModel(config_path, backend=backend)
        _, _, output_directory = model.train(
            training_set=data_csv,
            validation_set=val_csv,
            test_set=test_csv,
            output_directory=output_directory
        )
        model.predict(dataset=test_csv,
                      output_directory=output_directory)

        # Train again, this time the cache will be used
        # Resume from the remote output directory
        model.train(training_set=data_csv,
                    validation_set=val_csv,
                    test_set=test_csv,
                    model_resume_path=output_directory)
Example #3
def test_ray_read_binary_files(tmpdir, df_engine):
    preprocessing_params = {
        "audio_file_length_limit_in_s": 3.0,
        "missing_value_strategy": BACKFILL,
        "in_memory": True,
        "padding_value": 0,
        "norm": "per_file",
        "audio_feature": {
            "type": "fbank",
            "window_length_in_s": 0.04,
            "window_shift_in_s": 0.02,
            "num_filter_bands": 80,
        },
    }
    audio_dest_folder = os.path.join(tmpdir, "generated_audio")
    audio_params = audio_feature(folder=audio_dest_folder, preprocessing=preprocessing_params)

    dataset_path = os.path.join(tmpdir, "dataset.csv")
    dataset_path = generate_data([audio_params], [], dataset_path, num_examples=100)
    dataset_path = create_data_set_to_use("csv", dataset_path, nan_percent=0.1)

    with ray_start(num_cpus=2, num_gpus=None):
        backend_config = {**RAY_BACKEND_CONFIG}
        backend_config["processor"]["type"] = df_engine
        backend = initialize_backend(backend_config)
        df = backend.df_engine.df_lib.read_csv(dataset_path)
        series = df[audio_params[COLUMN]]
        proc_col = backend.read_binary_files(series)
        proc_col = backend.df_engine.compute(proc_col)

        backend = initialize_backend(LOCAL_BACKEND)
        df = backend.df_engine.df_lib.read_csv(dataset_path)
        series = df[audio_params[COLUMN]]
        proc_col_expected = backend.read_binary_files(series)

        assert proc_col.equals(proc_col_expected)
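
RAY_BACKEND_CONFIG is a constant defined elsewhere in the test suite and not shown here. Judging from the Ray backend configuration in Example #10 below, it is presumably a dict along these lines; the concrete values are assumptions for illustration only:

# Hypothetical sketch of RAY_BACKEND_CONFIG, mirroring the structure used in
# Example #10; the actual values in the test suite may differ.
RAY_BACKEND_CONFIG = {
    "type": "ray",
    "processor": {
        "type": "dask",  # overridden per test via backend_config["processor"]["type"] = df_engine
    },
    "trainer": {
        "use_gpu": False,
        "num_workers": 2,
        "resources_per_worker": {"CPU": 1, "GPU": 0},
    },
}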
Example #4
def cli(sys_argv):
    parser = argparse.ArgumentParser(
        description="This script searches for optimal Hyperparameters",
        prog="ludwig hyperopt",
        usage="%(prog)s [options]",
    )

    # -------------------
    # Hyperopt parameters
    # -------------------
    parser.add_argument(
        "-sshs",
        "--skip_save_hyperopt_statistics",
        help="skips saving hyperopt statistics file",
        action="store_true",
        default=False,
    )

    # ----------------------------
    # Experiment naming parameters
    # ----------------------------
    parser.add_argument(
        "--output_directory",
        type=str,
        default="results",
        help="directory that contains the results",
    )
    parser.add_argument("--experiment_name",
                        type=str,
                        default="hyperopt",
                        help="experiment name")
    parser.add_argument("--model_name",
                        type=str,
                        default="run",
                        help="name for the model")

    # ---------------
    # Data parameters
    # ---------------
    parser.add_argument(
        "--dataset",
        help="input data file path. "
        "If it has a split column, it will be used for splitting "
        "(0: train, 1: validation, 2: test), "
        "otherwise the dataset will be randomly split",
    )
    parser.add_argument("--training_set", help="input train data file path")
    parser.add_argument("--validation_set",
                        help="input validation data file path")
    parser.add_argument("--test_set", help="input test data file path")

    parser.add_argument(
        "--training_set_metadata",
        help="input metadata JSON file path. An intermediate preprocessed file "
        "containing the mappings of the input file created "
        "the first time a file is used, in the same directory "
        "with the same name and a .json extension",
    )

    parser.add_argument(
        "--data_format",
        help="format of the input data",
        default="auto",
        choices=[
            "auto",
            "csv",
            "excel",
            "feather",
            "fwf",
            "hdf5",
            "html"
            "tables",
            "json",
            "jsonl",
            "parquet",
            "pickle",
            "sas",
            "spss",
            "stata",
            "tsv",
        ],
    )

    parser.add_argument(
        "-sspi",
        "--skip_save_processed_input",
        help="skips saving intermediate HDF5 and JSON files",
        action="store_true",
        default=False,
    )

    # ----------------
    # Model parameters
    # ----------------
    config = parser.add_mutually_exclusive_group(required=True)
    config.add_argument(
        "-c",
        "--config",
        type=load_yaml,
        help="Path to the YAML file containing the model configuration",
    )
    config.add_argument(
        "-cs",
        "--config_str",
        dest="config",
        type=load_config_from_str,
        help="JSON or YAML serialized string of the model configuration",
    )

    parser.add_argument(
        "-mlp",
        "--model_load_path",
        help="path of a pretrained model to load as initialization",
    )
    parser.add_argument(
        "-mrp",
        "--model_resume_path",
        help="path of the model directory to resume training of",
    )
    parser.add_argument(
        "-sstd",
        "--skip_save_training_description",
        action="store_true",
        default=False,
        help="disables saving the description JSON file",
    )
    parser.add_argument(
        "-ssts",
        "--skip_save_training_statistics",
        action="store_true",
        default=False,
        help="disables saving training statistics JSON file",
    )
    parser.add_argument(
        "-ssm",
        "--skip_save_model",
        action="store_true",
        default=False,
        help="disables saving weights each time the model improves. "
        "By default Ludwig saves  weights after each epoch "
        "the validation metric (improves, but  if the model is really big "
        "that can be time consuming. If you do not want to keep "
        "the weights and just find out what performance a model can get "
        "with a set of hyperparameters, use this parameter to skip it",
    )
    parser.add_argument(
        "-ssp",
        "--skip_save_progress",
        action="store_true",
        default=False,
        help="disables saving weights after each epoch. By default ludwig saves "
        "weights after each epoch for enabling resuming of training, but "
        "if the model is really big that can be time consuming and will "
        "save twice as much space, use this parameter to skip it",
    )
    parser.add_argument(
        "-ssl",
        "--skip_save_log",
        action="store_true",
        default=False,
        help="disables saving TensorBoard logs. By default Ludwig saves "
        "logs for the TensorBoard, but if it is not needed turning it off "
        "can slightly increase the overall speed",
    )

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument(
        "-rs",
        "--random_seed",
        type=int,
        default=42,
        help="a random seed that is going to be used anywhere there is a call "
        "to a random number generator: data splitting, parameter "
        "initialization and training set shuffling",
    )
    parser.add_argument(
        "-hlv",
        "--hyperopt_log_verbosity",
        type=int,
        default=3,
        choices=[0, 1, 2, 3],
        help="Controls verbosity of ray tune log messages.  Valid values: "
        "0 = silent, 1 = only status updates, 2 = status and brief trial "
        "results, 3 = status and detailed trial results.",
    )
    parser.add_argument("-g",
                        "--gpus",
                        nargs="+",
                        type=int,
                        default=None,
                        help="list of gpus to use")
    parser.add_argument("-gml",
                        "--gpu_memory_limit",
                        type=int,
                        default=None,
                        help="maximum memory in MB to allocate per GPU device")
    parser.add_argument(
        "-b",
        "--backend",
        help="specifies backend to use for parallel / distributed execution, "
        "defaults to local execution or Horovod if called using horovodrun",
        choices=ALL_BACKENDS,
    )
    parser.add_argument(
        "-l",
        "--logging_level",
        default="info",
        help="the level of logging to use",
        choices=["critical", "error", "warning", "info", "debug", "notset"],
    )

    add_contrib_callback_args(parser)
    args = parser.parse_args(sys_argv)

    args.callbacks = args.callbacks or []
    for callback in args.callbacks:
        callback.on_cmdline("hyperopt", *sys_argv)

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger("ludwig").setLevel(args.logging_level)
    global logger
    logger = logging.getLogger("ludwig.hyperopt")

    args.backend = initialize_backend(args.backend
                                      or args.config.get("backend"))
    if args.backend.is_coordinator():
        print_ludwig("Hyperopt", LUDWIG_VERSION)

    hyperopt_cli(**vars(args))
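
For reference, a hedged sketch of driving this parser programmatically; config.yaml and data.csv are placeholder paths, and the config file would need a valid hyperopt section for hyperopt_cli to run to completion:

# Hypothetical invocation of the CLI entry point above; paths are placeholders.
cli([
    "--config", "config.yaml",        # parsed by load_yaml into a config dict
    "--dataset", "data.csv",
    "--output_directory", "results",
    "--logging_level", "info",
])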
Example #5
def experiment_cli(config: dict,
                   config_file: str = None,
                   dataset: Union[str, dict, pd.DataFrame] = None,
                   training_set: Union[str, dict, pd.DataFrame] = None,
                   validation_set: Union[str, dict, pd.DataFrame] = None,
                   test_set: Union[str, dict, pd.DataFrame] = None,
                   training_set_metadata: Union[str, dict] = None,
                   data_format: str = None,
                   experiment_name: str = 'experiment',
                   model_name: str = 'run',
                   model_load_path: str = None,
                   model_resume_path: str = None,
                   eval_split: str = TEST,
                   skip_save_training_description: bool = False,
                   skip_save_training_statistics: bool = False,
                   skip_save_model: bool = False,
                   skip_save_progress: bool = False,
                   skip_save_log: bool = False,
                   skip_save_processed_input: bool = False,
                   skip_save_unprocessed_output: bool = False,
                   skip_save_predictions: bool = False,
                   skip_save_eval_stats: bool = False,
                   skip_collect_predictions: bool = False,
                   skip_collect_overall_stats: bool = False,
                   output_directory: str = 'results',
                   gpus: Union[str, int, List[int]] = None,
                   gpu_memory_limit: int = None,
                   allow_parallel_threads: bool = True,
                   backend: Union[Backend, str] = None,
                   random_seed: int = default_random_seed,
                   debug: bool = False,
                   logging_level: int = logging.INFO,
                   **kwargs):
    """Trains a model on a dataset's training and validation splits and
    uses it to predict on the test split.
    It saves the trained model and the statistics of training and testing.

    # Inputs

    :param config: (dict) config which defines the different
        parameters of the model, features, preprocessing and training.
    :param config_file: (str, default: `None`) the filepath string
        that specifies the config.  It is a yaml file.
    :param dataset: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing the entire dataset to be used in the experiment.
        If it has a split column, it will be used for splitting (0 for train,
        1 for validation, 2 for test), otherwise the dataset will be
        randomly split.
    :param training_set: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing training data.
    :param validation_set: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing validation data.
    :param test_set: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing test data.
    :param training_set_metadata: (Union[str, dict], default: `None`)
        metadata JSON file or loaded metadata.  Intermediate preprocessed
        structure containing the mappings of the input
        dataset created the first time an input file is used in the same
        directory with the same name and a '.meta.json' extension.
    :param data_format: (str, default: `None`) format to interpret data
        sources. Will be inferred automatically if not specified.  Valid
        formats are `'auto'`, `'csv'`, `'excel'`, `'feather'`,
        `'fwf'`, `'hdf5'` (cache file produced during previous training),
        `'html'` (file containing a single HTML `<table>`), `'json'`, `'jsonl'`,
        `'parquet'`, `'pickle'` (pickled Pandas DataFrame), `'sas'`, `'spss'`,
        `'stata'`, `'tsv'`.
    :param experiment_name: (str, default: `'experiment'`) name for
        the experiment.
    :param model_name: (str, default: `'run'`) name of the model that is
        being used.
    :param model_load_path: (str, default: `None`) if this is specified the
        loaded model will be used as initialization
        (useful for transfer learning).
    :param model_resume_path: (str, default: `None`) resumes training of
        the model from the path specified. The config is restored.
        In addition to config, training statistics and loss for
        epoch and the state of the optimizer are restored such that
        training can be effectively continued from a previously interrupted
        training process.
    :param eval_split: (str, default: `test`) split on which
        to perform evaluation. Valid values are `training`, `validation`
        and `test`.
    :param skip_save_training_description: (bool, default: `False`) disables
        saving the description JSON file.
    :param skip_save_training_statistics: (bool, default: `False`) disables
        saving training statistics JSON file.
    :param skip_save_model: (bool, default: `False`) disables
        saving model weights and hyperparameters each time the model
        improves. By default Ludwig saves model weights after each epoch
        the validation metric improves, but if the model is really big
        that can be time consuming. If you do not want to keep
        the weights and just find out what performance a model can get
        with a set of hyperparameters, use this parameter to skip it,
        but the model will not be loadable later on and the returned model
        will have the weights obtained at the end of training, instead of
        the weights of the epoch with the best validation performance.
    :param skip_save_progress: (bool, default: `False`) disables saving
        progress each epoch. By default Ludwig saves weights and stats
        after each epoch for enabling resuming of training, but if
        the model is really big that can be time consuming and will use
        twice as much space, use this parameter to skip it, but training
        cannot be resumed later on.
    :param skip_save_log: (bool, default: `False`) disables saving
        TensorBoard logs. By default Ludwig saves logs for the TensorBoard,
        but if it is not needed turning it off can slightly increase the
        overall speed.
    :param skip_save_processed_input: (bool, default: `False`) if input
        dataset is provided it is preprocessed and cached by saving HDF5
        and JSON files to avoid running the preprocessing again. If this
        parameter is `True`, the HDF5 and JSON files are not saved.
    :param skip_save_unprocessed_output: (bool, default: `False`) by default
        predictions and their probabilities are saved in both raw
        unprocessed numpy files containing tensors and as postprocessed
        CSV files (one for each output feature). If this parameter is True,
        only the CSV ones are saved and the numpy ones are skipped.
    :param skip_save_predictions: (bool, default: `False`) skips saving test
        predictions CSV files.
    :param skip_save_eval_stats: (bool, default: `False`) skips saving test
        statistics JSON file.
    :param skip_collect_predictions: (bool, default: `False`) skips
        collecting post-processed predictions during eval.
    :param skip_collect_overall_stats: (bool, default: `False`) skips
        collecting overall stats during eval.
    :param output_directory: (str, default: `'results'`) the directory that
        will contain the training statistics, TensorBoard logs, the saved
        model and the training progress files.
    :param gpus: (list, default: `None`) list of GPUs that are available
        for training.
    :param gpu_memory_limit: (int, default: `None`) maximum memory in MB to
        allocate per GPU device.
    :param allow_parallel_threads: (bool, default: `True`) allow TensorFlow
        to use multithreading parallelism to improve performance at
        the cost of determinism.
    :param backend: (Union[Backend, str]) `Backend` or string name
        of backend to use to execute preprocessing / training steps.
    :param random_seed: (int: default: 42) random seed used for weights
        initialization, splits and any other random function.
    :param debug: (bool, default: `False`) if `True` turns on `tfdbg` with
        `inf_or_nan` checks.
    :param logging_level: (int) Log level that will be sent to stderr.

    # Return
    :return: (Tuple[LudwigModel, dict, dict, tuple, str]) `(model, evaluation_statistics, training_statistics, preprocessed_data, output_directory)`
        `model` is the trained LudwigModel instance,
        `evaluation_statistics` is a dictionary with evaluation performance
            statistics on the test_set,
        `training_statistics` is a dictionary of training statistics for
            each output feature containing loss and metrics values for each
            epoch,
        `preprocessed_data` is a tuple containing the preprocessed
            `(training_set, validation_set, test_set)`,
        `output_directory` is the filepath string where results are stored.

    """
    backend = initialize_backend(backend)

    config = check_which_config(config, config_file)

    if model_load_path:
        model = LudwigModel.load(model_load_path)
    else:
        model = LudwigModel(
            config=config,
            logging_level=logging_level,
            backend=backend,
            gpus=gpus,
            gpu_memory_limit=gpu_memory_limit,
            allow_parallel_threads=allow_parallel_threads,
        )
    (eval_stats, train_stats, preprocessed_data,
     output_directory) = model.experiment(
         dataset=dataset,
         training_set=training_set,
         validation_set=validation_set,
         test_set=test_set,
         training_set_metadata=training_set_metadata,
         data_format=data_format,
         experiment_name=experiment_name,
         model_name=model_name,
         model_resume_path=model_resume_path,
         eval_split=eval_split,
         skip_save_training_description=skip_save_training_description,
         skip_save_training_statistics=skip_save_training_statistics,
         skip_save_model=skip_save_model,
         skip_save_progress=skip_save_progress,
         skip_save_log=skip_save_log,
         skip_save_processed_input=skip_save_processed_input,
         skip_save_unprocessed_output=skip_save_unprocessed_output,
         skip_save_predictions=skip_save_predictions,
         skip_save_eval_stats=skip_save_eval_stats,
         skip_collect_predictions=skip_collect_predictions,
         skip_collect_overall_stats=skip_collect_overall_stats,
         output_directory=output_directory,
         random_seed=random_seed,
         debug=debug,
     )

    return model, eval_stats, train_stats, preprocessed_data, output_directory
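
A minimal sketch of calling experiment_cli directly with an in-memory config; the feature names, dataset path and epoch count are placeholders rather than values from the original example:

import pandas as pd

# Hypothetical toy config and dataset, for illustration only.
config = {
    "input_features": [{"name": "text", "type": "text"}],
    "output_features": [{"name": "label", "type": "category"}],
    "training": {"epochs": 2},
}
df = pd.read_csv("data.csv")  # placeholder path

model, eval_stats, train_stats, preprocessed_data, output_directory = experiment_cli(
    config,
    dataset=df,
    output_directory="results",
)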
Example #6
def cli(sys_argv):
    parser = argparse.ArgumentParser(
        description='This script trains and evaluates a model',
        prog='ludwig experiment',
        usage='%(prog)s [options]')

    # ----------------------------
    # Experiment naming parameters
    # ----------------------------
    parser.add_argument('--output_directory',
                        type=str,
                        default='results',
                        help='directory that contains the results')
    parser.add_argument('--experiment_name',
                        type=str,
                        default='experiment',
                        help='experiment name')
    parser.add_argument('--model_name',
                        type=str,
                        default='run',
                        help='name for the model')

    # ---------------
    # Data parameters
    # ---------------
    parser.add_argument(
        '--dataset',
        help='input data file path. '
        'If it has a split column, it will be used for splitting '
        '(0: train, 1: validation, 2: test), '
        'otherwise the dataset will be randomly split')
    parser.add_argument('--training_set', help='input train data file path')
    parser.add_argument('--validation_set',
                        help='input validation data file path')
    parser.add_argument('--test_set', help='input test data file path')

    parser.add_argument(
        '--training_set_metadata',
        help='input metadata JSON file path. An intermediate preprocessed file '
        'containing the mappings of the input file created '
        'the first time a file is used, in the same directory '
        'with the same name and a .json extension')

    parser.add_argument('--data_format',
                        help='format of the input data',
                        default='auto',
                        choices=[
                            'auto', 'csv', 'excel', 'feather', 'fwf', 'hdf5',
                            'html',
                            'tables', 'json', 'jsonl', 'parquet', 'pickle',
                            'sas', 'spss', 'stata', 'tsv'
                        ])

    parser.add_argument('-es',
                        '--eval_split',
                        default=TEST,
                        choices=[TRAINING, VALIDATION, TEST, FULL],
                        help='the split to evaluate the model on')

    parser.add_argument('-sspi',
                        '--skip_save_processed_input',
                        help='skips saving intermediate HDF5 and JSON files',
                        action='store_true',
                        default=False)
    parser.add_argument('-ssuo',
                        '--skip_save_unprocessed_output',
                        help='skips saving intermediate NPY output files',
                        action='store_true',
                        default=False)

    # -----------------
    # K-fold parameters
    # -----------------
    parser.add_argument(
        '-kf',
        '--k_fold',
        type=int,
        default=None,
        help='number of folds for a k-fold cross validation run ')
    parser.add_argument(
        '-skfsi',
        '--skip_save_k_fold_split_indices',
        action='store_true',
        default=False,
        help='disables saving the indices generated to split the training data '
        'set for the k-fold cross validation run; if they are not needed, '
        'skipping them can slightly speed up the run')

    # ----------------
    # Model parameters
    # ----------------
    config = parser.add_mutually_exclusive_group(required=True)
    config.add_argument('-c', '--config', type=yaml.safe_load, help='config')
    config.add_argument(
        '-cf',
        '--config_file',
        help='YAML file describing the model. Ignores --model_hyperparameters')

    parser.add_argument(
        '-mlp',
        '--model_load_path',
        help='path of a pretrained model to load as initialization')
    parser.add_argument(
        '-mrp',
        '--model_resume_path',
        help='path of the model directory to resume training of')
    parser.add_argument('-sstd',
                        '--skip_save_training_description',
                        action='store_true',
                        default=False,
                        help='disables saving the description JSON file')
    parser.add_argument('-ssts',
                        '--skip_save_training_statistics',
                        action='store_true',
                        default=False,
                        help='disables saving training statistics JSON file')
    parser.add_argument('-sstp',
                        '--skip_save_predictions',
                        help='skips saving test predictions CSV files',
                        action='store_true',
                        default=False)
    parser.add_argument('-sstes',
                        '--skip_save_eval_stats',
                        help='skips saving eval statistics JSON file',
                        action='store_true',
                        default=False)
    parser.add_argument(
        '-ssm',
        '--skip_save_model',
        action='store_true',
        default=False,
        help='disables saving model weights and hyperparameters each time '
        'the model improves. '
        'By default Ludwig saves model weights after each epoch '
        'the validation metric improves, but if the model is really big '
        'that can be time consuming. If you do not want to keep '
        'the weights and just find out what performance a model can get '
        'with a set of hyperparameters, use this parameter to skip it, '
        'but the model will not be loadable later on')
    parser.add_argument(
        '-ssp',
        '--skip_save_progress',
        action='store_true',
        default=False,
        help='disables saving progress each epoch. By default Ludwig saves '
        'weights and stats after each epoch for enabling resuming '
        'of training, but if the model is really big that can be '
        'time consuming and will use twice as much space, use '
        'this parameter to skip it, but training cannot be resumed '
        'later on')
    parser.add_argument(
        '-ssl',
        '--skip_save_log',
        action='store_true',
        default=False,
        help='disables saving TensorBoard logs. By default Ludwig saves '
        'logs for the TensorBoard, but if it is not needed turning it off '
        'can slightly increase the overall speed')

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument(
        '-rs',
        '--random_seed',
        type=int,
        default=42,
        help='a random seed that is going to be used anywhere there is a call '
        'to a random number generator: data splitting, parameter '
        'initialization and training set shuffling')
    parser.add_argument('-g',
                        '--gpus',
                        nargs='+',
                        type=int,
                        default=None,
                        help='list of GPUs to use')
    parser.add_argument('-gml',
                        '--gpu_memory_limit',
                        type=int,
                        default=None,
                        help='maximum memory in MB to allocate per GPU device')
    parser.add_argument(
        '-dpt',
        '--disable_parallel_threads',
        action='store_false',
        dest='allow_parallel_threads',
        help='disable TensorFlow from using multithreading for reproducibility'
    )
    parser.add_argument(
        "-b",
        "--backend",
        help='specifies backend to use for parallel / distributed execution, '
        'defaults to local execution or Horovod if called using horovodrun',
        choices=ALL_BACKENDS,
    )
    parser.add_argument('-dbg',
                        '--debug',
                        action='store_true',
                        default=False,
                        help='enables debugging mode')
    parser.add_argument(
        '-l',
        '--logging_level',
        default='info',
        help='the level of logging to use',
        choices=['critical', 'error', 'warning', 'info', 'debug', 'notset'])

    args = parser.parse_args(sys_argv)

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger('ludwig').setLevel(args.logging_level)
    global logger
    logger = logging.getLogger('ludwig.experiment')

    args.backend = initialize_backend(args.backend)
    if args.backend.is_coordinator():
        print_ludwig('Experiment', LUDWIG_VERSION)

    if args.k_fold is None:
        experiment_cli(**vars(args))
    else:
        kfold_cross_validate_cli(**vars(args))
Example #7
def cli(sys_argv):
    parser = argparse.ArgumentParser(
        description='This script loads a pretrained model '
        'and uses it to predict',
        prog='ludwig predict',
        usage='%(prog)s [options]')

    # ---------------
    # Data parameters
    # ---------------
    parser.add_argument('--dataset',
                        help='input data file path',
                        required=True)
    parser.add_argument('--data_format',
                        help='format of the input data',
                        default='auto',
                        choices=[
                            'auto', 'csv', 'excel', 'feather', 'fwf', 'hdf5',
                            'html', 'tables', 'json', 'jsonl', 'parquet',
                            'pickle', 'sas', 'spss', 'stata', 'tsv'
                        ])
    parser.add_argument('-s',
                        '--split',
                        default=FULL,
                        choices=[TRAINING, VALIDATION, TEST, FULL],
                        help='the split to test the model on')

    # ----------------
    # Model parameters
    # ----------------
    parser.add_argument('-m',
                        '--model_path',
                        help='model to load',
                        required=True)

    # -------------------------
    # Output results parameters
    # -------------------------
    parser.add_argument('-od',
                        '--output_directory',
                        type=str,
                        default='results',
                        help='directory that contains the results')
    parser.add_argument('-ssuo',
                        '--skip_save_unprocessed_output',
                        help='skips saving intermediate NPY output files',
                        action='store_true',
                        default=False)
    parser.add_argument('-sstp',
                        '--skip_save_predictions',
                        help='skips saving predictions CSV files',
                        action='store_true',
                        default=False)

    # ------------------
    # Generic parameters
    # ------------------
    parser.add_argument('-bs',
                        '--batch_size',
                        type=int,
                        default=128,
                        help='size of batches')

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument('-g',
                        '--gpus',
                        type=int,
                        default=0,
                        help='list of GPUs to use')
    parser.add_argument('-gml',
                        '--gpu_memory_limit',
                        type=int,
                        default=None,
                        help='maximum memory in MB to allocate per GPU device')
    parser.add_argument(
        '-dpt',
        '--disable_parallel_threads',
        action='store_false',
        dest='allow_parallel_threads',
        help='disable TensorFlow from using multithreading for reproducibility'
    )
    parser.add_argument(
        "-b",
        "--backend",
        help='specifies backend to use for parallel / distributed execution, '
        'defaults to local execution or Horovod if called using horovodrun',
        choices=ALL_BACKENDS,
    )
    parser.add_argument('-dbg',
                        '--debug',
                        action='store_true',
                        default=False,
                        help='enables debugging mode')
    parser.add_argument(
        '-l',
        '--logging_level',
        default='info',
        help='the level of logging to use',
        choices=['critical', 'error', 'warning', 'info', 'debug', 'notset'])

    add_contrib_callback_args(parser)
    args = parser.parse_args(sys_argv)

    args.callbacks = args.callbacks or []
    for callback in args.callbacks:
        callback.on_cmdline('predict', *sys_argv)

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger('ludwig').setLevel(args.logging_level)
    global logger
    logger = logging.getLogger('ludwig.predict')

    args.backend = initialize_backend(args.backend)
    if args.backend.is_coordinator():
        print_ludwig('Predict', LUDWIG_VERSION)
        logger.info('Dataset path: {}'.format(args.dataset))
        logger.info('Model path: {}'.format(args.model_path))
        logger.info('')

    predict_cli(**vars(args))
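
A sketch of a programmatic invocation of this parser; the dataset path is a placeholder, and the model path is assumed to point at the model directory produced by a previous ludwig train or ludwig experiment run:

# Hypothetical invocation; both paths are placeholders.
cli([
    "--dataset", "test.csv",
    "--model_path", "results/experiment_run/model",
    "--output_directory", "predictions",
])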
Example #8
def hyperopt(
    config: Union[str, dict],
    dataset: Union[str, dict, pd.DataFrame] = None,
    training_set: Union[str, dict, pd.DataFrame] = None,
    validation_set: Union[str, dict, pd.DataFrame] = None,
    test_set: Union[str, dict, pd.DataFrame] = None,
    training_set_metadata: Union[str, dict] = None,
    data_format: str = None,
    experiment_name: str = 'hyperopt',
    model_name: str = 'run',
    skip_save_training_description: bool = False,
    skip_save_training_statistics: bool = False,
    skip_save_model: bool = False,
    skip_save_progress: bool = False,
    skip_save_log: bool = False,
    skip_save_processed_input: bool = True,
    skip_save_unprocessed_output: bool = False,
    skip_save_predictions: bool = False,
    skip_save_eval_stats: bool = False,
    skip_save_hyperopt_statistics: bool = False,
    output_directory: str = 'results',
    gpus: Union[str, int, List[int]] = None,
    gpu_memory_limit: int = None,
    allow_parallel_threads: bool = True,
    backend: Union[Backend, str] = None,
    random_seed: int = default_random_seed,
    debug: bool = False,
    **kwargs,
) -> List[dict]:
    """This method performs an hyperparameter optimization.

    # Inputs

    :param config: (Union[str, dict]) config which defines
        the different parameters of the model, features, preprocessing and
        training.  If `str`, filepath to yaml configuration file.
    :param dataset: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing the entire dataset to be used in the experiment.
        If it has a split column, it will be used for splitting (0 for train,
        1 for validation, 2 for test), otherwise the dataset will be
        randomly split.
    :param training_set: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing training data.
    :param validation_set: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing validation data.
    :param test_set: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing test data.
    :param training_set_metadata: (Union[str, dict], default: `None`)
        metadata JSON file or loaded metadata.  Intermediate preprocessed
        structure containing the mappings of the input
        dataset created the first time an input file is used in the same
        directory with the same name and a '.meta.json' extension.
    :param data_format: (str, default: `None`) format to interpret data
        sources. Will be inferred automatically if not specified.  Valid
        formats are `'auto'`, `'csv'`, `'df'`, `'dict'`, `'excel'`, `'feather'`,
        `'fwf'`, `'hdf5'` (cache file produced during previous training),
        `'html'` (file containing a single HTML `<table>`), `'json'`, `'jsonl'`,
        `'parquet'`, `'pickle'` (pickled Pandas DataFrame), `'sas'`, `'spss'`,
        `'stata'`, `'tsv'`.
    :param experiment_name: (str, default: `'hyperopt'`) name for
        the experiment.
    :param model_name: (str, default: `'run'`) name of the model that is
        being used.
    :param skip_save_training_description: (bool, default: `False`) disables
        saving the description JSON file.
    :param skip_save_training_statistics: (bool, default: `False`) disables
        saving training statistics JSON file.
    :param skip_save_model: (bool, default: `False`) disables
        saving model weights and hyperparameters each time the model
        improves. By default Ludwig saves model weights after each epoch
        the validation metric improves, but if the model is really big
        that can be time consuming. If you do not want to keep
        the weights and just find out what performance a model can get
        with a set of hyperparameters, use this parameter to skip it,
        but the model will not be loadable later on and the returned model
        will have the weights obtained at the end of training, instead of
        the weights of the epoch with the best validation performance.
    :param skip_save_progress: (bool, default: `False`) disables saving
        progress each epoch. By default Ludwig saves weights and stats
        after each epoch for enabling resuming of training, but if
        the model is really big that can be time consuming and will use
        twice as much space, use this parameter to skip it, but training
        cannot be resumed later on.
    :param skip_save_log: (bool, default: `False`) disables saving
        TensorBoard logs. By default Ludwig saves logs for the TensorBoard,
        but if it is not needed turning it off can slightly increase the
        overall speed.
    :param skip_save_processed_input: (bool, default: `True`) if input
        dataset is provided it is preprocessed and cached by saving HDF5
        and JSON files to avoid running the preprocessing again. If this
        parameter is `True`, the HDF5 and JSON files are not saved.
    :param skip_save_unprocessed_output: (bool, default: `False`) by default
        predictions and their probabilities are saved in both raw
        unprocessed numpy files containing tensors and as postprocessed
        CSV files (one for each output feature). If this parameter is True,
        only the CSV ones are saved and the numpy ones are skipped.
    :param skip_save_predictions: (bool, default: `False`) skips saving test
        predictions CSV files.
    :param skip_save_eval_stats: (bool, default: `False`) skips saving test
        statistics JSON file.
    :param skip_save_hyperopt_statistics: (bool, default: `False`) skips saving
        hyperopt stats file.
    :param output_directory: (str, default: `'results'`) the directory that
        will contain the training statistics, TensorBoard logs, the saved
        model and the training progress files.
    :param gpus: (list, default: `None`) list of GPUs that are available
        for training.
    :param gpu_memory_limit: (int, default: `None`) maximum memory in MB to
        allocate per GPU device.
    :param allow_parallel_threads: (bool, default: `True`) allow TensorFlow
        to use multithreading parallelism to improve performance at
        the cost of determinism.
    :param backend: (Union[Backend, str]) `Backend` or string name
        of backend to use to execute preprocessing / training steps.
    :param random_seed: (int: default: 42) random seed used for weights
        initialization, splits and any other random function.
    :param debug: (bool, default: `False`) if `True` turns on `tfdbg` with
        `inf_or_nan` checks.

    # Return

    :return: (List[dict]) The results for the hyperparameter optimization
    """
    backend = initialize_backend(backend)

    # check if config is a path or a dict
    if isinstance(config, str):  # assume path
        with open(config, 'r') as def_file:
            config_dict = yaml.safe_load(def_file)
    else:
        config_dict = config

    # merge config with defaults
    config = merge_with_defaults(config_dict)

    if HYPEROPT not in config:
        raise ValueError("Hyperopt Section not present in config")

    hyperopt_config = config["hyperopt"]

    update_hyperopt_params_with_defaults(hyperopt_config)

    # print hyperopt config
    logger.info(pformat(hyperopt_config, indent=4))
    logger.info('\n')

    sampler = hyperopt_config["sampler"]
    executor = hyperopt_config["executor"]
    parameters = hyperopt_config["parameters"]
    split = hyperopt_config["split"]
    output_feature = hyperopt_config["output_feature"]
    metric = hyperopt_config["metric"]
    goal = hyperopt_config["goal"]

    ######################
    # check validity of output_feature / metric/ split combination
    ######################
    if split == TRAINING:
        if training_set is None and (
                config['preprocessing']['split_probabilities'][0] <= 0):
            raise ValueError(
                'The data for the specified split for hyperopt "{}" '
                'was not provided, '
                'or the split amount specified in the preprocessing section '
                'of the config is not greater than 0'.format(split))
    elif split == VALIDATION:
        if validation_set is None and (
                config['preprocessing']['split_probabilities'][1] <= 0):
            raise ValueError(
                'The data for the specified split for hyperopt "{}" '
                'was not provided, '
                'or the split amount specified in the preprocessing section '
                'of the config is not greater than 0'.format(split))
    elif split == TEST:
        if test_set is None and (
                config['preprocessing']['split_probabilities'][2] <= 0):
            raise ValueError(
                'The data for the specified split for hyperopt "{}" '
                'was not provided, '
                'or the split amount specified in the preprocessing section '
                'of the config is not greater than 0'.format(split))
    else:
        raise ValueError('unrecognized hyperopt split "{}". '
                         'Please provide one of: {}'.format(
                             split, {TRAINING, VALIDATION, TEST}))
    if output_feature == COMBINED:
        if metric != LOSS:
            raise ValueError(
                'The only valid metric for "combined" output feature is "loss"'
            )
    else:
        output_feature_names = set(of['name']
                                   for of in config['output_features'])
        if output_feature not in output_feature_names:
            raise ValueError('The output feature specified for hyperopt "{}" '
                             'cannot be found in the config. '
                             'Available ones are: {} and "combined"'.format(
                                 output_feature, output_feature_names))

        output_feature_type = None
        for of in config['output_features']:
            if of['name'] == output_feature:
                output_feature_type = of[TYPE]
        feature_class = get_from_registry(output_feature_type,
                                          output_type_registry)
        if metric not in feature_class.metric_functions:
            # todo v0.4: allow users to also specify metrics from the overall
            #  and per-class metrics from the training stats and, in general,
            #  any postprocessed metric
            raise ValueError(
                'The specified metric for hyperopt "{}" is not a valid metric '
                'for the specified output feature "{}" of type "{}". '
                'Available metrics are: {}'.format(
                    metric, output_feature, output_feature_type,
                    feature_class.metric_functions.keys()))

    hyperopt_sampler = get_build_hyperopt_sampler(sampler[TYPE])(goal,
                                                                 parameters,
                                                                 **sampler)

    hyperopt_executor = get_build_hyperopt_executor(executor[TYPE])(
        hyperopt_sampler, output_feature, metric, split, **executor)

    hyperopt_results = hyperopt_executor.execute(
        config,
        dataset=dataset,
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        training_set_metadata=training_set_metadata,
        data_format=data_format,
        experiment_name=experiment_name,
        model_name=model_name,
        # model_load_path=None,
        # model_resume_path=None,
        skip_save_training_description=skip_save_training_description,
        skip_save_training_statistics=skip_save_training_statistics,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        skip_save_processed_input=skip_save_processed_input,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        skip_save_eval_stats=skip_save_eval_stats,
        output_directory=output_directory,
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
        backend=backend,
        random_seed=random_seed,
        debug=debug,
        **kwargs)

    if backend.is_coordinator():
        print_hyperopt_results(hyperopt_results)

        if not skip_save_hyperopt_statistics:
            if not os.path.exists(output_directory):
                os.makedirs(output_directory)

            hyperopt_stats = {
                'hyperopt_config': hyperopt_config,
                'hyperopt_results': hyperopt_results
            }

            save_hyperopt_stats(hyperopt_stats, output_directory)
            logger.info('Hyperopt stats saved to: {}'.format(output_directory))

    logger.info('Finished hyperopt')

    return hyperopt_results
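
As the code above shows, the config's hyperopt section must define sampler, executor, parameters, split, output_feature, metric and goal. A hedged sketch of such a section; the parameter name, range and the specific sampler/executor types are illustrative and depend on the Ludwig version:

# Hypothetical "hyperopt" section of the model config passed to hyperopt();
# the keys mirror what the function reads above, the concrete values are assumptions.
config["hyperopt"] = {
    "goal": "minimize",
    "output_feature": "combined",   # with "combined", only the "loss" metric is valid
    "metric": "loss",
    "split": "validation",
    "parameters": {
        "training.learning_rate": {"type": "float", "low": 0.0001, "high": 0.1},
    },
    "sampler": {"type": "random"},
    "executor": {"type": "serial"},
}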
Example #9
def cli(sys_argv):
    parser = argparse.ArgumentParser(
        description='This script preprocesses a dataset',
        prog='ludwig preprocess',
        usage='%(prog)s [options]')

    # ---------------
    # Data parameters
    # ---------------
    parser.add_argument(
        '--dataset',
        help='input data file path. '
        'If it has a split column, it will be used for splitting '
        '(0: train, 1: validation, 2: test), '
        'otherwise the dataset will be randomly split')
    parser.add_argument('--training_set', help='input train data file path')
    parser.add_argument('--validation_set',
                        help='input validation data file path')
    parser.add_argument('--test_set', help='input test data file path')

    parser.add_argument(
        '--training_set_metadata',
        help='input metadata JSON file path. An intermediate preprocessed file '
        'containing the mappings of the input file created '
        'the first time a file is used, in the same directory '
        'with the same name and a .json extension')

    parser.add_argument('--data_format',
                        help='format of the input data',
                        default='auto',
                        choices=[
                            'auto', 'csv', 'excel', 'feather', 'fwf', 'hdf5',
                            'html',
                            'tables', 'json', 'jsonl', 'parquet', 'pickle',
                            'sas', 'spss', 'stata', 'tsv'
                        ])

    # ----------------
    # Model parameters
    # ----------------
    preprocessing_def = parser.add_mutually_exclusive_group(required=True)
    preprocessing_def.add_argument(
        '-pc',
        '--preprocessing_config',
        type=yaml.safe_load,
        help='preprocessing config. '
        'Uses the same format as the config, '
        'but ignores encoder specific parameters, '
        'decoder specific parameters, combiner and training parameters')
    preprocessing_def.add_argument(
        '-pcf',
        '--preprocessing_config_file',
        dest='preprocessing_config',
        type=load_yaml,
        help='YAML file describing the preprocessing. '
        'Ignores --preprocessing_config. '
        'Uses the same format as the config, '
        'but ignores encoder specific parameters, '
        'decoder specific parameters, combiner and training parameters')

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument(
        '-rs',
        '--random_seed',
        type=int,
        default=42,
        help='a random seed that is going to be used anywhere there is a call '
        'to a random number generator: data splitting, parameter '
        'initialization and training set shuffling')
    parser.add_argument(
        "-b",
        "--backend",
        help='specifies backend to use for parallel / distributed execution, '
        'defaults to local execution or Horovod if called using horovodrun',
        choices=ALL_BACKENDS,
    )
    parser.add_argument('-dbg',
                        '--debug',
                        action='store_true',
                        default=False,
                        help='enables debugging mode')
    parser.add_argument(
        '-l',
        '--logging_level',
        default='info',
        help='the level of logging to use',
        choices=['critical', 'error', 'warning', 'info', 'debug', 'notset'])

    add_contrib_callback_args(parser)
    args = parser.parse_args(sys_argv)

    args.callbacks = args.callbacks or []
    for callback in args.callbacks:
        callback.on_cmdline('preprocess', *sys_argv)

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger('ludwig').setLevel(args.logging_level)
    global logger
    logger = logging.getLogger('ludwig.preprocess')

    args.backend = initialize_backend(args.backend)
    if args.backend.is_coordinator():
        print_ludwig('Preprocess', LUDWIG_VERSION)

    preprocess_cli(**vars(args))
Example #10
backend_config = {
    "type": "ray",
    "processor": {
        "parallelism": 6,
        "type": "dask",
    },
    "trainer": {
        "use_gpu": False,
        "num_workers": 3,
        "resources_per_worker": {
            "CPU": 2,
            "GPU": 0,
        },
    },
}
backend = initialize_backend(backend_config)
model = LudwigModel(config="./config.yaml", logging_level=logging.INFO, backend=backend)

df = adult_census_income.load(split=False)

(
    train_stats,  # dictionary containing training statistics
    preprocessed_data,  # tuple Ludwig Dataset objects of pre-processed training data
    output_directory,  # location of training results stored on disk
) = model.train(
    dataset=df,
    skip_save_processed_input=True,
)

print("contents of output directory:", output_directory)
for item in os.listdir(output_directory):
    print(item)
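
The same training call also works against the default local backend; a minimal sketch swapping the Ray backend for the local configuration used in Examples #1 and #2:

# Minimal local-backend variant of the snippet above.
backend = initialize_backend({"type": "local"})
model = LudwigModel(config="./config.yaml", logging_level=logging.INFO, backend=backend)
train_stats, preprocessed_data, output_directory = model.train(
    dataset=df,
    skip_save_processed_input=True,
)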
Example #11
def hyperopt(
    config: Union[str, dict],
    dataset: Union[str, dict, pd.DataFrame] = None,
    training_set: Union[str, dict, pd.DataFrame] = None,
    validation_set: Union[str, dict, pd.DataFrame] = None,
    test_set: Union[str, dict, pd.DataFrame] = None,
    training_set_metadata: Union[str, dict] = None,
    data_format: str = None,
    experiment_name: str = "hyperopt",
    model_name: str = "run",
    resume: Optional[bool] = None,
    skip_save_training_description: bool = False,
    skip_save_training_statistics: bool = False,
    skip_save_model: bool = False,
    skip_save_progress: bool = False,
    skip_save_log: bool = False,
    skip_save_processed_input: bool = True,
    skip_save_unprocessed_output: bool = False,
    skip_save_predictions: bool = False,
    skip_save_eval_stats: bool = False,
    skip_save_hyperopt_statistics: bool = False,
    output_directory: str = "results",
    gpus: Union[str, int, List[int]] = None,
    gpu_memory_limit: int = None,
    allow_parallel_threads: bool = True,
    callbacks: List[Callback] = None,
    backend: Union[Backend, str] = None,
    random_seed: int = default_random_seed,
    hyperopt_log_verbosity: int = 3,
    **kwargs,
) -> HyperoptResults:
    """This method performs an hyperparameter optimization.

    # Inputs

    :param config: (Union[str, dict]) config which defines
        the different parameters of the model, features, preprocessing and
        training.  If `str`, filepath to yaml configuration file.
    :param dataset: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing the entire dataset to be used in the experiment.
        If it has a split column, it will be used for splitting (0 for train,
        1 for validation, 2 for test), otherwise the dataset will be
        randomly split.
    :param training_set: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing training data.
    :param validation_set: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing validation data.
    :param test_set: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing test data.
    :param training_set_metadata: (Union[str, dict], default: `None`)
        metadata JSON file or loaded metadata. Intermediate preprocessed
        structure containing the mappings of the input dataset, created
        the first time an input file is used, in the same directory
        with the same name and a '.meta.json' extension.
    :param data_format: (str, default: `None`) format to interpret data
        sources. Will be inferred automatically if not specified.  Valid
        formats are `'auto'`, `'csv'`, `'df'`, `'dict'`, `'excel'`, `'feather'`,
        `'fwf'`, `'hdf5'` (cache file produced during previous training),
        `'html'` (file containing a single HTML `<table>`), `'json'`, `'jsonl'`,
        `'parquet'`, `'pickle'` (pickled Pandas DataFrame), `'sas'`, `'spss'`,
        `'stata'`, `'tsv'`.
    :param experiment_name: (str, default: `'hyperopt'`) name for
        the experiment.
    :param model_name: (str, default: `'run'`) name of the model that is
        being used.
    :param resume: (bool) If true, continue hyperopt from the state of the previous
        run in the output directory with the same experiment name. If false, will create
        new trials, ignoring any previous state, even if they exist in the output_directory.
        By default, will attempt to resume if there is already an existing experiment with
        the same name, and will create new trials if not.
    :param skip_save_training_description: (bool, default: `False`) disables
        saving the description JSON file.
    :param skip_save_training_statistics: (bool, default: `False`) disables
        saving training statistics JSON file.
    :param skip_save_model: (bool, default: `False`) disables
        saving model weights and hyperparameters each time the model
        improves. By default Ludwig saves model weights after each epoch
        the validation metric improves, but if the model is really big
        that can be time consuming. If you do not want to keep
        the weights and just find out what performance a model can get
        with a set of hyperparameters, use this parameter to skip it,
        but the model will not be loadable later on and the returned model
        will have the weights obtained at the end of training, instead of
        the weights of the epoch with the best validation performance.
    :param skip_save_progress: (bool, default: `False`) disables saving
        progress each epoch. By default Ludwig saves weights and stats
        after each epoch to enable resuming of training, but if
        the model is really big that can be time consuming and will use
        twice as much space. Use this parameter to skip it, but training
        cannot be resumed later on.
    :param skip_save_log: (bool, default: `False`) disables saving
        TensorBoard logs. By default Ludwig saves logs for the TensorBoard,
        but if it is not needed turning it off can slightly increase the
        overall speed.
    :param skip_save_processed_input: (bool, default: `True`) if an input
        dataset is provided it is preprocessed and cached by saving HDF5
        and JSON files to avoid running the preprocessing again. If this
        parameter is `True`, the HDF5 and JSON files are not saved.
    :param skip_save_unprocessed_output: (bool, default: `False`) by default
        predictions and their probabilities are saved in both raw
        unprocessed numpy files containing tensors and as postprocessed
        CSV files (one for each output feature). If this parameter is True,
        only the CSV ones are saved and the numpy ones are skipped.
    :param skip_save_predictions: (bool, default: `False`) skips saving test
        predictions CSV files.
    :param skip_save_eval_stats: (bool, default: `False`) skips saving test
        statistics JSON file.
    :param skip_save_hyperopt_statistics: (bool, default: `False`) skips saving
        hyperopt stats file.
    :param output_directory: (str, default: `'results'`) the directory that
        will contain the training statistics, TensorBoard logs, the saved
        model and the training progress files.
    :param gpus: (list, default: `None`) list of GPUs that are available
        for training.
    :param gpu_memory_limit: (int, default: `None`) maximum memory in MB to
        allocate per GPU device.
    :param allow_parallel_threads: (bool, default: `True`) allow TensorFlow
        to use multithreading parallelism to improve performance at
        the cost of determinism.
    :param callbacks: (list, default: `None`) a list of
        `ludwig.callbacks.Callback` objects that provide hooks into the
        Ludwig pipeline.
    :param backend: (Union[Backend, str]) `Backend` or string name
        of backend to use to execute preprocessing / training steps.
    :param random_seed: (int, default: 42) random seed used for weights
        initialization, splits and any other random function.
    :param hyperopt_log_verbosity: (int, default: 3) controls verbosity of
        Ray Tune log messages. Valid values: 0 = silent, 1 = only status updates,
        2 = status and brief trial results, 3 = status and detailed trial results.

    # Return

    :return: (HyperoptResults) results of the hyperopt run, with trials
        ordered by descending performance on the target metric.
    """
    from ludwig.hyperopt.execution import get_build_hyperopt_executor, RayTuneExecutor

    # check if config is a path or a dict
    if isinstance(config, str):  # assume path
        with open_file(config, "r") as def_file:
            config_dict = yaml.safe_load(def_file)
    else:
        config_dict = config

    # Get mapping of input/output features that don't have an encoder for shared parameters
    features_eligible_for_shared_params = {
        INPUT_FEATURES:
        get_features_eligible_for_shared_params(config_dict, INPUT_FEATURES),
        OUTPUT_FEATURES:
        get_features_eligible_for_shared_params(config_dict, OUTPUT_FEATURES),
    }

    # merge config with defaults
    config = merge_with_defaults(config_dict)

    if HYPEROPT not in config:
        raise ValueError("Hyperopt Section not present in config")

    hyperopt_config = config[HYPEROPT]

    update_hyperopt_params_with_defaults(hyperopt_config)

    # print hyperopt config
    logging.info("Hyperopt config")
    logging.info(pformat(hyperopt_config, indent=4))
    logging.info("\n")

    logging.info(
        "Features that may be updated in hyperopt trials if default parameters are specified in the search space"
    )
    logging.info(pformat(dict(features_eligible_for_shared_params), indent=4))
    logging.info("\n")

    search_alg = hyperopt_config["search_alg"]
    executor = hyperopt_config[EXECUTOR]
    parameters = hyperopt_config["parameters"]
    split = hyperopt_config["split"]
    output_feature = hyperopt_config["output_feature"]
    metric = hyperopt_config["metric"]
    goal = hyperopt_config["goal"]

    ######################
    # check validity of output_feature / metric/ split combination
    ######################
    splitter = get_splitter(**config[PREPROCESSING]["split"])
    if split == TRAINING:
        if training_set is None and not splitter.has_split(0):
            raise ValueError(
                'The data for the specified split for hyperopt "{}" '
                "was not provided, "
                "or the split amount specified in the preprocessing section "
                "of the config is not greater than 0".format(split))
    elif split == VALIDATION:
        if validation_set is None and not splitter.has_split(1):
            raise ValueError(
                'The data for the specified split for hyperopt "{}" '
                "was not provided, "
                "or the split amount specified in the preprocessing section "
                "of the config is not greater than 0".format(split))
    elif split == TEST:
        if test_set is None and not splitter.has_split(2):
            raise ValueError(
                'The data for the specified split for hyperopt "{}" '
                "was not provided, "
                "or the split amount specified in the preprocessing section "
                "of the config is not greater than 0".format(split))
    else:
        raise ValueError('unrecognized hyperopt split "{}". '
                         "Please provide one of: {}".format(
                             split, {TRAINING, VALIDATION, TEST}))
    if output_feature == COMBINED:
        if metric != LOSS:
            raise ValueError(
                'The only valid metric for "combined" output feature is "loss"'
            )
    else:
        output_feature_names = {of[NAME] for of in config[OUTPUT_FEATURES]}
        if output_feature not in output_feature_names:
            raise ValueError('The output feature specified for hyperopt "{}" '
                             "cannot be found in the config. "
                             'Available ones are: {} and "combined"'.format(
                                 output_feature, output_feature_names))

        output_feature_type = None
        for of in config[OUTPUT_FEATURES]:
            if of[NAME] == output_feature:
                output_feature_type = of[TYPE]
        feature_class = get_from_registry(output_feature_type,
                                          output_type_registry)
        if metric not in feature_class.metric_functions:
            # todo v0.4: also allow users to specify metrics from the overall
            #  and per-class metrics from the training stats and, in general,
            #  any postprocessed metric
            raise ValueError(
                'The specified metric for hyperopt "{}" is not a valid metric '
                'for the specified output feature "{}" of type "{}". '
                "Available metrics are: {}".format(
                    metric, output_feature, output_feature_type,
                    feature_class.metric_functions.keys()))

    hyperopt_executor = get_build_hyperopt_executor(executor[TYPE])(
        parameters,
        output_feature,
        metric,
        goal,
        split,
        search_alg=search_alg,
        **executor)

    # Explicitly default to a local backend to avoid picking up Ray or Horovod
    # backend from the environment.
    backend = backend or config_dict.get("backend") or "local"
    backend = initialize_backend(backend)
    if not (isinstance(backend, LocalBackend) or
            (isinstance(hyperopt_executor, RayTuneExecutor)
             and isinstance(backend, RayBackend))):
        raise ValueError(
            "Hyperopt requires using a `local` backend at this time, or "
            "`ray` backend with `ray` executor.")

    for callback in callbacks or []:
        callback.on_hyperopt_init(experiment_name)

    if not should_tune_preprocessing(config):
        # preprocessing is not being tuned, so generate it once before starting trials
        for callback in callbacks or []:
            callback.on_hyperopt_preprocessing_start(experiment_name)

        model = LudwigModel(
            config=config,
            backend=backend,
            gpus=gpus,
            gpu_memory_limit=gpu_memory_limit,
            allow_parallel_threads=allow_parallel_threads,
            callbacks=callbacks,
        )

        training_set, validation_set, test_set, training_set_metadata = model.preprocess(
            dataset=dataset,
            training_set=training_set,
            validation_set=validation_set,
            test_set=test_set,
            training_set_metadata=training_set_metadata,
            data_format=data_format,
            skip_save_processed_input=skip_save_processed_input,
            random_seed=random_seed,
        )
        dataset = None

        for callback in callbacks or []:
            callback.on_hyperopt_preprocessing_end(experiment_name)

    for callback in callbacks or []:
        callback.on_hyperopt_start(experiment_name)

    hyperopt_results = hyperopt_executor.execute(
        config,
        dataset=dataset,
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        training_set_metadata=training_set_metadata,
        data_format=data_format,
        experiment_name=experiment_name,
        model_name=model_name,
        resume=resume,
        skip_save_training_description=skip_save_training_description,
        skip_save_training_statistics=skip_save_training_statistics,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        skip_save_processed_input=skip_save_processed_input,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        skip_save_eval_stats=skip_save_eval_stats,
        output_directory=output_directory,
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
        callbacks=callbacks,
        backend=backend,
        random_seed=random_seed,
        hyperopt_log_verbosity=hyperopt_log_verbosity,
        features_eligible_for_shared_params=features_eligible_for_shared_params,
        **kwargs,
    )

    if backend.is_coordinator():
        print_hyperopt_results(hyperopt_results)

        if not skip_save_hyperopt_statistics:
            results_directory = os.path.join(output_directory, experiment_name)
            makedirs(results_directory, exist_ok=True)

            hyperopt_stats = {
                "hyperopt_config":
                hyperopt_config,
                "hyperopt_results":
                [t.to_dict() for t in hyperopt_results.ordered_trials],
            }

            save_hyperopt_stats(hyperopt_stats, results_directory)
            logging.info(f"Hyperopt stats saved to: {results_directory}")

    for callback in callbacks or []:
        callback.on_hyperopt_end(experiment_name)
        callback.on_hyperopt_finish(experiment_name)

    logging.info("Finished hyperopt")

    return hyperopt_results
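A minimal usage sketch for the function above follows. It is illustrative only: "train.csv" is a placeholder path, and the exact parameter names and search-space syntax inside the hyperopt section vary across Ludwig versions, so treat the entries as a template rather than a verified schema.

# Illustrative sketch: run hyperopt over the learning rate with the function above.
# Parameter names and search-space syntax are version dependent
# (e.g. "trainer.learning_rate" vs "training.learning_rate").
config = {
    "input_features": [{"name": "text", "type": "text"}],
    "output_features": [{"name": "label", "type": "category"}],
    "hyperopt": {
        "goal": "minimize",
        "metric": "loss",
        "output_feature": "combined",
        "split": "validation",
        "parameters": {
            "trainer.learning_rate": {"space": "loguniform", "lower": 1e-5, "upper": 1e-2},
        },
        "executor": {"type": "ray", "num_samples": 4},
        "search_alg": {"type": "variant_generator"},
    },
}

results = hyperopt(config, dataset="train.csv", output_directory="results")
print(results.ordered_trials[0])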
Example No. 12
def cli(sys_argv):
    parser = argparse.ArgumentParser(
        description="This script trains and evaluates a model", prog="ludwig experiment", usage="%(prog)s [options]"
    )

    # ----------------------------
    # Experiment naming parameters
    # ----------------------------
    parser.add_argument("--output_directory", type=str, default="results", help="directory that contains the results")
    parser.add_argument("--experiment_name", type=str, default="experiment", help="experiment name")
    parser.add_argument("--model_name", type=str, default="run", help="name for the model")

    # ---------------
    # Data parameters
    # ---------------
    parser.add_argument(
        "--dataset",
        help="input data file path. "
        "If it has a split column, it will be used for splitting "
        "(0: train, 1: validation, 2: test), "
        "otherwise the dataset will be randomly split",
    )
    parser.add_argument("--training_set", help="input train data file path")
    parser.add_argument("--validation_set", help="input validation data file path")
    parser.add_argument("--test_set", help="input test data file path")

    parser.add_argument(
        "--training_set_metadata",
        help="input metadata JSON file path. An intermediate preprocessed file "
        "containing the mappings of the input file created "
        "the first time a file is used, in the same directory "
        "with the same name and a .json extension",
    )

    parser.add_argument(
        "--data_format",
        help="format of the input data",
        default="auto",
        choices=[
            "auto",
            "csv",
            "excel",
            "feather",
            "fwf",
            "hdf5",
            "html" "tables",
            "json",
            "jsonl",
            "parquet",
            "pickle",
            "sas",
            "spss",
            "stata",
            "tsv",
        ],
    )

    parser.add_argument(
        "-es",
        "--eval_split",
        default=TEST,
        choices=[TRAINING, VALIDATION, TEST, FULL],
        help="the split to evaluate the model on",
    )

    parser.add_argument(
        "-sspi",
        "--skip_save_processed_input",
        help="skips saving intermediate HDF5 and JSON files",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "-ssuo",
        "--skip_save_unprocessed_output",
        help="skips saving intermediate NPY output files",
        action="store_true",
        default=False,
    )

    # -----------------
    # K-fold parameters
    # -----------------
    parser.add_argument(
        "-kf", "--k_fold", type=int, default=None, help="number of folds for a k-fold cross validation run "
    )
    parser.add_argument(
        "-skfsi",
        "--skip_save_k_fold_split_indices",
        action="store_true",
        default=False,
        help="disables saving indices generated to split training data set "
        "for the k-fold cross validation run, but if it is not needed "
        "turning it off can slightly increase the overall speed",
    )

    # ----------------
    # Model parameters
    # ----------------
    config = parser.add_mutually_exclusive_group(required=True)
    config.add_argument(
        "-c", "--config", type=load_config_from_str, help="JSON or YAML serialized string of the model configuration"
    )
    config.add_argument(
        "-cf",
        "--config_file",
        dest="config",
        type=load_yaml,
        help="Path to the YAML file containing the model configuration",
    )

    parser.add_argument("-mlp", "--model_load_path", help="path of a pretrained model to load as initialization")
    parser.add_argument("-mrp", "--model_resume_path", help="path of the model directory to resume training of")
    parser.add_argument(
        "-sstd",
        "--skip_save_training_description",
        action="store_true",
        default=False,
        help="disables saving the description JSON file",
    )
    parser.add_argument(
        "-ssts",
        "--skip_save_training_statistics",
        action="store_true",
        default=False,
        help="disables saving training statistics JSON file",
    )
    parser.add_argument(
        "-sstp",
        "--skip_save_predictions",
        help="skips saving test predictions CSV files",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "-sstes",
        "--skip_save_eval_stats",
        help="skips saving eval statistics JSON file",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "-ssm",
        "--skip_save_model",
        action="store_true",
        default=False,
        help="disables saving model weights and hyperparameters each time "
        "the model improves. "
        "By default Ludwig saves model weights after each epoch "
        "the validation metric imprvoes, but if the model is really big "
        "that can be time consuming. If you do not want to keep "
        "the weights and just find out what performance a model can get "
        "with a set of hyperparameters, use this parameter to skip it,"
        "but the model will not be loadable later on",
    )
    parser.add_argument(
        "-ssp",
        "--skip_save_progress",
        action="store_true",
        default=False,
        help="disables saving progress each epoch. By default Ludwig saves "
        "weights and stats after each epoch for enabling resuming "
        "of training, but if the model is really big that can be "
        "time consuming and will uses twice as much space, use "
        "this parameter to skip it, but training cannot be resumed "
        "later on",
    )
    parser.add_argument(
        "-ssl",
        "--skip_save_log",
        action="store_true",
        default=False,
        help="disables saving TensorBoard logs. By default Ludwig saves "
        "logs for the TensorBoard, but if it is not needed turning it off "
        "can slightly increase the overall speed",
    )

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument(
        "-rs",
        "--random_seed",
        type=int,
        default=42,
        help="a random seed that is going to be used anywhere there is a call "
        "to a random number generator: data splitting, parameter "
        "initialization and training set shuffling",
    )
    parser.add_argument("-g", "--gpus", nargs="+", type=int, default=None, help="list of GPUs to use")
    parser.add_argument(
        "-gml", "--gpu_memory_limit", type=int, default=None, help="maximum memory in MB to allocate per GPU device"
    )
    parser.add_argument(
        "-dpt",
        "--disable_parallel_threads",
        action="store_false",
        dest="allow_parallel_threads",
        help="disable TensorFlow from using multithreading for reproducibility",
    )
    parser.add_argument(
        "-b",
        "--backend",
        help="specifies backend to use for parallel / distributed execution, "
        "defaults to local execution or Horovod if called using horovodrun",
        choices=ALL_BACKENDS,
    )
    parser.add_argument("-dbg", "--debug", action="store_true", default=False, help="enables debugging mode")
    parser.add_argument(
        "-l",
        "--logging_level",
        default="info",
        help="the level of logging to use",
        choices=["critical", "error", "warning", "info", "debug", "notset"],
    )

    add_contrib_callback_args(parser)
    args = parser.parse_args(sys_argv)

    args.callbacks = args.callbacks or []
    for callback in args.callbacks:
        callback.on_cmdline("experiment", *sys_argv)

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger("ludwig").setLevel(args.logging_level)
    global logger
    logger = logging.getLogger("ludwig.experiment")

    args.backend = initialize_backend(args.backend or args.config.get("backend"))
    if args.backend.is_coordinator():
        print_ludwig("Experiment", LUDWIG_VERSION)

    if args.k_fold is None:
        experiment_cli(**vars(args))
    else:
        kfold_cross_validate_cli(**vars(args))
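Because the entry point above parses an explicit argument list, it can also be driven programmatically in a quick smoke test. The call below is a hypothetical invocation: the file names are placeholders and need to exist on disk.

# Hypothetical invocation of the experiment CLI defined above.
# "config.yaml" and "train.csv" are placeholder paths.
cli([
    "--config_file", "config.yaml",
    "--dataset", "train.csv",
    "--output_directory", "results",
    "--experiment_name", "demo",
])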
Example No. 13
def cli(sys_argv):
    parser = argparse.ArgumentParser(
        description="This script searches for optimal Hyperparameters",
        prog="ludwig hyperopt",
        usage="%(prog)s [options]",
    )

    # -------------------
    # Hyperopt parameters
    # -------------------
    parser.add_argument(
        "-sshs",
        "--skip_save_hyperopt_statistics",
        help="skips saving hyperopt statistics file",
        action="store_true",
        default=False,
    )

    # ----------------------------
    # Experiment naming parameters
    # ----------------------------
    parser.add_argument(
        "--output_directory",
        type=str,
        default="results",
        help="directory that contains the results",
    )
    parser.add_argument("--experiment_name",
                        type=str,
                        default="hyperopt",
                        help="experiment name")
    parser.add_argument("--model_name",
                        type=str,
                        default="run",
                        help="name for the model")

    # ---------------
    # Data parameters
    # ---------------
    parser.add_argument(
        '--dataset',
        help='input data file path. '
        'If it has a split column, it will be used for splitting '
        '(0: train, 1: validation, 2: test), '
        'otherwise the dataset will be randomly split')
    parser.add_argument('--training_set', help='input train data file path')
    parser.add_argument('--validation_set',
                        help='input validation data file path')
    parser.add_argument('--test_set', help='input test data file path')

    parser.add_argument(
        '--training_set_metadata',
        help='input metadata JSON file path. An intermediate preprocessed file '
        'containing the mappings of the input file created '
        'the first time a file is used, in the same directory '
        'with the same name and a .json extension')

    parser.add_argument('--data_format',
                        help='format of the input data',
                        default='auto',
                        choices=[
                            'auto', 'csv', 'excel', 'feather', 'fwf', 'hdf5',
                            'html', 'json', 'jsonl', 'parquet', 'pickle',
                            'sas', 'spss', 'stata', 'tsv'
                        ])

    parser.add_argument(
        "-sspi",
        "--skip_save_processed_input",
        help="skips saving intermediate HDF5 and JSON files",
        action="store_true",
        default=False,
    )

    # ----------------
    # Model parameters
    # ----------------
    config = parser.add_mutually_exclusive_group(required=True)
    config.add_argument("-c", "--config", type=yaml.safe_load, help="config")
    config.add_argument(
        "-cf",
        "--config_file",
        help="YAML file describing the model. Ignores --model_hyperparameters",
    )

    parser.add_argument(
        "-mlp",
        "--model_load_path",
        help="path of a pretrained model to load as initialization",
    )
    parser.add_argument(
        "-mrp",
        "--model_resume_path",
        help="path of the model directory to resume training of",
    )
    parser.add_argument(
        "-sstd",
        "--skip_save_training_description",
        action="store_true",
        default=False,
        help="disables saving the description JSON file",
    )
    parser.add_argument(
        "-ssts",
        "--skip_save_training_statistics",
        action="store_true",
        default=False,
        help="disables saving training statistics JSON file",
    )
    parser.add_argument(
        "-ssm",
        "--skip_save_model",
        action="store_true",
        default=False,
        help="disables saving weights each time the model improves. "
        "By default Ludwig saves  weights after each epoch "
        "the validation metric imrpvoes, but  if the model is really big "
        "that can be time consuming. If you do not want to keep "
        "the weights and just find out what performance a model can get "
        "with a set of hyperparameters, use this parameter to skip it",
    )
    parser.add_argument(
        "-ssp",
        "--skip_save_progress",
        action="store_true",
        default=False,
        help="disables saving weights after each epoch. By default ludwig saves "
        "weights after each epoch for enabling resuming of training, but "
        "if the model is really big that can be time consuming and will "
        "save twice as much space, use this parameter to skip it",
    )
    parser.add_argument(
        "-ssl",
        "--skip_save_log",
        action="store_true",
        default=False,
        help="disables saving TensorBoard logs. By default Ludwig saves "
        "logs for the TensorBoard, but if it is not needed turning it off "
        "can slightly increase the overall speed",
    )

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument(
        "-rs",
        "--random_seed",
        type=int,
        default=42,
        help="a random seed that is going to be used anywhere there is a call "
        "to a random number generator: data splitting, parameter "
        "initialization and training set shuffling",
    )
    parser.add_argument("-g",
                        "--gpus",
                        nargs="+",
                        type=int,
                        default=None,
                        help="list of gpus to use")
    parser.add_argument('-gml',
                        '--gpu_memory_limit',
                        type=int,
                        default=None,
                        help='maximum memory in MB to allocate per GPU device')
    parser.add_argument(
        "-b",
        "--backend",
        help='specifies backend to use for parallel / distributed execution, '
        'defaults to local execution or Horovod if called using horovodrun',
        choices=ALL_BACKENDS,
    )
    parser.add_argument(
        "-dbg",
        "--debug",
        action="store_true",
        default=False,
        help="enables debugging mode",
    )
    parser.add_argument(
        "-l",
        "--logging_level",
        default="info",
        help="the level of logging to use",
        choices=["critical", "error", "warning", "info", "debug", "notset"],
    )

    args = parser.parse_args(sys_argv)

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger('ludwig').setLevel(args.logging_level)
    global logger
    logger = logging.getLogger('ludwig.hyperopt')

    args.backend = initialize_backend(args.backend)
    if args.backend.is_coordinator():
        print_ludwig("Hyperopt", LUDWIG_VERSION)

    hyperopt_cli(**vars(args))
Example No. 14
def cli(sys_argv):
    parser = argparse.ArgumentParser(
        description="This script preprocess a dataset",
        prog="ludwig preprocess",
        usage="%(prog)s [options]")

    # ---------------
    # Data parameters
    # ---------------
    parser.add_argument(
        "--dataset",
        help="input data file path. "
        "If it has a split column, it will be used for splitting "
        "(0: train, 1: validation, 2: test), "
        "otherwise the dataset will be randomly split",
    )
    parser.add_argument("--training_set", help="input train data file path")
    parser.add_argument("--validation_set",
                        help="input validation data file path")
    parser.add_argument("--test_set", help="input test data file path")

    parser.add_argument(
        "--training_set_metadata",
        help="input metadata JSON file path. An intermediate preprocessed file "
        "containing the mappings of the input file created "
        "the first time a file is used, in the same directory "
        "with the same name and a .json extension",
    )

    parser.add_argument(
        "--data_format",
        help="format of the input data",
        default="auto",
        choices=[
            "auto",
            "csv",
            "excel",
            "feather",
            "fwf",
            "hdf5",
            "html"
            "tables",
            "json",
            "jsonl",
            "parquet",
            "pickle",
            "sas",
            "spss",
            "stata",
            "tsv",
        ],
    )

    # ----------------
    # Model parameters
    # ----------------
    preprocessing_def = parser.add_mutually_exclusive_group(required=True)
    preprocessing_def.add_argument(
        "-pc",
        "--preprocessing_config",
        dest="preprocessing_config",
        type=load_yaml,
        help="YAML file describing the preprocessing. "
        "Ignores --preprocessing_config."
        "Uses the same format of config, "
        "but ignores encoder specific parameters, "
        "decoder specific parameters, combiner and training parameters",
    )
    preprocessing_def.add_argument(
        "-pcs",
        "--preprocessing_config_str",
        type=yaml.safe_load,
        help="preproceesing config. "
        "Uses the same format of config, "
        "but ignores encoder specific parameters, "
        "decoder specific parameters, combiner and training parameters",
    )

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument(
        "-rs",
        "--random_seed",
        type=int,
        default=42,
        help="a random seed that is going to be used anywhere there is a call "
        "to a random number generator: data splitting, parameter "
        "initialization and training set shuffling",
    )
    parser.add_argument(
        "-b",
        "--backend",
        help="specifies backend to use for parallel / distributed execution, "
        "defaults to local execution or Horovod if called using horovodrun",
        choices=ALL_BACKENDS,
    )
    parser.add_argument(
        "-l",
        "--logging_level",
        default="info",
        help="the level of logging to use",
        choices=["critical", "error", "warning", "info", "debug", "notset"],
    )

    add_contrib_callback_args(parser)
    args = parser.parse_args(sys_argv)

    args.callbacks = args.callbacks or []
    for callback in args.callbacks:
        callback.on_cmdline("preprocess", *sys_argv)

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger("ludwig").setLevel(args.logging_level)
    global logger
    logger = logging.getLogger("ludwig.preprocess")

    args.backend = initialize_backend(args.backend)
    if args.backend.is_coordinator():
        print_ludwig("Preprocess", LUDWIG_VERSION)

    preprocess_cli(**vars(args))
Example No. 15
def cli(sys_argv):
    parser = argparse.ArgumentParser(
        description="This script loads a pretrained model "
        "and evaluates its performance by comparing"
        "its predictions with ground truth.",
        prog="ludwig evaluate",
        usage="%(prog)s [options]",
    )

    # ---------------
    # Data parameters
    # ---------------
    parser.add_argument("--dataset",
                        help="input data file path",
                        required=True)
    parser.add_argument(
        "--data_format",
        help="format of the input data",
        default="auto",
        choices=[
            "auto",
            "csv",
            "excel",
            "feather",
            "fwf",
            "hdf5",
            "html"
            "tables",
            "json",
            "jsonl",
            "parquet",
            "pickle",
            "sas",
            "spss",
            "stata",
            "tsv",
        ],
    )
    parser.add_argument("-s",
                        "--split",
                        default=FULL,
                        choices=[TRAINING, VALIDATION, TEST, FULL],
                        help="the split to test the model on")

    # ----------------
    # Model parameters
    # ----------------
    parser.add_argument("-m",
                        "--model_path",
                        help="model to load",
                        required=True)

    # -------------------------
    # Output results parameters
    # -------------------------
    parser.add_argument("-od",
                        "--output_directory",
                        type=str,
                        default="results",
                        help="directory that contains the results")
    parser.add_argument(
        "-ssuo",
        "--skip_save_unprocessed_output",
        help="skips saving intermediate NPY output files",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "-sses",
        "--skip_save_eval_stats",
        help="skips saving intermediate JSON eval statistics",
        action="store_true",
        default=False,
    )
    parser.add_argument("-scp",
                        "--skip_collect_predictions",
                        help="skips collecting predictions",
                        action="store_true",
                        default=False)
    parser.add_argument(
        "-scos",
        "--skip_collect_overall_stats",
        help="skips collecting overall stats",
        action="store_true",
        default=False,
    )

    # ------------------
    # Generic parameters
    # ------------------
    parser.add_argument("-bs",
                        "--batch_size",
                        type=int,
                        default=128,
                        help="size of batches")

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument("-g",
                        "--gpus",
                        type=int,
                        default=0,
                        help="list of gpu to use")
    parser.add_argument("-gml",
                        "--gpu_memory_limit",
                        type=int,
                        default=None,
                        help="maximum memory in MB to allocate per GPU device")
    parser.add_argument(
        "-dpt",
        "--disable_parallel_threads",
        action="store_false",
        dest="allow_parallel_threads",
        help="disable TensorFlow from using multithreading for reproducibility",
    )
    parser.add_argument(
        "-b",
        "--backend",
        help="specifies backend to use for parallel / distributed execution, "
        "defaults to local execution or Horovod if called using horovodrun",
        choices=ALL_BACKENDS,
    )
    parser.add_argument("-dbg",
                        "--debug",
                        action="store_true",
                        default=False,
                        help="enables debugging mode")
    parser.add_argument(
        "-l",
        "--logging_level",
        default="info",
        help="the level of logging to use",
        choices=["critical", "error", "warning", "info", "debug", "notset"],
    )

    add_contrib_callback_args(parser)
    args = parser.parse_args(sys_argv)
    args.evaluate_performance = True

    args.callbacks = args.callbacks or []
    for callback in args.callbacks:
        callback.on_cmdline("evaluate", *sys_argv)

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger("ludwig").setLevel(args.logging_level)
    global logger
    logger = logging.getLogger("ludwig.test_performance")

    args.backend = initialize_backend(args.backend)
    if args.backend.is_coordinator():
        print_ludwig("Evaluate", LUDWIG_VERSION)
        logger.info(f"Dataset path: {args.dataset}")
        logger.info(f"Model path: {args.model_path}")
        logger.info("")

    evaluate_cli(**vars(args))
Example No. 16
    def execute(
        self,
        config,
        dataset=None,
        training_set=None,
        validation_set=None,
        test_set=None,
        training_set_metadata=None,
        data_format=None,
        experiment_name="hyperopt",
        model_name="run",
        # model_load_path=None,
        # model_resume_path=None,
        skip_save_training_description=False,
        skip_save_training_statistics=False,
        skip_save_model=False,
        skip_save_progress=False,
        skip_save_log=False,
        skip_save_processed_input=True,
        skip_save_unprocessed_output=False,
        skip_save_predictions=False,
        skip_save_eval_stats=False,
        output_directory="results",
        gpus=None,
        gpu_memory_limit=None,
        allow_parallel_threads=True,
        callbacks=None,
        backend=None,
        random_seed=default_random_seed,
        debug=False,
        **kwargs,
    ) -> RayTuneResults:
        if isinstance(dataset, str) and not has_remote_protocol(dataset) and not os.path.isabs(dataset):
            dataset = os.path.abspath(dataset)

        if isinstance(backend, str):
            backend = initialize_backend(backend)

        if gpus is not None:
            raise ValueError(
                "Parameter `gpus` is not supported when using Ray Tune. "
                "Configure GPU resources with Ray and set `gpu_resources_per_trial` in your "
                "hyperopt config."
            )

        if gpu_memory_limit is None and 0 < self._gpu_resources_per_trial_non_none < 1:
            # Enforce fractional GPU utilization
            gpu_memory_limit = self.gpu_resources_per_trial

        hyperopt_dict = dict(
            config=config,
            dataset=dataset,
            training_set=training_set,
            validation_set=validation_set,
            test_set=test_set,
            training_set_metadata=training_set_metadata,
            data_format=data_format,
            experiment_name=experiment_name,
            model_name=model_name,
            # model_load_path=model_load_path,
            # model_resume_path=model_resume_path,
            eval_split=self.split,
            skip_save_training_description=skip_save_training_description,
            skip_save_training_statistics=skip_save_training_statistics,
            skip_save_model=skip_save_model,
            skip_save_progress=skip_save_progress,
            skip_save_log=skip_save_log,
            skip_save_processed_input=skip_save_processed_input,
            skip_save_unprocessed_output=skip_save_unprocessed_output,
            skip_save_predictions=skip_save_predictions,
            skip_save_eval_stats=skip_save_eval_stats,
            output_directory=output_directory,
            gpus=gpus,
            gpu_memory_limit=gpu_memory_limit,
            allow_parallel_threads=allow_parallel_threads,
            callbacks=callbacks,
            backend=backend,
            random_seed=random_seed,
            debug=debug,
        )

        mode = "min" if self.goal != MAXIMIZE else "max"
        metric = "metric_score"
        if self.search_alg_dict is not None:
            if TYPE not in self.search_alg_dict:
                logger.warning("WARNING: Kindly set type param for search_alg " "to utilize Tune's Search Algorithms.")
                search_alg = None
            else:
                search_alg_type = self.search_alg_dict[TYPE]
                search_alg = tune.create_searcher(search_alg_type, metric=metric, mode=mode, **self.search_alg_dict)
        else:
            search_alg = None

        if self.max_concurrent_trials:
            assert (
                self.max_concurrent_trials > 0
            ), f"`max_concurrent_trials` must be greater than 0, got {self.max_concurrent_trials}"
            if isinstance(search_alg, BasicVariantGenerator) or search_alg is None:
                search_alg = BasicVariantGenerator(max_concurrent=self.max_concurrent_trials)
            elif isinstance(search_alg, ConcurrencyLimiter):
                raise ValueError(
                    "You have specified `max_concurrent_trials`, but the search "
                    "algorithm is already a `ConcurrencyLimiter`. FIX THIS "
                    "by setting `max_concurrent_trials=None`."
                )
            else:
                search_alg = ConcurrencyLimiter(search_alg, max_concurrent=self.max_concurrent_trials)

        resources_per_trial = {
            "cpu": self._cpu_resources_per_trial_non_none,
            "gpu": self._gpu_resources_per_trial_non_none,
        }

        def run_experiment_trial(config, local_hyperopt_dict, checkpoint_dir=None):
            return self._run_experiment(
                config, checkpoint_dir, local_hyperopt_dict, self.decode_ctx, _is_ray_backend(backend)
            )

        tune_config = {}
        tune_callbacks = []
        for callback in callbacks or []:
            run_experiment_trial, tune_config = callback.prepare_ray_tune(
                run_experiment_trial,
                tune_config,
                tune_callbacks,
            )

        if _is_ray_backend(backend):
            # we can't set Trial actor's CPUs to 0 so we just go very low
            resources_per_trial = PlacementGroupFactory(
                [{"CPU": 0.001}] + ([{"CPU": 1, "GPU": 1}] * self._gpu_resources_per_trial_non_none)
                if self._gpu_resources_per_trial_non_none
                else [{"CPU": 0.001}] + [{"CPU": 1}] * self._cpu_resources_per_trial_non_none
            )

        if has_remote_protocol(output_directory):
            run_experiment_trial = tune.durable(run_experiment_trial)
            self.sync_config = tune.SyncConfig(sync_to_driver=False, upload_dir=output_directory)
            output_directory = None
        elif self.kubernetes_namespace:
            from ray.tune.integration.kubernetes import NamespacedKubernetesSyncer

            self.sync_config = tune.SyncConfig(sync_to_driver=NamespacedKubernetesSyncer(self.kubernetes_namespace))

        run_experiment_trial_params = tune.with_parameters(run_experiment_trial, local_hyperopt_dict=hyperopt_dict)
        register_trainable(f"trainable_func_f{hash_dict(config).decode('ascii')}", run_experiment_trial_params)

        analysis = tune.run(
            f"trainable_func_f{hash_dict(config).decode('ascii')}",
            config={
                **self.search_space,
                **tune_config,
            },
            scheduler=self.scheduler,
            search_alg=search_alg,
            num_samples=self.num_samples,
            keep_checkpoints_num=1,
            max_failures=1,  # retry a trial failure once
            resources_per_trial=resources_per_trial,
            time_budget_s=self.time_budget_s,
            sync_config=self.sync_config,
            local_dir=output_directory,
            metric=metric,
            mode=mode,
            trial_name_creator=lambda trial: f"trial_{trial.trial_id}",
            trial_dirname_creator=lambda trial: f"trial_{trial.trial_id}",
            callbacks=tune_callbacks,
        )

        if "metric_score" in analysis.results_df.columns:
            ordered_trials = analysis.results_df.sort_values("metric_score", ascending=self.goal != MAXIMIZE)

            # Catch nans in edge case where the trial doesn't complete
            temp_ordered_trials = []
            for kwargs in ordered_trials.to_dict(orient="records"):
                for key in ["parameters", "training_stats", "eval_stats"]:
                    if isinstance(kwargs[key], float):
                        kwargs[key] = {}
                temp_ordered_trials.append(kwargs)

            # Trials w/empty eval_stats fields & non-empty training_stats fields ran intermediate
            # tune.report call(s) but were terminated before reporting eval_stats from post-train
            # evaluation (e.g., trial stopped due to time budget or relatively poor performance.)
            # For any such trials, run model evaluation for the best model in that trial & record
            # results in ordered_trials which is returned & is persisted in hyperopt_statistics.json.
            for trial in temp_ordered_trials:
                if trial["eval_stats"] == "{}" and trial["training_stats"] != "{}":
                    # Evaluate the best model on the eval_split, which is validation_set
                    if validation_set is not None and validation_set.size > 0:
                        trial_path = trial["trial_dir"]
                        best_model_path = self._get_best_model_path(trial_path, analysis)
                        if best_model_path is not None:
                            self._evaluate_best_model(
                                trial,
                                trial_path,
                                best_model_path,
                                validation_set,
                                data_format,
                                skip_save_unprocessed_output,
                                skip_save_predictions,
                                skip_save_eval_stats,
                                gpus,
                                gpu_memory_limit,
                                allow_parallel_threads,
                                backend,
                                debug,
                            )
                        else:
                            logger.warning("Skipping evaluation as no model checkpoints were available")
                    else:
                        logger.warning("Skipping evaluation as no validation set was provided")

            ordered_trials = [TrialResults.from_dict(load_json_values(kwargs)) for kwargs in temp_ordered_trials]
        else:
            logger.warning("No trials reported results; check if time budget lower than epoch latency")
            ordered_trials = []

        return RayTuneResults(ordered_trials=ordered_trials, experiment_analysis=analysis)
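For context on the `max_concurrent_trials` handling in the executor above: the default variant generator takes the limit directly, while any named searcher is wrapped in a `ConcurrencyLimiter`. The sketch below mirrors that logic with Ray Tune's public classes, assuming Ray 1.x import paths; `HyperOptSearch` is only an example searcher and requires the optional hyperopt package.

# Standalone sketch of the concurrency-limiting logic above (assumes Ray 1.x).
from ray.tune.suggest import ConcurrencyLimiter
from ray.tune.suggest.basic_variant import BasicVariantGenerator
from ray.tune.suggest.hyperopt import HyperOptSearch

max_concurrent_trials = 2

# Default variant generator: pass the limit directly.
default_search = BasicVariantGenerator(max_concurrent=max_concurrent_trials)

# Named searcher: wrap it in a ConcurrencyLimiter instead.
named_search = ConcurrencyLimiter(
    HyperOptSearch(metric="metric_score", mode="min"),
    max_concurrent=max_concurrent_trials,
)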
Example No. 17
    def execute(
        self,
        config,
        dataset=None,
        training_set=None,
        validation_set=None,
        test_set=None,
        training_set_metadata=None,
        data_format=None,
        experiment_name="hyperopt",
        model_name="run",
        resume=None,
        skip_save_training_description=False,
        skip_save_training_statistics=False,
        skip_save_model=False,
        skip_save_progress=False,
        skip_save_log=False,
        skip_save_processed_input=True,
        skip_save_unprocessed_output=False,
        skip_save_predictions=False,
        skip_save_eval_stats=False,
        output_directory="results",
        gpus=None,
        gpu_memory_limit=None,
        allow_parallel_threads=True,
        callbacks=None,
        backend=None,
        random_seed=default_random_seed,
        debug=False,
        hyperopt_log_verbosity=3,
        features_eligible_for_shared_params=None,
        **kwargs,
    ) -> RayTuneResults:
        if isinstance(dataset, str) and not has_remote_protocol(
                dataset) and not os.path.isabs(dataset):
            dataset = os.path.abspath(dataset)

        if isinstance(backend, str):
            backend = initialize_backend(backend)

        if gpus is not None:
            raise ValueError(
                "Parameter `gpus` is not supported when using Ray Tune. "
                "Configure GPU resources with Ray and set `gpu_resources_per_trial` in your "
                "hyperopt config.")

        if gpu_memory_limit is None and 0 < self._gpu_resources_per_trial_non_none < 1:
            # Enforce fractional GPU utilization
            gpu_memory_limit = self.gpu_resources_per_trial

        hyperopt_dict = dict(
            config=config,
            dataset=dataset,
            training_set=training_set,
            validation_set=validation_set,
            test_set=test_set,
            training_set_metadata=training_set_metadata,
            data_format=data_format,
            experiment_name=experiment_name,
            model_name=model_name,
            eval_split=self.split,
            skip_save_training_description=skip_save_training_description,
            skip_save_training_statistics=skip_save_training_statistics,
            skip_save_model=skip_save_model,
            skip_save_progress=skip_save_progress,
            skip_save_log=skip_save_log,
            skip_save_processed_input=skip_save_processed_input,
            skip_save_unprocessed_output=skip_save_unprocessed_output,
            skip_save_predictions=skip_save_predictions,
            skip_save_eval_stats=skip_save_eval_stats,
            output_directory=output_directory,
            gpus=gpus,
            gpu_memory_limit=gpu_memory_limit,
            allow_parallel_threads=allow_parallel_threads,
            callbacks=callbacks,
            backend=backend,
            random_seed=random_seed,
            debug=debug,
        )

        mode = "min" if self.goal != MAXIMIZE else "max"
        metric = "metric_score"
        # if random seed not set, use Ludwig seed
        self.search_algorithm.check_for_random_seed(random_seed)
        if self.search_algorithm.search_alg_dict is not None:
            if TYPE not in self.search_algorithm.search_alg_dict:
                candidate_search_algs = list(SEARCH_ALG_IMPORT.keys())
                logger.warning(
                    "WARNING: search_alg type parameter missing, using 'variant_generator' as default. "
                    f"These are possible values for the type parameter: {candidate_search_algs}."
                )
                search_alg = None
            else:
                search_alg_type = self.search_algorithm.search_alg_dict[TYPE]
                search_alg = tune.create_searcher(
                    search_alg_type,
                    metric=metric,
                    mode=mode,
                    **self.search_algorithm.search_alg_dict)
        else:
            search_alg = None

        if self.max_concurrent_trials:
            assert (
                self.max_concurrent_trials > 0
            ), f"`max_concurrent_trials` must be greater than 0, got {self.max_concurrent_trials}"
            if isinstance(search_alg,
                          BasicVariantGenerator) or search_alg is None:
                search_alg = BasicVariantGenerator(
                    max_concurrent=self.max_concurrent_trials)
            elif isinstance(search_alg, ConcurrencyLimiter):
                raise ValueError(
                    "You have specified `max_concurrent_trials`, but the search "
                    "algorithm is already a `ConcurrencyLimiter`. FIX THIS "
                    "by setting `max_concurrent_trials=None`.")
            else:
                search_alg = ConcurrencyLimiter(
                    search_alg, max_concurrent=self.max_concurrent_trials)

        resources_per_trial = {
            "cpu": self._cpu_resources_per_trial_non_none,
            "gpu": self._gpu_resources_per_trial_non_none,
        }

        def run_experiment_trial(config,
                                 local_hyperopt_dict,
                                 checkpoint_dir=None):
            return self._run_experiment(
                config,
                checkpoint_dir,
                local_hyperopt_dict,
                self.decode_ctx,
                features_eligible_for_shared_params,
                _is_ray_backend(backend),
            )

        tune_config = {}
        tune_callbacks = []
        for callback in callbacks or []:
            run_experiment_trial, tune_config = callback.prepare_ray_tune(
                run_experiment_trial,
                tune_config,
                tune_callbacks,
            )

        if _is_ray_backend(backend):
            # for now, we do not do distributed training on cpu (until spread scheduling is implemented for Ray Train)
            # but we do want to enable it when GPUs are specified
            resources_per_trial = PlacementGroupFactory(
                [{}] + [{"CPU": 0, "GPU": 1}] * self._gpu_resources_per_trial_non_none
                if self._gpu_resources_per_trial_non_none
                else [{}] + [{"CPU": self._cpu_resources_per_trial_non_none}]
            )

        if has_remote_protocol(output_directory):
            run_experiment_trial = tune.durable(run_experiment_trial)
            self.sync_config = tune.SyncConfig(sync_to_driver=False,
                                               upload_dir=output_directory)
            if _ray_114:
                self.sync_client = get_node_to_storage_syncer(
                    SyncConfig(upload_dir=output_directory))
            else:
                self.sync_client = get_cloud_sync_client(output_directory)
            output_directory = None
        elif self.kubernetes_namespace:
            from ray.tune.integration.kubernetes import KubernetesSyncClient, NamespacedKubernetesSyncer

            self.sync_config = tune.SyncConfig(
                sync_to_driver=NamespacedKubernetesSyncer(
                    self.kubernetes_namespace))
            self.sync_client = KubernetesSyncClient(self.kubernetes_namespace)

        run_experiment_trial_params = tune.with_parameters(
            run_experiment_trial, local_hyperopt_dict=hyperopt_dict)
        register_trainable(
            f"trainable_func_f{hash_dict(config).decode('ascii')}",
            run_experiment_trial_params)

        # Note that resume="AUTO" will attempt to resume the experiment if possible, and
        # otherwise will start a new experiment:
        # https://docs.ray.io/en/latest/tune/tutorials/tune-stopping.html
        should_resume = "AUTO" if resume is None else resume

        try:
            analysis = tune.run(
                f"trainable_func_f{hash_dict(config).decode('ascii')}",
                name=experiment_name,
                config={
                    **self.search_space,
                    **tune_config,
                },
                scheduler=self.scheduler,
                search_alg=search_alg,
                num_samples=self.num_samples,
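                # keep only the most recent checkpoint per trial to limit disk usage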
                keep_checkpoints_num=1,
                max_failures=1,  # retry a trial failure once
                resources_per_trial=resources_per_trial,
                time_budget_s=self.time_budget_s,
                sync_config=self.sync_config,
                local_dir=output_directory,
                metric=metric,
                mode=mode,
                trial_name_creator=lambda trial: f"trial_{trial.trial_id}",
                trial_dirname_creator=lambda trial: f"trial_{trial.trial_id}",
                callbacks=tune_callbacks,
                stop=CallbackStopper(callbacks),
                verbose=hyperopt_log_verbosity,
                resume=should_resume,
                log_to_file=True,
            )
        except Exception as e:
            # Explicitly raise a RuntimeError if an error is encountered during a Ray trial.
            # NOTE: Cascading the exception with "raise _ from e" still results in hanging.
            raise RuntimeError(f"Encountered Ray Tune error: {e}")

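        # results_df only contains a "metric_score" column if at least one trial reported results.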
        if "metric_score" in analysis.results_df.columns:
            ordered_trials = analysis.results_df.sort_values(
                "metric_score", ascending=self.goal != MAXIMIZE)

            # Catch nans in edge case where the trial doesn't complete
            temp_ordered_trials = []
            for kwargs in ordered_trials.to_dict(orient="records"):
                for key in ["parameters", "training_stats", "eval_stats"]:
                    if isinstance(kwargs[key], float):
                        kwargs[key] = {}
                temp_ordered_trials.append(kwargs)

            # Trials with empty eval_stats fields and non-empty training_stats fields ran intermediate
            # tune.report call(s) but were terminated before reporting eval_stats from post-training
            # evaluation (e.g., the trial was stopped due to the time budget or relatively poor
            # performance). For any such trial, run model evaluation for the best model in that trial
            # and record the results in ordered_trials, which is returned and persisted in
            # hyperopt_statistics.json.
            for trial in temp_ordered_trials:
                if trial["eval_stats"] == "{}" and trial["training_stats"] != "{}":
                    # Evaluate the best model on the eval_split, which is validation_set
                    if validation_set is not None and validation_set.size > 0:
                        trial_path = trial["trial_dir"]
                        best_model_path = self._get_best_model_path(
                            trial_path, analysis)
                        if best_model_path is not None:
                            self._evaluate_best_model(
                                trial,
                                trial_path,
                                best_model_path,
                                validation_set,
                                data_format,
                                skip_save_unprocessed_output,
                                skip_save_predictions,
                                skip_save_eval_stats,
                                gpus,
                                gpu_memory_limit,
                                allow_parallel_threads,
                                backend,
                                debug,
                            )
                        else:
                            logger.warning(
                                "Skipping evaluation as no model checkpoints were available"
                            )
                    else:
                        logger.warning(
                            "Skipping evaluation as no validation set was provided"
                        )

            ordered_trials = [
                TrialResults.from_dict(load_json_values(kwargs))
                for kwargs in temp_ordered_trials
            ]
        else:
            logger.warning(
                "No trials reported results; check if time budget lower than epoch latency"
            )
            ordered_trials = []

        return RayTuneResults(ordered_trials=ordered_trials,
                              experiment_analysis=analysis)
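The placement-group bundles built for Ray trials above follow a simple pattern that is easy to verify in isolation. The sketch below is illustrative only: build_trial_bundles is a hypothetical helper, not part of Ludwig or Ray, and it mirrors the bundle-list construction rather than the full PlacementGroupFactory call.

def build_trial_bundles(num_cpus, num_gpus):
    # The leading empty bundle leaves the trial runner itself resource-free;
    # the remaining bundles describe what each training worker requests.
    if num_gpus:
        # One single-GPU bundle per requested GPU; CPU is left at 0, matching the
        # choice above to reserve distributed training for GPU trials.
        return [{}] + [{"CPU": 0, "GPU": 1}] * num_gpus
    # CPU-only trials use a single bundle with the requested number of CPUs.
    return [{}] + [{"CPU": num_cpus}]


assert build_trial_bundles(num_cpus=4, num_gpus=2) == [
    {}, {"CPU": 0, "GPU": 1}, {"CPU": 0, "GPU": 1}
]
assert build_trial_bundles(num_cpus=4, num_gpus=0) == [{}, {"CPU": 4}]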