Code example #1
def experiment_cli(model_definition,
                   model_definition_file=None,
                   dataset=None,
                   training_set=None,
                   validation_set=None,
                   test_set=None,
                   data_format=None,
                   training_set_metadata=None,
                   experiment_name='experiment',
                   model_name='run',
                   model_load_path=None,
                   model_resume_path=None,
                   skip_save_training_description=False,
                   skip_save_training_statistics=False,
                   skip_save_model=False,
                   skip_save_progress=False,
                   skip_save_log=False,
                   skip_save_processed_input=False,
                   skip_save_unprocessed_output=False,
                   skip_save_predictions=False,
                   skip_save_eval_stats=False,
                   skip_collect_predictions=False,
                   skip_collect_overall_stats=False,
                   output_directory='results',
                   gpus=None,
                   gpu_memory_limit=None,
                   allow_parallel_threads=True,
                   use_horovod=None,
                   random_seed=default_random_seed,
                   debug=False,
                   logging_level=logging.INFO,
                   **kwargs):
    """Trains a model on a dataset's training and validation splits and
    uses it to predict on the test split.
    It saves the trained model and the statistics of training and testing.

    :param model_definition: Model definition which defines the different
           parameters of the model, features, preprocessing and training.
    :type model_definition: Dictionary
    :param model_definition_file: The YAML file that specifies
           the model definition.
    :type model_definition_file: filepath (str)
    :param dataset: Source containing the entire dataset.
           If it has a split column, it will be used for splitting (0: train,
           1: validation, 2: test), otherwise the dataset will be randomly split.
    :type dataset: Str, Dictionary, DataFrame
    :param training_set: Source containing training data.
    :type training_set: Str, Dictionary, DataFrame
    :param validation_set: Source containing validation data.
    :type validation_set: Str, Dictionary, DataFrame
    :param test_set: Source containing test data.
    :type test_set: Str, Dictionary, DataFrame
    :param training_set_metadata: Metadata JSON file or loaded metadata.
           Intermediate preprocessing structure containing the mappings of
           the input CSV, created the first time a CSV file is used, in the
           same directory with the same name and a '.json' extension.
    :type training_set_metadata: Str, Dictionary
    :param data_format: Format to interpret data sources. Will be inferred
           automatically if not specified.
    :type data_format: Str
    :param experiment_name: The name for the experiment.
    :type experiment_name: Str
    :param model_name: Name of the model that is being used.
    :type model_name: Str
    :param model_load_path: If specified, the model at this path is loaded
           and used as initialization (useful for transfer learning).
    :type model_load_path: filepath (str)
    :param model_resume_path: Resumes training of the model from the path
           specified. Unlike model_load_path, training statistics such as
           the current epoch and the loss and performance so far are also
           restored, effectively continuing a previously interrupted
           training process.
    :type model_resume_path: filepath (str)
    :param skip_save_training_description: Disables saving
           the description JSON file.
    :type skip_save_training_description: Boolean
    :param skip_save_training_statistics: Disables saving
           training statistics JSON file.
    :type skip_save_training_statistics: Boolean
    :param skip_save_model: Disables saving model weights and
           hyperparameters each time the model improves. By default Ludwig
           saves model weights after each epoch the validation metric
           improves, but if the model is really big that can be time
           consuming. If you do not want to keep the weights and just want
           to find out what performance a model can get with a set of
           hyperparameters, use this parameter to skip it, but the model
           will not be loadable later on.
    :type skip_save_model: Boolean
    :param skip_save_progress: Disables saving
           progress each epoch. By default Ludwig saves weights and stats
           after each epoch to enable resuming of training, but if
           the model is really big that can be time consuming and will use
           twice as much space. Use this parameter to skip it, but training
           cannot be resumed later on.
    :type skip_save_progress: Boolean
    :param skip_save_log: Disables saving TensorBoard
           logs. By default Ludwig saves TensorBoard logs, but if they are
           not needed turning them off can slightly increase the
           overall speed.
    :type skip_save_log: Boolean
    :param skip_save_processed_input: If a CSV dataset is provided it is
           preprocessed and then saved as HDF5 and JSON files to avoid
           running the preprocessing again. If this parameter is True,
           the HDF5 and JSON files are not saved.
    :type skip_save_processed_input: Boolean
    :param skip_save_unprocessed_output: By default predictions and
           their probabilities are saved in both raw unprocessed numpy files
           containing tensors and as postprocessed CSV files
           (one for each output feature). If this parameter is True,
           only the CSV ones are saved and the numpy ones are skipped.
    :type skip_save_unprocessed_output: Boolean
    :param skip_save_predictions: skips saving test predictions CSV files
    :type skip_save_predictions: Boolean
    :param skip_save_eval_stats: skips saving test statistics JSON file
    :type skip_save_eval_stats: Boolean
    :param skip_collect_predictions: skips collecting post-processed predictions during eval.
    :type skip_collect_predictions: Boolean
    :param skip_collect_overall_stats: skips collecting overall stats during eval.
    :type skip_collect_overall_stats: Boolean
    :param output_directory: The directory that will contain the training
           statistics, the saved model and the training progress files.
    :type output_directory: filepath (str)
    :param gpus: List of GPUs that are available for training.
    :type gpus: List
    :param gpu_memory_limit: maximum memory in MB to allocate per GPU device.
    :type gpu_memory_limit: Integer
    :param allow_parallel_threads: allow TensorFlow to use multithreading parallelism
           to improve performance at the cost of determinism.
    :type allow_parallel_threads: Boolean
    :param use_horovod: Flag for using Horovod for distributed training.
    :type use_horovod: Boolean
    :param random_seed: Random seed used for weights initialization,
           splits and any other random function.
    :type random_seed: Integer
    :param debug: If True, turns on tfdbg with inf_or_nan checks.
    :type debug: Boolean
    :param logging_level: Log level to send to stderr.
    :type logging_level: int
    """
    set_on_master(use_horovod)

    model_definition = check_which_model_definition(model_definition,
                                                    model_definition_file)

    if model_load_path:
        model = LudwigModel.load(model_load_path)
    else:
        model = LudwigModel(
            model_definition=model_definition,
            logging_level=logging_level,
            use_horovod=use_horovod,
            gpus=gpus,
            gpu_memory_limit=gpu_memory_limit,
            allow_parallel_threads=allow_parallel_threads,
        )
    (test_results, train_stats, preprocessed_data,
     output_directory) = model.experiment(
         dataset=dataset,
         training_set=training_set,
         validation_set=validation_set,
         test_set=test_set,
         training_set_metadata=training_set_metadata,
         data_format=data_format,
         experiment_name=experiment_name,
         model_name=model_name,
         model_resume_path=model_resume_path,
         skip_save_training_description=skip_save_training_description,
         skip_save_training_statistics=skip_save_training_statistics,
         skip_save_model=skip_save_model,
         skip_save_progress=skip_save_progress,
         skip_save_log=skip_save_log,
         skip_save_processed_input=skip_save_processed_input,
         skip_save_unprocessed_output=skip_save_unprocessed_output,
         skip_save_predictions=skip_save_predictions,
         skip_save_eval_stats=skip_save_eval_stats,
         skip_collect_predictions=skip_collect_predictions,
         skip_collect_overall_stats=skip_collect_overall_stats,
         output_directory=output_directory,
         random_seed=random_seed,
         debug=debug,
     )

    return model, test_results, train_stats, preprocessed_data, output_directory
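
A minimal usage sketch of experiment_cli follows; the feature names and the 'my_data.csv' path are illustrative placeholders, not part of the Ludwig API:

# Hypothetical usage sketch; feature names and 'my_data.csv' are placeholders.
model_definition = {
    'input_features': [{'name': 'text', 'type': 'text'}],
    'output_features': [{'name': 'label', 'type': 'category'}],
}
(model, test_results, train_stats,
 preprocessed_data, output_dir) = experiment_cli(
    model_definition,
    dataset='my_data.csv',       # a split column (0/1/2) is used if present
    output_directory='results',
)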
Code example #2
File: hyperopt_cli.py  Project: zanussbaum/ludwig
def cli(sys_argv):
    parser = argparse.ArgumentParser(
        description="This script searches for optimal Hyperparameters",
        prog="ludwig hyperopt",
        usage="%(prog)s [options]",
    )

    # -------------------
    # Hyperopt parameters
    # -------------------
    parser.add_argument(
        "-sshs",
        "--skip_save_hyperopt_statistics",
        help="skips saving hyperopt statistics file",
        action="store_true",
        default=False,
    )

    # ----------------------------
    # Experiment naming parameters
    # ----------------------------
    parser.add_argument(
        "--output_directory",
        type=str,
        default="results",
        help="directory that contains the results",
    )
    parser.add_argument("--experiment_name",
                        type=str,
                        default="hyperopt",
                        help="experiment name")
    parser.add_argument("--model_name",
                        type=str,
                        default="run",
                        help="name for the model")

    # ---------------
    # Data parameters
    # ---------------
    parser.add_argument(
        '--dataset',
        help='input data file path. '
        'If it has a split column, it will be used for splitting '
        '(0: train, 1: validation, 2: test), '
        'otherwise the dataset will be randomly split')
    parser.add_argument('--training_set', help='input train data file path')
    parser.add_argument('--validation_set',
                        help='input validation data file path')
    parser.add_argument('--test_set', help='input test data file path')

    parser.add_argument(
        '--training_set_metadata',
        help='input metadata JSON file path. An intermediate preprocessed file '
        'containing the mappings of the input file created '
        'the first time a file is used, in the same directory '
        'with the same name and a .json extension')

    parser.add_argument('--data_format',
                        help='format of the input data',
                        default='auto',
                        choices=[
                            'auto', 'csv', 'excel', 'feather', 'fwf', 'hdf5',
                            'html', 'json', 'jsonl', 'parquet', 'pickle',
                            'sas', 'spss', 'stata', 'tsv'
                        ])

    parser.add_argument(
        "-sspi",
        "--skip_save_processed_input",
        help="skips saving intermediate HDF5 and JSON files",
        action="store_true",
        default=False,
    )

    # ----------------
    # Model parameters
    # ----------------
    config = parser.add_mutually_exclusive_group(required=True)
    config.add_argument("-c", "--config", type=yaml.safe_load, help="config")
    config.add_argument(
        "-cf",
        "--config_file",
        help="YAML file describing the model. Ignores --model_hyperparameters",
    )

    parser.add_argument(
        "-mlp",
        "--model_load_path",
        help="path of a pretrained model to load as initialization",
    )
    parser.add_argument(
        "-mrp",
        "--model_resume_path",
        help="path of the model directory to resume training of",
    )
    parser.add_argument(
        "-sstd",
        "--skip_save_training_description",
        action="store_true",
        default=False,
        help="disables saving the description JSON file",
    )
    parser.add_argument(
        "-ssts",
        "--skip_save_training_statistics",
        action="store_true",
        default=False,
        help="disables saving training statistics JSON file",
    )
    parser.add_argument(
        "-ssm",
        "--skip_save_model",
        action="store_true",
        default=False,
        help="disables saving weights each time the model improves. "
        "By default Ludwig saves  weights after each epoch "
        "the validation metric imrpvoes, but  if the model is really big "
        "that can be time consuming. If you do not want to keep "
        "the weights and just find out what performance a model can get "
        "with a set of hyperparameters, use this parameter to skip it",
    )
    parser.add_argument(
        "-ssp",
        "--skip_save_progress",
        action="store_true",
        default=False,
        help="disables saving weights after each epoch. By default ludwig saves "
        "weights after each epoch for enabling resuming of training, but "
        "if the model is really big that can be time consuming and will "
        "save twice as much space, use this parameter to skip it",
    )
    parser.add_argument(
        "-ssl",
        "--skip_save_log",
        action="store_true",
        default=False,
        help="disables saving TensorBoard logs. By default Ludwig saves "
        "logs for the TensorBoard, but if it is not needed turning it off "
        "can slightly increase the overall speed",
    )

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument(
        "-rs",
        "--random_seed",
        type=int,
        default=42,
        help="a random seed that is going to be used anywhere there is a call "
        "to a random number generator: data splitting, parameter "
        "initialization and training set shuffling",
    )
    parser.add_argument("-g",
                        "--gpus",
                        nargs="+",
                        type=int,
                        default=None,
                        help="list of gpus to use")
    parser.add_argument('-gml',
                        '--gpu_memory_limit',
                        type=int,
                        default=None,
                        help='maximum memory in MB to allocate per GPU device')
    parser.add_argument(
        "-uh",
        "--use_horovod",
        action="store_true",
        default=False,
        help="uses horovod for distributed training",
    )
    parser.add_argument(
        "-dbg",
        "--debug",
        action="store_true",
        default=False,
        help="enables debugging mode",
    )
    parser.add_argument(
        "-l",
        "--logging_level",
        default="info",
        help="the level of logging to use",
        choices=["critical", "error", "warning", "info", "debug", "notset"],
    )

    args = parser.parse_args(sys_argv)

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger('ludwig').setLevel(args.logging_level)
    global logger
    logger = logging.getLogger('ludwig.hyperopt')

    set_on_master(args.use_horovod)

    if is_on_master():
        print_ludwig("Hyperopt", LUDWIG_VERSION)

    hyperopt_cli(**vars(args))
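
A hedged example of driving this parser programmatically; 'model.yaml' and 'train.csv' are placeholder paths, and the YAML file is assumed to contain a config with a valid hyperopt section:

# Hypothetical invocation, equivalent to the shell command:
#   ludwig hyperopt --config_file model.yaml --dataset train.csv
cli([
    '--config_file', 'model.yaml',   # mutually exclusive with -c/--config
    '--dataset', 'train.csv',
    '--output_directory', 'results',
])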
Code example #3
def cli(sys_argv):
    parser = argparse.ArgumentParser(
        description='This script trains and tests a model',
        prog='ludwig experiment',
        usage='%(prog)s [options]')

    # ----------------------------
    # Experiment naming parameters
    # ----------------------------
    parser.add_argument('--output_directory',
                        type=str,
                        default='results',
                        help='directory that contains the results')
    parser.add_argument('--experiment_name',
                        type=str,
                        default='experiment',
                        help='experiment name')
    parser.add_argument('--model_name',
                        type=str,
                        default='run',
                        help='name for the model')

    # ---------------
    # Data parameters
    # ---------------
    parser.add_argument(
        '--dataset',
        help='input data file path. '
        'If it has a split column, it will be used for splitting '
        '(0: train, 1: validation, 2: test), '
        'otherwise the dataset will be randomly split')
    parser.add_argument('--training_set', help='input train data file path')
    parser.add_argument('--validation_set',
                        help='input validation data file path')
    parser.add_argument('--test_set', help='input test data file path')

    parser.add_argument(
        '--training_set_metadata',
        help='input metadata JSON file path. An intermediate preprocessed file '
        'containing the mappings of the input file created '
        'the first time a file is used, in the same directory '
        'with the same name and a .json extension')

    parser.add_argument('--data_format',
                        help='format of the input data',
                        default='auto',
                        choices=['auto', 'csv', 'hdf5'])

    parser.add_argument('-sspi',
                        '--skip_save_processed_input',
                        help='skips saving intermediate HDF5 and JSON files',
                        action='store_true',
                        default=False)
    parser.add_argument('-ssuo',
                        '--skip_save_unprocessed_output',
                        help='skips saving intermediate NPY output files',
                        action='store_true',
                        default=False)

    # -----------------
    # K-fold parameters
    # -----------------
    parser.add_argument(
        '-kf',
        '--k_fold',
        type=int,
        default=None,
        help='number of folds for a k-fold cross-validation run')
    parser.add_argument(
        '-skfsi',
        '--skip_save_k_fold_split_indices',
        action='store_true',
        default=False,
        help='disables saving the indices generated to split the training '
        'data set for the k-fold cross-validation run')

    # ----------------
    # Model parameters
    # ----------------
    model_definition = parser.add_mutually_exclusive_group(required=True)
    model_definition.add_argument('-md',
                                  '--model_definition',
                                  type=yaml.safe_load,
                                  help='model definition')
    model_definition.add_argument(
        '-mdf',
        '--model_definition_file',
        help='YAML file describing the model. Ignores --model_definition')

    parser.add_argument(
        '-mlp',
        '--model_load_path',
        help='path of a pretrained model to load as initialization')
    parser.add_argument(
        '-mrp',
        '--model_resume_path',
        help='path of the model directory to resume training of')
    parser.add_argument('-sstd',
                        '--skip_save_training_description',
                        action='store_true',
                        default=False,
                        help='disables saving the description JSON file')
    parser.add_argument('-ssts',
                        '--skip_save_training_statistics',
                        action='store_true',
                        default=False,
                        help='disables saving training statistics JSON file')
    parser.add_argument('-sstp',
                        '--skip_save_predictions',
                        help='skips saving test predictions CSV files',
                        action='store_true',
                        default=False)
    parser.add_argument('-sstes',
                        '--skip_save_eval_stats',
                        help='skips saving eval statistics JSON file',
                        action='store_true',
                        default=False)
    parser.add_argument(
        '-ssm',
        '--skip_save_model',
        action='store_true',
        default=False,
        help='disables saving model weights and hyperparameters each time '
        'the model improves. '
        'By default Ludwig saves model weights after each epoch '
        'the validation metric improves, but if the model is really big '
        'that can be time consuming. If you do not want to keep '
        'the weights and just find out what performance a model can get '
        'with a set of hyperparameters, use this parameter to skip it, '
        'but the model will not be loadable later on')
    parser.add_argument(
        '-ssp',
        '--skip_save_progress',
        action='store_true',
        default=False,
        help='disables saving progress each epoch. By default Ludwig saves '
        'weights and stats after each epoch to enable resuming '
        'of training, but if the model is really big that can be '
        'time consuming and will use twice as much space; use '
        'this parameter to skip it, but training cannot be resumed '
        'later on')
    parser.add_argument(
        '-ssl',
        '--skip_save_log',
        action='store_true',
        default=False,
        help='disables saving TensorBoard logs. By default Ludwig saves '
        'TensorBoard logs, but if they are not needed turning them off '
        'can slightly increase the overall speed')

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument(
        '-rs',
        '--random_seed',
        type=int,
        default=42,
        help='a random seed that is going to be used anywhere there is a call '
        'to a random number generator: data splitting, parameter '
        'initialization and training set shuffling')
    parser.add_argument('-g',
                        '--gpus',
                        nargs='+',
                        type=int,
                        default=None,
                        help='list of GPUs to use')
    parser.add_argument('-gml',
                        '--gpu_memory_limit',
                        type=int,
                        default=None,
                        help='maximum memory in MB to allocate per GPU device')
    parser.add_argument(
        '-dpt',
        '--disable_parallel_threads',
        action='store_false',
        dest='allow_parallel_threads',
        help='prevent TensorFlow from using multithreading, for reproducibility'
    )
    parser.add_argument('-uh',
                        '--use_horovod',
                        action='store_true',
                        default=None,
                        help='uses horovod for distributed training')
    parser.add_argument('-dbg',
                        '--debug',
                        action='store_true',
                        default=False,
                        help='enables debugging mode')
    parser.add_argument(
        '-l',
        '--logging_level',
        default='info',
        help='the level of logging to use',
        choices=['critical', 'error', 'warning', 'info', 'debug', 'notset'])

    args = parser.parse_args(sys_argv)

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger('ludwig').setLevel(args.logging_level)
    global logger
    logger = logging.getLogger('ludwig.experiment')

    set_on_master(args.use_horovod)

    if is_on_master():
        print_ludwig('Experiment', LUDWIG_VERSION)

    if args.k_fold is None:
        experiment_cli(**vars(args))
    else:
        kfold_cross_validate_cli(**vars(args))
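
A sketch of driving this CLI from Python; both file paths are placeholders. Note that passing -kf routes the run to kfold_cross_validate_cli instead of experiment_cli:

# Hypothetical invocations; 'model.yaml' and 'data.csv' are placeholders.
cli(['-mdf', 'model.yaml', '--dataset', 'data.csv'])              # single experiment
cli(['-mdf', 'model.yaml', '--dataset', 'data.csv', '-kf', '5'])  # 5-fold CV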
Code example #4
def cli(sys_argv):
    parser = argparse.ArgumentParser(
        description='This script loads a pretrained model '
        'and evaluates its performance by comparing '
        'its predictions with ground truth.',
        prog='ludwig evaluate',
        usage='%(prog)s [options]')

    # ---------------
    # Data parameters
    # ---------------
    parser.add_argument('--dataset',
                        help='input data file path',
                        required=True)
    parser.add_argument('--data_format',
                        help='format of the input data',
                        default='auto',
                        choices=[
                            'auto', 'csv', 'excel', 'feather', 'fwf', 'hdf5',
                            'html', 'json', 'jsonl', 'parquet', 'pickle',
                            'sas', 'spss', 'stata', 'tsv'
                        ])
    parser.add_argument('-s',
                        '--split',
                        default=FULL,
                        choices=[TRAINING, VALIDATION, TEST, FULL],
                        help='the split to test the model on')

    # ----------------
    # Model parameters
    # ----------------
    parser.add_argument('-m',
                        '--model_path',
                        help='model to load',
                        required=True)

    # -------------------------
    # Output results parameters
    # -------------------------
    parser.add_argument('-od',
                        '--output_directory',
                        type=str,
                        default='results',
                        help='directory that contains the results')
    parser.add_argument('-ssuo',
                        '--skip_save_unprocessed_output',
                        help='skips saving intermediate NPY output files',
                        action='store_true',
                        default=False)
    parser.add_argument('-sses',
                        '--skip_save_eval_stats',
                        help='skips saving intermediate JSON eval statistics',
                        action='store_true',
                        default=False)
    parser.add_argument('-scp',
                        '--skip_collect_predictions',
                        help='skips collecting predictions',
                        action='store_true',
                        default=False)
    parser.add_argument('-scos',
                        '--skip_collect_overall_stats',
                        help='skips collecting overall stats',
                        action='store_true',
                        default=False)

    # ------------------
    # Generic parameters
    # ------------------
    parser.add_argument('-bs',
                        '--batch_size',
                        type=int,
                        default=128,
                        help='size of batches')

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument('-g',
                        '--gpus',
                        nargs='+',
                        type=int,
                        default=None,
                        help='list of GPUs to use')
    parser.add_argument('-gml',
                        '--gpu_memory_limit',
                        type=int,
                        default=None,
                        help='maximum memory in MB to allocate per GPU device')
    parser.add_argument(
        '-dpt',
        '--disable_parallel_threads',
        action='store_false',
        dest='allow_parallel_threads',
        help='prevent TensorFlow from using multithreading, for reproducibility'
    )
    parser.add_argument('-uh',
                        '--use_horovod',
                        action='store_true',
                        default=None,
                        help='uses horovod for distributed training')
    parser.add_argument('-dbg',
                        '--debug',
                        action='store_true',
                        default=False,
                        help='enables debugging mode')
    parser.add_argument(
        '-l',
        '--logging_level',
        default='info',
        help='the level of logging to use',
        choices=['critical', 'error', 'warning', 'info', 'debug', 'notset'])

    args = parser.parse_args(sys_argv)
    args.evaluate_performance = True

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger('ludwig').setLevel(args.logging_level)
    global logger
    logger = logging.getLogger('ludwig.test_performance')

    set_on_master(args.use_horovod)

    if is_on_master():
        print_ludwig('Test', LUDWIG_VERSION)
        logger.info('Dataset path: {}'.format(args.dataset))
        logger.info('Model path: {}'.format(args.model_path))
        logger.info('')

    evaluate_cli(**vars(args))
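
An illustrative invocation follows; the dataset and model paths are placeholders, and the split constants are assumed to map to the lowercase strings shown:

# Hypothetical invocation, equivalent to the shell command:
#   ludwig evaluate --dataset test.csv -m results/experiment_run/model -s test
cli([
    '--dataset', 'test.csv',
    '-m', 'results/experiment_run/model',
    '-s', 'test',   # assumes TEST == 'test'; the default is the full dataset
])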
Code example #5
File: experiment.py  Project: sycomix/ludwig
def experiment_cli(
        config: dict,
        config_file: str = None,
        dataset: Union[str, dict, pd.DataFrame] = None,
        training_set: Union[str, dict, pd.DataFrame] = None,
        validation_set: Union[str, dict, pd.DataFrame] = None,
        test_set: Union[str, dict, pd.DataFrame] = None,
        training_set_metadata: Union[str, dict] = None,
        data_format: str = None,
        experiment_name: str = 'experiment',
        model_name: str = 'run',
        model_load_path: str = None,
        model_resume_path: str = None,
        eval_split: str = TEST,
        skip_save_training_description: bool = False,
        skip_save_training_statistics: bool = False,
        skip_save_model: bool = False,
        skip_save_progress: bool = False,
        skip_save_log: bool = False,
        skip_save_processed_input: bool = False,
        skip_save_unprocessed_output: bool = False,
        skip_save_predictions: bool = False,
        skip_save_eval_stats: bool = False,
        skip_collect_predictions: bool = False,
        skip_collect_overall_stats: bool = False,
        output_directory: str = 'results',
        gpus: Union[str, int, List[int]] = None,
        gpu_memory_limit: int = None,
        allow_parallel_threads: bool = True,
        use_horovod: bool = None,
        random_seed: int = default_random_seed,
        debug: bool = False,
        logging_level: int = logging.INFO,
        **kwargs
):
    """Trains a model on a dataset's training and validation splits and
    uses it to predict on the test split.
    It saves the trained model and the statistics of training and testing.

    # Inputs

    :param config: (dict) config which defines the different
        parameters of the model, features, preprocessing and training.
    :param config_file: (str, default: `None`) the filepath string
        that specifies the config.  It is a YAML file.
    :param dataset: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing the entire dataset to be used in the experiment.
        If it has a split column, it will be used for splitting (0 for train,
        1 for validation, 2 for test), otherwise the dataset will be
        randomly split.
    :param training_set: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing training data.
    :param validation_set: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing validation data.
    :param test_set: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing test data.
    :param training_set_metadata: (Union[str, dict], default: `None`)
        metadata JSON file or loaded metadata.  Intermediate preprocess
        structure containing the mappings of the input
        dataset created the first time an input file is used in the same
        directory with the same name and a '.meta.json' extension.
    :param data_format: (str, default: `None`) format to interpret data
        sources. Will be inferred automatically if not specified.  Valid
        formats are `'auto'`, `'csv'`, `'excel'`, `'feather'`,
        `'fwf'`, `'hdf5'` (cache file produced during previous training),
        `'html'` (file containing a single HTML `<table>`), `'json'`, `'jsonl'`,
        `'parquet'`, `'pickle'` (pickled Pandas DataFrame), `'sas'`, `'spss'`,
        `'stata'`, `'tsv'`.
    :param experiment_name: (str, default: `'experiment'`) name for
        the experiment.
    :param model_name: (str, default: `'run'`) name of the model that is
        being used.
    :param model_load_path: (str, default: `None`) if this is specified the
        loaded model will be used as initialization
        (useful for transfer learning).
    :param model_resume_path: (str, default: `None`) resumes training of
        the model from the path specified. The config is restored.
        In addition to config, training statistics and loss for
        epoch and the state of the optimizer are restored such that
        training can be effectively continued from a previously interrupted
        training process.
    :param eval_split: (str, default: `test`) split on which
        to perform evaluation. Valid values are `training`, `validation`
        and `test`.
    :param skip_save_training_description: (bool, default: `False`) disables
        saving the description JSON file.
    :param skip_save_training_statistics: (bool, default: `False`) disables
        saving training statistics JSON file.
    :param skip_save_model: (bool, default: `False`) disables
        saving model weights and hyperparameters each time the model
        improves. By default Ludwig saves model weights after each epoch
        the validation metric improves, but if the model is really big
        that can be time consuming. If you do not want to keep
        the weights and just want to find out what performance a model can
        get with a set of hyperparameters, use this parameter to skip it,
        but the model will not be loadable later on and the returned model
        will have the weights obtained at the end of training, instead of
        the weights of the epoch with the best validation performance.
    :param skip_save_progress: (bool, default: `False`) disables saving
        progress each epoch. By default Ludwig saves weights and stats
        after each epoch to enable resuming of training, but if
        the model is really big that can be time consuming and will use
        twice as much space. Use this parameter to skip it, but training
        cannot be resumed later on.
    :param skip_save_log: (bool, default: `False`) disables saving
        TensorBoard logs. By default Ludwig saves logs for the TensorBoard,
        but if it is not needed turning it off can slightly increase the
        overall speed.
    :param skip_save_processed_input: (bool, default: `False`) if an input
        dataset is provided it is preprocessed and cached by saving HDF5
        and JSON files to avoid running the preprocessing again. If this
        parameter is `True`, the HDF5 and JSON files are not saved.
    :param skip_save_unprocessed_output: (bool, default: `False`) by default
        predictions and their probabilities are saved in both raw
        unprocessed numpy files containing tensors and as postprocessed
        CSV files (one for each output feature). If this parameter is True,
        only the CSV ones are saved and the numpy ones are skipped.
    :param skip_save_predictions: (bool, default: `False`) skips saving test
        predictions CSV files.
    :param skip_save_eval_stats: (bool, default: `False`) skips saving test
        statistics JSON file.
    :param skip_collect_predictions: (bool, default: `False`) skips
        collecting post-processed predictions during eval.
    :param skip_collect_overall_stats: (bool, default: `False`) skips
        collecting overall stats during eval.
    :param output_directory: (str, default: `'results'`) the directory that
        will contain the training statistics, TensorBoard logs, the saved
        model and the training progress files.
    :param gpus: (list, default: `None`) list of GPUs that are available
        for training.
    :param gpu_memory_limit: (int, default: `None`) maximum memory in MB to
        allocate per GPU device.
    :param allow_parallel_threads: (bool, default: `True`) allow TensorFlow
        to use multithreading parallelism to improve performance at
        the cost of determinism.
    :param use_horovod: (bool, default: `None`) flag for using horovod.
    :param random_seed: (int, default: 42) random seed used for weights
        initialization, splits and any other random function.
    :param debug: (bool, default: `False`) if `True` turns on `tfdbg` with
        `inf_or_nan` checks.
    :param logging_level: (int) log level that will be sent to stderr.

    # Return
    :return: (Tuple[LudwigModel, dict, dict, tuple, str]) a tuple of
        `(model, evaluation_statistics, training_statistics,
        preprocessed_data, output_directory)`:
        `model` is the trained LudwigModel instance,
        `evaluation_statistics` is a dictionary with evaluation performance
        statistics on the test set,
        `training_statistics` is a dictionary of training statistics for
        each output feature containing loss and metrics values for each
        epoch,
        `preprocessed_data` is a tuple containing the preprocessed
        `(training_set, validation_set, test_set)`, and
        `output_directory` is the filepath string to where results
        are stored.

    """
    set_on_master(use_horovod)

    config = check_which_config(config,
                                config_file)

    if model_load_path:
        model = LudwigModel.load(model_load_path)
    else:
        model = LudwigModel(
            config=config,
            logging_level=logging_level,
            use_horovod=use_horovod,
            gpus=gpus,
            gpu_memory_limit=gpu_memory_limit,
            allow_parallel_threads=allow_parallel_threads,
        )
    (
        eval_stats,
        train_stats,
        preprocessed_data,
        output_directory
    ) = model.experiment(
        dataset=dataset,
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        training_set_metadata=training_set_metadata,
        data_format=data_format,
        experiment_name=experiment_name,
        model_name=model_name,
        model_resume_path=model_resume_path,
        eval_split=eval_split,
        skip_save_training_description=skip_save_training_description,
        skip_save_training_statistics=skip_save_training_statistics,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        skip_save_processed_input=skip_save_processed_input,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        skip_save_eval_stats=skip_save_eval_stats,
        skip_collect_predictions=skip_collect_predictions,
        skip_collect_overall_stats=skip_collect_overall_stats,
        output_directory=output_directory,
        random_seed=random_seed,
        debug=debug,
    )

    return model, eval_stats, train_stats, preprocessed_data, output_directory
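
A minimal sketch of the config-based API above, mirroring code example #1; the feature names and 'data.csv' are placeholders:

# Hypothetical usage sketch; feature names and 'data.csv' are placeholders.
config = {
    'input_features': [{'name': 'text', 'type': 'text'}],
    'output_features': [{'name': 'label', 'type': 'category'}],
}
model, eval_stats, train_stats, preprocessed_data, output_dir = experiment_cli(
    config,
    dataset='data.csv',
    eval_split='test',   # evaluate on the test split
)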
Code example #6
File: predict.py  Project: prmrreddy/ludwig
def cli(sys_argv):
    parser = argparse.ArgumentParser(
        description='This script loads a pretrained model '
        'and uses it to predict',
        prog='ludwig predict',
        usage='%(prog)s [options]')

    # ---------------
    # Data parameters
    # ---------------
    parser.add_argument('--dataset',
                        help='input data file path',
                        required=True)
    parser.add_argument('--data_format',
                        help='format of the input data',
                        default='auto',
                        choices=['auto', 'csv', 'hdf5'])

    # ----------------
    # Model parameters
    # ----------------
    parser.add_argument('-m',
                        '--model_path',
                        help='model to load',
                        required=True)

    # -------------------------
    # Output results parameters
    # -------------------------
    parser.add_argument('-od',
                        '--output_directory',
                        type=str,
                        default='results',
                        help='directory that contains the results')
    parser.add_argument('-ssuo',
                        '--skip_save_unprocessed_output',
                        help='skips saving intermediate NPY output files',
                        action='store_true',
                        default=False)
    parser.add_argument('-sstp',
                        '--skip_save_predictions',
                        help='skips saving predictions CSV files',
                        action='store_true',
                        default=False)

    # ------------------
    # Generic parameters
    # ------------------
    parser.add_argument('-bs',
                        '--batch_size',
                        type=int,
                        default=128,
                        help='size of batches')

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument('-g',
                        '--gpus',
                        nargs='+',
                        type=int,
                        default=None,
                        help='list of GPUs to use')
    parser.add_argument('-gml',
                        '--gpu_memory_limit',
                        type=int,
                        default=None,
                        help='maximum memory in MB to allocate per GPU device')
    parser.add_argument(
        '-dpt',
        '--disable_parallel_threads',
        action='store_false',
        dest='allow_parallel_threads',
        help='prevent TensorFlow from using multithreading, for reproducibility'
    )
    parser.add_argument('-uh',
                        '--use_horovod',
                        action='store_true',
                        default=None,
                        help='uses horovod for distributed training')
    parser.add_argument('-dbg',
                        '--debug',
                        action='store_true',
                        default=False,
                        help='enables debugging mode')
    parser.add_argument(
        '-l',
        '--logging_level',
        default='info',
        help='the level of logging to use',
        choices=['critical', 'error', 'warning', 'info', 'debug', 'notset'])

    args = parser.parse_args(sys_argv)

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger('ludwig').setLevel(args.logging_level)
    global logger
    logger = logging.getLogger('ludwig.predict')

    set_on_master(args.use_horovod)

    if is_on_master():
        print_ludwig('Predict', LUDWIG_VERSION)
        logger.info('Dataset path: {}'.format(args.dataset))
        logger.info('Model path: {}'.format(args.model_path))
        logger.info('')

    predict_cli(**vars(args))
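
An illustrative invocation with placeholder paths:

# Hypothetical invocation, equivalent to the shell command:
#   ludwig predict --dataset new_data.csv -m results/experiment_run/model
cli(['--dataset', 'new_data.csv', '-m', 'results/experiment_run/model'])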
Code example #7
def kfold_cross_validate(
        num_folds,
        model_definition,
        data_csv=None,
        skip_save_training_description=False,
        skip_save_training_statistics=False,
        skip_save_model=False,
        skip_save_progress=False,
        skip_save_log=False,
        skip_save_processed_input=False,
        skip_save_predictions=False,
        skip_save_eval_stats=False,
        skip_collect_predictions=False,
        skip_collect_overall_stats=False,
        output_directory='results',
        random_seed=default_random_seed,
        gpus=None,
        gpu_memory_limit=None,
        allow_parallel_threads=True,
        use_horovod=None,
        logging_level=logging.INFO,
        debug=False,
        **kwargs
):
    """Performs k-fold cross validation and returns result data structures.

    # Inputs

    :param num_folds: (int) number of folds to create for the
           cross-validation.
    :param model_definition: (dict, default: None) a dictionary containing
           information needed to build a model. Refer to the
           [User Guide](http://ludwig.ai/user_guide/#model-definition)
           for details.
    :param data_csv: (string, default: None) path to the CSV file
           containing the dataset to split into folds.
    :param output_directory: (string, default: 'results') directory that
           will contain the results.
    :param random_seed: (int) random seed used for the k-fold splits.

    # Return

    :return: (tuple(dict, dict)) a tuple of two dictionaries:
            `kfold_cv_stats` contains metrics from the cv run and
            `kfold_split_indices` contains the indices used to split the
            training data into training and test folds.
    """
    set_on_master(use_horovod)

    # check for k_fold
    if num_folds is None:
        raise ValueError(
            'k_fold parameter must be specified'
        )

    logger.info('starting {:d}-fold cross validation'.format(num_folds))

    # create output_directory if not available
    if not os.path.isdir(output_directory):
        os.mkdir(output_directory)

    # read in data to split for the folds
    data_df = pd.read_csv(data_csv)

    # place each fold in a separate directory
    data_dir = os.path.dirname(data_csv)

    kfold_cv_stats = {}
    kfold_split_indices = {}

    for train_indices, test_indices, fold_num in \
            generate_kfold_splits(data_df, num_folds, random_seed):
        with tempfile.TemporaryDirectory(dir=data_dir) as temp_dir_name:
            curr_train_df = data_df.iloc[train_indices]
            curr_test_df = data_df.iloc[test_indices]

            kfold_split_indices['fold_' + str(fold_num)] = {
                'training_indices': train_indices,
                'test_indices': test_indices
            }

            # train and validate model on this fold
            logger.info("training on fold {:d}".format(fold_num))

            model = LudwigModel(
                model_definition=model_definition,
                logging_level=logging_level,
                use_horovod=use_horovod,
                gpus=gpus,
                gpu_memory_limit=gpu_memory_limit,
                allow_parallel_threads=allow_parallel_threads,
            )
            (
                test_results,
                train_stats,
                preprocessed_data,
                output_directory
            ) = model.experiment(
                training_set=curr_train_df,
                test_set=curr_test_df,
                experiment_name='cross_validation',
                model_name='fold_' + str(fold_num),
                skip_save_training_description=skip_save_training_description,
                skip_save_training_statistics=skip_save_training_statistics,
                skip_save_model=skip_save_model,
                skip_save_progress=skip_save_progress,
                skip_save_log=skip_save_log,
                skip_save_processed_input=skip_save_processed_input,
                skip_save_predictions=skip_save_predictions,
                skip_save_eval_stats=skip_save_eval_stats,
                skip_collect_predictions=skip_collect_predictions,
                skip_collect_overall_stats=skip_collect_overall_stats,
                output_directory=os.path.join(temp_dir_name, 'results'),
                random_seed=random_seed,
                debug=debug,
            )

            # augment the training statistics with scoring metric from
            # the hold out fold
            train_stats['fold_test_results'] = test_results

            # collect training statistics for this fold
            kfold_cv_stats['fold_' + str(fold_num)] = train_stats

    # consolidate raw fold metrics across all folds
    raw_kfold_stats = {}
    for fold_name in kfold_cv_stats:
        curr_fold_test_results = kfold_cv_stats[fold_name]['fold_test_results']
        for of_name in curr_fold_test_results:
            if of_name not in raw_kfold_stats:
                raw_kfold_stats[of_name] = {}
            fold_test_results_of = curr_fold_test_results[of_name]

            for metric in fold_test_results_of:
                if metric not in {
                    'predictions',
                    'probabilities',
                    'confusion_matrix',
                    'overall_stats',
                    'per_class_stats',
                    'roc_curve',
                    'precision_recall_curve'
                }:
                    if metric not in raw_kfold_stats[of_name]:
                        raw_kfold_stats[of_name][metric] = []
                    raw_kfold_stats[of_name][metric].append(
                        fold_test_results_of[metric]
                    )

    # calculate overall kfold statistics
    overall_kfold_stats = {}
    for of_name in raw_kfold_stats:
        overall_kfold_stats[of_name] = {}
        for metric in raw_kfold_stats[of_name]:
            mean = np.mean(raw_kfold_stats[of_name][metric])
            std = np.std(raw_kfold_stats[of_name][metric])
            overall_kfold_stats[of_name][metric + '_mean'] = mean
            overall_kfold_stats[of_name][metric + '_std'] = std

    kfold_cv_stats['overall'] = overall_kfold_stats

    logger.info('completed {:d}-fold cross validation'.format(num_folds))

    return kfold_cv_stats, kfold_split_indices
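
A hedged usage sketch follows; model_definition is a dict like the one in code example #1 and 'data.csv' is a placeholder path. Per-fold metrics are aggregated into '<metric>_mean' and '<metric>_std' entries under the 'overall' key:

# Hypothetical usage sketch; 'data.csv' is a placeholder path.
kfold_cv_stats, kfold_split_indices = kfold_cross_validate(
    num_folds=5,
    model_definition=model_definition,   # dict as in code example #1
    data_csv='data.csv',
)
print(kfold_cv_stats['overall'])   # '<metric>_mean' / '<metric>_std' per output feature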