def cli_render_config(sys_argv):
    parser = argparse.ArgumentParser(
        description="This script renders the full config from a user config.",
        prog="ludwig render_config",
        usage="%(prog)s [options]",
    )
    parser.add_argument(
        "-c",
        "--config",
        type=load_config_from_str,
        help="input user YAML config path",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=str,
        help="output rendered YAML config path",
        required=False,
    )

    add_contrib_callback_args(parser)
    args = parser.parse_args(sys_argv)

    args.callbacks = args.callbacks or []
    for callback in args.callbacks:
        callback.on_cmdline("render_config", *sys_argv)

    print_ludwig("Render Config", LUDWIG_VERSION)
    render_config(**vars(args))
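
# Usage sketch (hypothetical file names, not from the source): render the fully
# populated config that Ludwig derives from a minimal user config, mirroring
# `ludwig render_config -c user_config.yaml -o rendered_config.yaml`:
#
#   cli_render_config(["--config", "user_config.yaml", "--output", "rendered_config.yaml"])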
def cli_export_triton(sys_argv):
    parser = argparse.ArgumentParser(
        description="This script loads a pretrained model and saves it as torchscript for Triton.",
        prog="ludwig export_triton",
        usage="%(prog)s [options]",
    )

    # ----------------
    # Model parameters
    # ----------------
    parser.add_argument("-m", "--model_path", help="model to load", required=True)
    parser.add_argument("-mn", "--model_name", help="model name", default="ludwig_model")
    parser.add_argument("-mv", "--model_version", type=int, help="model version", default=1)

    # -----------------
    # Output parameters
    # -----------------
    parser.add_argument("-od", "--output_path", type=str, help="path where to save the exported model", required=True)

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument(
        "-l",
        "--logging_level",
        default="info",
        help="the level of logging to use",
        choices=["critical", "error", "warning", "info", "debug", "notset"],
    )

    add_contrib_callback_args(parser)
    args = parser.parse_args(sys_argv)

    args.callbacks = args.callbacks or []
    for callback in args.callbacks:
        callback.on_cmdline("export_triton", *sys_argv)

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger("ludwig").setLevel(args.logging_level)
    global logger
    logger = logging.getLogger("ludwig.export")

    print_ludwig("Export Triton", LUDWIG_VERSION)
    export_triton(**vars(args))
def cli_init_config(sys_argv):
    parser = argparse.ArgumentParser(
        description="This script initializes a valid config from a dataset.",
        prog="ludwig init_config",
        usage="%(prog)s [options]",
    )
    parser.add_argument(
        "-d",
        "--dataset",
        type=str,
        help="input data file path",
    )
    parser.add_argument(
        "-t",
        "--target",
        type=str,
        help="target(s) to predict as output features of the model",
        action="append",
        required=False,
    )
    parser.add_argument(
        "--time_limit_s",
        type=int,
        help="time limit to train the model in seconds when using hyperopt",
        required=False,
    )
    parser.add_argument(
        "--tune_for_memory",
        type=bool,
        help="refine hyperopt search space based on available host / GPU memory",
        default=False,
        required=False,
    )
    parser.add_argument(
        "--hyperopt",
        type=bool,
        help="include automl hyperopt config",
        default=False,
        required=False,
    )
    parser.add_argument(
        "-o",
        "--output",
        type=str,
        help="output initialized YAML config path",
        required=False,
    )

    add_contrib_callback_args(parser)
    args = parser.parse_args(sys_argv)

    args.callbacks = args.callbacks or []
    for callback in args.callbacks:
        callback.on_cmdline("init_config", *sys_argv)

    print_ludwig("Init Config", LUDWIG_VERSION)
    init_config(**vars(args))
def cli(sys_argv):
    parser = argparse.ArgumentParser(
        description="This script serves a pretrained model", prog="ludwig serve", usage="%(prog)s [options]"
    )

    # ----------------
    # Model parameters
    # ----------------
    parser.add_argument("-m", "--model_path", help="model to load", required=True)
    parser.add_argument(
        "-l",
        "--logging_level",
        default="info",
        help="the level of logging to use",
        choices=["critical", "error", "warning", "info", "debug", "notset"],
    )

    # -----------------
    # Server parameters
    # -----------------
    parser.add_argument(
        "-p",
        "--port",
        help="port for server (default: 8000)",
        default=8000,
        type=int,
    )
    parser.add_argument("-H", "--host", help="host for server (default: 0.0.0.0)", default="0.0.0.0")
    parser.add_argument(
        "-ao",
        "--allowed_origins",
        nargs="*",
        help="A list of origins that should be permitted to make cross-origin requests. "
        'Use "*" to allow any origin. See https://www.starlette.io/middleware/#corsmiddleware.',
    )

    add_contrib_callback_args(parser)
    args = parser.parse_args(sys_argv)

    args.callbacks = args.callbacks or []
    for callback in args.callbacks:
        callback.on_cmdline("serve", *sys_argv)

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger("ludwig").setLevel(args.logging_level)
    global logger
    logger = logging.getLogger("ludwig.serve")

    print_ludwig("Serve", LUDWIG_VERSION)
    run_server(args.model_path, args.host, args.port, args.allowed_origins)
def cli_export_mlflow(sys_argv):
    parser = argparse.ArgumentParser(
        description='This script loads a pretrained model '
                    'and saves it as an MLFlow model.',
        prog='ludwig export_mlflow',
        usage='%(prog)s [options]')

    # ----------------
    # Model parameters
    # ----------------
    parser.add_argument('-m', '--model_path', help='model to load',
                        required=True)
    parser.add_argument(
        '-mn', '--registered_model_name',
        help='model name to upload to in MLflow model registry',
        default='mlflow')

    # -----------------
    # Output parameters
    # -----------------
    parser.add_argument('-od', '--output_path', type=str,
                        help='path where to save the exported model',
                        required=True)

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument(
        '-l', '--logging_level', default='info',
        help='the level of logging to use',
        choices=['critical', 'error', 'warning', 'info', 'debug', 'notset'])

    add_contrib_callback_args(parser)
    args = parser.parse_args(sys_argv)

    args.callbacks = args.callbacks or []
    for callback in args.callbacks:
        callback.on_cmdline('export_mlflow', *sys_argv)

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger('ludwig').setLevel(args.logging_level)
    global logger
    logger = logging.getLogger('ludwig.export')

    print_ludwig('Export MLFlow', LUDWIG_VERSION)
    export_mlflow(**vars(args))
def cli_collect_weights(sys_argv):
    """Command Line Interface to collect the weights of the model.

    --m: Input model that is necessary to collect the tensors, this is a
         required *option*
    --t: Tensors to collect
    --od: Output directory of the model, defaults to results
    --v: Verbose: Defines the logging level that the user will be exposed to
    """
    parser = argparse.ArgumentParser(
        description="This script loads a pretrained model and uses it to collect weights.",
        prog="ludwig collect_weights",
        usage="%(prog)s [options]",
    )

    # ----------------
    # Model parameters
    # ----------------
    parser.add_argument("-m", "--model_path", help="model to load", required=True)
    parser.add_argument("-t", "--tensors", help="tensors to collect", nargs="+", required=True)

    # -------------------------
    # Output results parameters
    # -------------------------
    parser.add_argument(
        "-od", "--output_directory", type=str, default="results", help="directory that contains the results"
    )

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument(
        "-l",
        "--logging_level",
        default="info",
        help="the level of logging to use",
        choices=["critical", "error", "warning", "info", "debug", "notset"],
    )

    add_contrib_callback_args(parser)
    args = parser.parse_args(sys_argv)

    args.callbacks = args.callbacks or []
    for callback in args.callbacks:
        callback.on_cmdline("collect_weights", *sys_argv)

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger("ludwig").setLevel(args.logging_level)
    global logger
    logger = logging.getLogger("ludwig.collect")

    print_ludwig("Collect Weights", LUDWIG_VERSION)
    collect_weights(**vars(args))
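
# Usage sketch (hypothetical model path and tensor name, not from the source):
# list the available weight names first with the collect_summary command, then
# pass the ones to extract here:
#
#   cli_collect_weights(
#       ["-m", "results/experiment_run/model",
#        "-t", "some_layer.weight",
#        "-od", "collected_weights"]
#   )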
def cli_collect_summary(sys_argv):
    """Command Line Interface to collect a summary of the model layers and weights.

    --m: Input model that is necessary to collect the tensors, this is a
         required *option*
    --v: Verbose: Defines the logging level that the user will be exposed to
    """
    parser = argparse.ArgumentParser(
        description="This script loads a pretrained model "
        "and prints names of weights and layer activations "
        "to use with other collect commands",
        prog="ludwig collect_summary",
        usage="%(prog)s [options]",
    )

    # ----------------
    # Model parameters
    # ----------------
    parser.add_argument("-m", "--model_path", help="model to load", required=True)

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument(
        "-l",
        "--logging_level",
        default="info",
        help="the level of logging to use",
        choices=["critical", "error", "warning", "info", "debug", "notset"],
    )

    add_contrib_callback_args(parser)
    args = parser.parse_args(sys_argv)

    args.callbacks = args.callbacks or []
    for callback in args.callbacks:
        callback.on_cmdline("collect_summary", *sys_argv)

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger("ludwig").setLevel(args.logging_level)
    global logger
    logger = logging.getLogger("ludwig.collect")

    print_ludwig("Collect Summary", LUDWIG_VERSION)
    print_model_summary(**vars(args))
def cli(sys_argv):
    parser = argparse.ArgumentParser(
        description="This script searches for optimal hyperparameters.",
        prog="ludwig hyperopt",
        usage="%(prog)s [options]",
    )

    # -------------------
    # Hyperopt parameters
    # -------------------
    parser.add_argument(
        "-sshs",
        "--skip_save_hyperopt_statistics",
        help="skips saving hyperopt statistics file",
        action="store_true",
        default=False,
    )

    # ----------------------------
    # Experiment naming parameters
    # ----------------------------
    parser.add_argument(
        "--output_directory",
        type=str,
        default="results",
        help="directory that contains the results",
    )
    parser.add_argument("--experiment_name", type=str, default="hyperopt", help="experiment name")
    parser.add_argument("--model_name", type=str, default="run", help="name for the model")

    # ---------------
    # Data parameters
    # ---------------
    parser.add_argument(
        "--dataset",
        help="input data file path. "
        "If it has a split column, it will be used for splitting "
        "(0: train, 1: validation, 2: test), "
        "otherwise the dataset will be randomly split",
    )
    parser.add_argument("--training_set", help="input train data file path")
    parser.add_argument("--validation_set", help="input validation data file path")
    parser.add_argument("--test_set", help="input test data file path")
    parser.add_argument(
        "--training_set_metadata",
        help="input metadata JSON file path. An intermediate preprocessed file "
        "containing the mappings of the input file created "
        "the first time a file is used, in the same directory "
        "with the same name and a .json extension",
    )
    parser.add_argument(
        "--data_format",
        help="format of the input data",
        default="auto",
        choices=[
            "auto",
            "csv",
            "excel",
            "feather",
            "fwf",
            "hdf5",
            "html",
            "tables",
            "json",
            "jsonl",
            "parquet",
            "pickle",
            "sas",
            "spss",
            "stata",
            "tsv",
        ],
    )
    parser.add_argument(
        "-sspi",
        "--skip_save_processed_input",
        help="skips saving intermediate HDF5 and JSON files",
        action="store_true",
        default=False,
    )

    # ----------------
    # Model parameters
    # ----------------
    config = parser.add_mutually_exclusive_group(required=True)
    config.add_argument(
        "-c",
        "--config",
        type=load_yaml,
        help="Path to the YAML file containing the model configuration",
    )
    config.add_argument(
        "-cs",
        "--config_str",
        dest="config",
        type=load_config_from_str,
        help="JSON or YAML serialized string of the model configuration",
    )
    parser.add_argument(
        "-mlp",
        "--model_load_path",
        help="path of a pretrained model to load as initialization",
    )
    parser.add_argument(
        "-mrp",
        "--model_resume_path",
        help="path of the model directory to resume training of",
    )
    parser.add_argument(
        "-sstd",
        "--skip_save_training_description",
        action="store_true",
        default=False,
        help="disables saving the description JSON file",
    )
    parser.add_argument(
        "-ssts",
        "--skip_save_training_statistics",
        action="store_true",
        default=False,
        help="disables saving training statistics JSON file",
    )
    parser.add_argument(
        "-ssm",
        "--skip_save_model",
        action="store_true",
        default=False,
        help="disables saving weights each time the model improves. "
        "By default Ludwig saves weights after each epoch "
        "the validation metric improves, but if the model is really big "
        "that can be time consuming. If you do not want to keep "
        "the weights and just find out what performance a model can get "
        "with a set of hyperparameters, use this parameter to skip it",
    )
    parser.add_argument(
        "-ssp",
        "--skip_save_progress",
        action="store_true",
        default=False,
        help="disables saving weights after each epoch. By default Ludwig saves "
        "weights after each epoch for enabling resuming of training, but "
        "if the model is really big that can be time consuming and will "
        "save twice as much space, use this parameter to skip it",
    )
    parser.add_argument(
        "-ssl",
        "--skip_save_log",
        action="store_true",
        default=False,
        help="disables saving TensorBoard logs. By default Ludwig saves "
        "logs for the TensorBoard, but if it is not needed turning it off "
        "can slightly increase the overall speed",
    )

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument(
        "-rs",
        "--random_seed",
        type=int,
        default=42,
        help="a random seed that is going to be used anywhere there is a call "
        "to a random number generator: data splitting, parameter "
        "initialization and training set shuffling",
    )
    parser.add_argument(
        "-hlv",
        "--hyperopt_log_verbosity",
        type=int,
        default=3,
        choices=[0, 1, 2, 3],
        help="Controls verbosity of ray tune log messages. Valid values: "
        "0 = silent, 1 = only status updates, 2 = status and brief trial "
        "results, 3 = status and detailed trial results.",
    )
    parser.add_argument("-g", "--gpus", nargs="+", type=int, default=None, help="list of gpus to use")
    parser.add_argument(
        "-gml", "--gpu_memory_limit", type=int, default=None, help="maximum memory in MB to allocate per GPU device"
    )
    parser.add_argument(
        "-b",
        "--backend",
        help="specifies backend to use for parallel / distributed execution, "
        "defaults to local execution or Horovod if called using horovodrun",
        choices=ALL_BACKENDS,
    )
    parser.add_argument(
        "-l",
        "--logging_level",
        default="info",
        help="the level of logging to use",
        choices=["critical", "error", "warning", "info", "debug", "notset"],
    )

    add_contrib_callback_args(parser)
    args = parser.parse_args(sys_argv)

    args.callbacks = args.callbacks or []
    for callback in args.callbacks:
        callback.on_cmdline("hyperopt", *sys_argv)

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger("ludwig").setLevel(args.logging_level)
    global logger
    logger = logging.getLogger("ludwig.hyperopt")

    args.backend = initialize_backend(args.backend or args.config.get("backend"))
    if args.backend.is_coordinator():
        print_ludwig("Hyperopt", LUDWIG_VERSION)

    hyperopt_cli(**vars(args))
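
# Usage sketch (hypothetical paths; assumes the YAML config contains the
# hyperopt section this command requires):
#
#   cli(["--dataset", "train.csv", "-c", "config_with_hyperopt.yaml",
#        "--output_directory", "hyperopt_results"])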
def cli(sys_argv):
    parser = argparse.ArgumentParser(
        description='This script loads a pretrained model '
                    'and uses it to predict',
        prog='ludwig predict',
        usage='%(prog)s [options]')

    # ---------------
    # Data parameters
    # ---------------
    parser.add_argument('--dataset', help='input data file path',
                        required=True)
    parser.add_argument('--data_format', help='format of the input data',
                        default='auto',
                        choices=['auto', 'csv', 'excel', 'feather', 'fwf',
                                 'hdf5', 'html', 'tables', 'json', 'jsonl',
                                 'parquet', 'pickle', 'sas', 'spss', 'stata',
                                 'tsv'])
    parser.add_argument('-s', '--split', default=FULL,
                        choices=[TRAINING, VALIDATION, TEST, FULL],
                        help='the split to test the model on')

    # ----------------
    # Model parameters
    # ----------------
    parser.add_argument('-m', '--model_path', help='model to load',
                        required=True)

    # -------------------------
    # Output results parameters
    # -------------------------
    parser.add_argument('-od', '--output_directory', type=str,
                        default='results',
                        help='directory that contains the results')
    parser.add_argument('-ssuo', '--skip_save_unprocessed_output',
                        help='skips saving intermediate NPY output files',
                        action='store_true', default=False)
    parser.add_argument('-sstp', '--skip_save_predictions',
                        help='skips saving predictions CSV files',
                        action='store_true', default=False)

    # ------------------
    # Generic parameters
    # ------------------
    parser.add_argument('-bs', '--batch_size', type=int, default=128,
                        help='size of batches')

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument('-g', '--gpus', type=int, default=0,
                        help='list of gpus to use')
    parser.add_argument('-gml', '--gpu_memory_limit', type=int, default=None,
                        help='maximum memory in MB to allocate per GPU device')
    parser.add_argument(
        '-dpt', '--disable_parallel_threads',
        action='store_false', dest='allow_parallel_threads',
        help='disable TensorFlow from using multithreading for reproducibility')
    parser.add_argument(
        '-b', '--backend',
        help='specifies backend to use for parallel / distributed execution, '
             'defaults to local execution or Horovod if called using horovodrun',
        choices=ALL_BACKENDS)
    parser.add_argument('-dbg', '--debug', action='store_true', default=False,
                        help='enables debugging mode')
    parser.add_argument(
        '-l', '--logging_level', default='info',
        help='the level of logging to use',
        choices=['critical', 'error', 'warning', 'info', 'debug', 'notset'])

    add_contrib_callback_args(parser)
    args = parser.parse_args(sys_argv)

    args.callbacks = args.callbacks or []
    for callback in args.callbacks:
        callback.on_cmdline('predict', *sys_argv)

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger('ludwig').setLevel(args.logging_level)
    global logger
    logger = logging.getLogger('ludwig.predict')

    args.backend = initialize_backend(args.backend)
    if args.backend.is_coordinator():
        print_ludwig('Predict', LUDWIG_VERSION)
        logger.info('Dataset path: {}'.format(args.dataset))
        logger.info('Model path: {}'.format(args.model_path))
        logger.info('')

    predict_cli(**vars(args))
def cli(sys_argv):
    parser = argparse.ArgumentParser(
        description="This script loads a pretrained model "
        "and evaluates its performance by comparing "
        "its predictions with ground truth.",
        prog="ludwig evaluate",
        usage="%(prog)s [options]",
    )

    # ---------------
    # Data parameters
    # ---------------
    parser.add_argument("--dataset", help="input data file path", required=True)
    parser.add_argument(
        "--data_format",
        help="format of the input data",
        default="auto",
        choices=[
            "auto",
            "csv",
            "excel",
            "feather",
            "fwf",
            "hdf5",
            "html",
            "tables",
            "json",
            "jsonl",
            "parquet",
            "pickle",
            "sas",
            "spss",
            "stata",
            "tsv",
        ],
    )
    parser.add_argument(
        "-s",
        "--split",
        default=FULL,
        choices=[TRAINING, VALIDATION, TEST, FULL],
        help="the split to test the model on",
    )

    # ----------------
    # Model parameters
    # ----------------
    parser.add_argument("-m", "--model_path", help="model to load", required=True)

    # -------------------------
    # Output results parameters
    # -------------------------
    parser.add_argument(
        "-od", "--output_directory", type=str, default="results", help="directory that contains the results"
    )
    parser.add_argument(
        "-ssuo",
        "--skip_save_unprocessed_output",
        help="skips saving intermediate NPY output files",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "-sses",
        "--skip_save_eval_stats",
        help="skips saving intermediate JSON eval statistics",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "-scp", "--skip_collect_predictions", help="skips collecting predictions", action="store_true", default=False
    )
    parser.add_argument(
        "-scos",
        "--skip_collect_overall_stats",
        help="skips collecting overall stats",
        action="store_true",
        default=False,
    )

    # ------------------
    # Generic parameters
    # ------------------
    parser.add_argument("-bs", "--batch_size", type=int, default=128, help="size of batches")

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument("-g", "--gpus", type=int, default=0, help="list of gpus to use")
    parser.add_argument(
        "-gml", "--gpu_memory_limit", type=int, default=None, help="maximum memory in MB to allocate per GPU device"
    )
    parser.add_argument(
        "-dpt",
        "--disable_parallel_threads",
        action="store_false",
        dest="allow_parallel_threads",
        help="disable TensorFlow from using multithreading for reproducibility",
    )
    parser.add_argument(
        "-b",
        "--backend",
        help="specifies backend to use for parallel / distributed execution, "
        "defaults to local execution or Horovod if called using horovodrun",
        choices=ALL_BACKENDS,
    )
    parser.add_argument("-dbg", "--debug", action="store_true", default=False, help="enables debugging mode")
    parser.add_argument(
        "-l",
        "--logging_level",
        default="info",
        help="the level of logging to use",
        choices=["critical", "error", "warning", "info", "debug", "notset"],
    )

    add_contrib_callback_args(parser)
    args = parser.parse_args(sys_argv)
    args.evaluate_performance = True

    args.callbacks = args.callbacks or []
    for callback in args.callbacks:
        callback.on_cmdline("evaluate", *sys_argv)

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger("ludwig").setLevel(args.logging_level)
    global logger
    logger = logging.getLogger("ludwig.test_performance")

    args.backend = initialize_backend(args.backend)
    if args.backend.is_coordinator():
        print_ludwig("Evaluate", LUDWIG_VERSION)
        logger.info(f"Dataset path: {args.dataset}")
        logger.info(f"Model path: {args.model_path}")
        logger.info("")

    evaluate_cli(**vars(args))
def cli(sys_argv):
    parser = argparse.ArgumentParser(description='This script trains a model',
                                     prog='ludwig train',
                                     usage='%(prog)s [options]')

    # ----------------------------
    # Experiment naming parameters
    # ----------------------------
    parser.add_argument('--output_directory', type=str, default='results',
                        help='directory that contains the results')
    parser.add_argument('--experiment_name', type=str, default='experiment',
                        help='experiment name')
    parser.add_argument('--model_name', type=str, default='run',
                        help='name for the model')

    # ---------------
    # Data parameters
    # ---------------
    parser.add_argument(
        '--dataset',
        help='input data file path. '
             'If it has a split column, it will be used for splitting '
             '(0: train, 1: validation, 2: test), '
             'otherwise the dataset will be randomly split')
    parser.add_argument('--training_set', help='input train data file path')
    parser.add_argument('--validation_set',
                        help='input validation data file path')
    parser.add_argument('--test_set', help='input test data file path')
    parser.add_argument(
        '--training_set_metadata',
        help='input metadata JSON file path. An intermediate preprocessed file '
             'containing the mappings of the input file created '
             'the first time a file is used, in the same directory '
             'with the same name and a .json extension')
    parser.add_argument('--data_format', help='format of the input data',
                        default='auto',
                        choices=['auto', 'csv', 'excel', 'feather', 'fwf',
                                 'hdf5', 'html', 'tables', 'json', 'jsonl',
                                 'parquet', 'pickle', 'sas', 'spss', 'stata',
                                 'tsv'])
    parser.add_argument('-sspi', '--skip_save_processed_input',
                        help='skips saving intermediate HDF5 and JSON files',
                        action='store_true', default=False)

    # ----------------
    # Model parameters
    # ----------------
    config = parser.add_mutually_exclusive_group(required=True)
    config.add_argument(
        '-c', '--config',
        type=load_config_from_str,
        help='JSON or YAML serialized string of the model configuration')
    config.add_argument(
        '-cf', '--config_file',
        dest='config',
        type=load_yaml,
        help='Path to the YAML file containing the model configuration')
    parser.add_argument(
        '-mlp', '--model_load_path',
        help='path of a pretrained model to load as initialization')
    parser.add_argument(
        '-mrp', '--model_resume_path',
        help='path of the model directory to resume training of')
    parser.add_argument('-sstd', '--skip_save_training_description',
                        action='store_true', default=False,
                        help='disables saving the description JSON file')
    parser.add_argument('-ssts', '--skip_save_training_statistics',
                        action='store_true', default=False,
                        help='disables saving training statistics JSON file')
    parser.add_argument(
        '-ssm', '--skip_save_model',
        action='store_true', default=False,
        help='disables saving weights each time the model improves. '
             'By default Ludwig saves weights after each epoch '
             'the validation metric improves, but if the model is really big '
             'that can be time consuming. If you do not want to keep '
             'the weights and just find out what performance a model can get '
             'with a set of hyperparameters, use this parameter to skip it')
    parser.add_argument(
        '-ssp', '--skip_save_progress',
        action='store_true', default=False,
        help='disables saving weights after each epoch. By default Ludwig saves '
             'weights after each epoch for enabling resuming of training, but '
             'if the model is really big that can be time consuming and will '
             'save twice as much space, use this parameter to skip it')
    parser.add_argument(
        '-ssl', '--skip_save_log',
        action='store_true', default=False,
        help='disables saving TensorBoard logs. By default Ludwig saves '
             'logs for the TensorBoard, but if it is not needed turning it off '
             'can slightly increase the overall speed')

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument(
        '-rs', '--random_seed', type=int, default=42,
        help='a random seed that is going to be used anywhere there is a call '
             'to a random number generator: data splitting, parameter '
             'initialization and training set shuffling')
    parser.add_argument('-g', '--gpus', nargs='+', type=int, default=None,
                        help='list of gpus to use')
    parser.add_argument('-gml', '--gpu_memory_limit', type=int, default=None,
                        help='maximum memory in MB to allocate per GPU device')
    parser.add_argument(
        '-dpt', '--disable_parallel_threads',
        action='store_false', dest='allow_parallel_threads',
        help='disable TensorFlow from using multithreading for reproducibility')
    parser.add_argument(
        '-b', '--backend',
        help='specifies backend to use for parallel / distributed execution, '
             'defaults to local execution or Horovod if called using horovodrun',
        choices=ALL_BACKENDS)
    parser.add_argument('-dbg', '--debug', action='store_true', default=False,
                        help='enables debugging mode')
    parser.add_argument(
        '-l', '--logging_level', default='info',
        help='the level of logging to use',
        choices=['critical', 'error', 'warning', 'info', 'debug', 'notset'])

    add_contrib_callback_args(parser)
    args = parser.parse_args(sys_argv)

    args.callbacks = args.callbacks or []
    for callback in args.callbacks:
        callback.on_cmdline('train', *sys_argv)

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger('ludwig').setLevel(args.logging_level)
    global logger
    logger = logging.getLogger('ludwig.train')

    args.backend = initialize_backend(args.backend or args.config.get('backend'))
    if args.backend.is_coordinator():
        print_ludwig('Train', LUDWIG_VERSION)

    train_cli(**vars(args))
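
# Usage sketch (hypothetical file names): train from a CSV using a config file,
# mirroring `ludwig train --dataset train.csv -cf config.yaml` on the shell.
# Note that -c expects a serialized config string, while -cf takes a YAML path:
#
#   cli(['--dataset', 'train.csv', '-cf', 'config.yaml'])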
def cli(sys_argv):
    parser = argparse.ArgumentParser(
        description="This script generates a synthetic dataset.",
        prog="ludwig synthesize_dataset",
        usage="%(prog)s [options]",
    )
    parser.add_argument("-od", "--output_path", type=str, help="output CSV file path")
    parser.add_argument("-d", "--dataset_size", help="size of the dataset", type=int, default=100)
    parser.add_argument(
        "-f",
        "--features",
        default="[\
            {name: text_1, type: text, vocab_size: 20, max_len: 20}, \
            {name: text_2, type: text, vocab_size: 20, max_len: 20}, \
            {name: category_1, type: category, vocab_size: 10}, \
            {name: category_2, type: category, vocab_size: 15}, \
            {name: number_1, type: number}, \
            {name: number_2, type: number}, \
            {name: binary_1, type: binary}, \
            {name: binary_2, type: binary}, \
            {name: set_1, type: set, vocab_size: 20, max_len: 20}, \
            {name: set_2, type: set, vocab_size: 20, max_len: 20}, \
            {name: bag_1, type: bag, vocab_size: 20, max_len: 10}, \
            {name: bag_2, type: bag, vocab_size: 20, max_len: 10}, \
            {name: sequence_1, type: sequence, vocab_size: 20, max_len: 20}, \
            {name: sequence_2, type: sequence, vocab_size: 20, max_len: 20}, \
            {name: timeseries_1, type: timeseries, max_len: 20}, \
            {name: timeseries_2, type: timeseries, max_len: 20}, \
            {name: date_1, type: date}, \
            {name: date_2, type: date}, \
            {name: h3_1, type: h3}, \
            {name: h3_2, type: h3}, \
            {name: vector_1, type: vector}, \
            {name: vector_2, type: vector}, \
        ]",
        type=yaml.safe_load,
        help="list of features to generate in YAML format. "
        "Provide a list containing one dictionary for each feature, "
        "each dictionary must include a name, a type "
        "and can include some generation parameters depending on the type",
    )

    add_contrib_callback_args(parser)
    args = parser.parse_args(sys_argv)

    args.callbacks = args.callbacks or []
    for callback in args.callbacks:
        callback.on_cmdline("synthesize_dataset", *sys_argv)

    # No logging-level parameter yet; this is a placeholder in case one is
    # added later:
    # args.logging_level = logging_level_registry[args.logging_level]
    # logging.getLogger('ludwig').setLevel(args.logging_level)
    # global logger
    # logger = logging.getLogger('ludwig.data.dataset_synthesizer')

    print_ludwig("Synthesize Dataset", LUDWIG_VERSION)
    cli_synthesize_dataset(**vars(args))
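
# Usage sketch (illustrative feature names and parameters): override the
# default feature list to generate 200 rows with one text input and one
# category target:
#
#   cli(["-od", "synthetic.csv", "-d", "200",
#        "-f", "[{name: review, type: text, vocab_size: 30, max_len: 20}, "
#              "{name: label, type: category, vocab_size: 3}]"])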
def cli_collect_activations(sys_argv):
    """Command Line Interface to collect the tensors; several options can be
    specified when calling this function:

    --data_csv: Filepath for the input csv
    --data_hdf5: Filepath for the input hdf5 file, if there is a csv file,
         this is not read
    --d: Refers to the dataset type of the file being read, by default is
         *generic*
    --s: Refers to the split of the data, can be one of: train, test,
         validation, full
    --m: Input model that is necessary to collect the tensors, this is a
         required *option*
    --t: Tensors to collect
    --od: Output directory of the model, defaults to results
    --bs: Batch size
    --g: Number of gpus that are to be used
    --gf: Fraction of each GPU's memory to use.
    --dbg: Debug if the model is to be started with python debugger
    --v: Verbose: Defines the logging level that the user will be exposed to
    """
    parser = argparse.ArgumentParser(
        description="This script loads a pretrained model and uses it to collect "
        "tensors for each datapoint in the dataset.",
        prog="ludwig collect_activations",
        usage="%(prog)s [options]",
    )

    # ---------------
    # Data parameters
    # ---------------
    parser.add_argument("--dataset", help="input data file path", required=True)
    parser.add_argument(
        "--data_format",
        help="format of the input data",
        default="auto",
        choices=[
            "auto",
            "csv",
            "excel",
            "feather",
            "fwf",
            "hdf5",
            "html",
            "tables",
            "json",
            "jsonl",
            "parquet",
            "pickle",
            "sas",
            "spss",
            "stata",
            "tsv",
        ],
    )
    parser.add_argument(
        "-s",
        "--split",
        default=FULL,
        choices=[TRAINING, VALIDATION, TEST, FULL],
        help="the split to obtain the model activations from",
    )

    # ----------------
    # Model parameters
    # ----------------
    parser.add_argument("-m", "--model_path", help="model to load", required=True)
    parser.add_argument("-lyr", "--layers", help="tensors to collect", nargs="+", required=True)

    # -------------------------
    # Output results parameters
    # -------------------------
    parser.add_argument(
        "-od", "--output_directory", type=str, default="results", help="directory that contains the results"
    )

    # ------------------
    # Generic parameters
    # ------------------
    parser.add_argument("-bs", "--batch_size", type=int, default=128, help="size of batches")

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument("-g", "--gpus", type=int, default=0, help="list of gpus to use")
    parser.add_argument(
        "-gml", "--gpu_memory_limit", type=int, default=None, help="maximum memory in MB to allocate per GPU device"
    )
    parser.add_argument(
        "-dpt",
        "--disable_parallel_threads",
        action="store_false",
        dest="allow_parallel_threads",
        help="disable TensorFlow from using multithreading for reproducibility",
    )
    parser.add_argument(
        "-b",
        "--backend",
        help="specifies backend to use for parallel / distributed execution, "
        "defaults to local execution or Horovod if called using horovodrun",
        choices=ALL_BACKENDS,
    )
    parser.add_argument("-dbg", "--debug", action="store_true", default=False, help="enables debugging mode")
    parser.add_argument(
        "-l",
        "--logging_level",
        default="info",
        help="the level of logging to use",
        choices=["critical", "error", "warning", "info", "debug", "notset"],
    )

    add_contrib_callback_args(parser)
    args = parser.parse_args(sys_argv)

    args.callbacks = args.callbacks or []
    for callback in args.callbacks:
        callback.on_cmdline("collect_activations", *sys_argv)

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger("ludwig").setLevel(args.logging_level)
    global logger
    logger = logging.getLogger("ludwig.collect")

    print_ludwig("Collect Activations", LUDWIG_VERSION)
    collect_activations(**vars(args))
def cli(sys_argv):
    parser = argparse.ArgumentParser(
        description='This script preprocesses a dataset',
        prog='ludwig preprocess',
        usage='%(prog)s [options]')

    # ---------------
    # Data parameters
    # ---------------
    parser.add_argument(
        '--dataset',
        help='input data file path. '
             'If it has a split column, it will be used for splitting '
             '(0: train, 1: validation, 2: test), '
             'otherwise the dataset will be randomly split')
    parser.add_argument('--training_set', help='input train data file path')
    parser.add_argument('--validation_set',
                        help='input validation data file path')
    parser.add_argument('--test_set', help='input test data file path')
    parser.add_argument(
        '--training_set_metadata',
        help='input metadata JSON file path. An intermediate preprocessed file '
             'containing the mappings of the input file created '
             'the first time a file is used, in the same directory '
             'with the same name and a .json extension')
    parser.add_argument('--data_format', help='format of the input data',
                        default='auto',
                        choices=['auto', 'csv', 'excel', 'feather', 'fwf',
                                 'hdf5', 'html', 'tables', 'json', 'jsonl',
                                 'parquet', 'pickle', 'sas', 'spss', 'stata',
                                 'tsv'])

    # ----------------
    # Model parameters
    # ----------------
    preprocessing_def = parser.add_mutually_exclusive_group(required=True)
    preprocessing_def.add_argument(
        '-pc', '--preprocessing_config',
        type=yaml.safe_load,
        help='preprocessing config. '
             'Uses the same format as the config, '
             'but ignores encoder specific parameters, '
             'decoder specific parameters, combiner and training parameters')
    preprocessing_def.add_argument(
        '-pcf', '--preprocessing_config_file',
        dest='preprocessing_config',
        type=load_yaml,
        help='YAML file describing the preprocessing. '
             'Ignores --preprocessing_config. '
             'Uses the same format as the config, '
             'but ignores encoder specific parameters, '
             'decoder specific parameters, combiner and training parameters')

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument(
        '-rs', '--random_seed', type=int, default=42,
        help='a random seed that is going to be used anywhere there is a call '
             'to a random number generator: data splitting, parameter '
             'initialization and training set shuffling')
    parser.add_argument(
        '-b', '--backend',
        help='specifies backend to use for parallel / distributed execution, '
             'defaults to local execution or Horovod if called using horovodrun',
        choices=ALL_BACKENDS)
    parser.add_argument('-dbg', '--debug', action='store_true', default=False,
                        help='enables debugging mode')
    parser.add_argument(
        '-l', '--logging_level', default='info',
        help='the level of logging to use',
        choices=['critical', 'error', 'warning', 'info', 'debug', 'notset'])

    add_contrib_callback_args(parser)
    args = parser.parse_args(sys_argv)

    args.callbacks = args.callbacks or []
    for callback in args.callbacks:
        callback.on_cmdline('preprocess', *sys_argv)

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger('ludwig').setLevel(args.logging_level)
    global logger
    logger = logging.getLogger('ludwig.preprocess')

    args.backend = initialize_backend(args.backend)
    if args.backend.is_coordinator():
        print_ludwig('Preprocess', LUDWIG_VERSION)

    preprocess_cli(**vars(args))
def cli(sys_argv):
    parser = argparse.ArgumentParser(
        description="This script trains and evaluates a model", prog="ludwig experiment", usage="%(prog)s [options]"
    )

    # ----------------------------
    # Experiment naming parameters
    # ----------------------------
    parser.add_argument("--output_directory", type=str, default="results", help="directory that contains the results")
    parser.add_argument("--experiment_name", type=str, default="experiment", help="experiment name")
    parser.add_argument("--model_name", type=str, default="run", help="name for the model")

    # ---------------
    # Data parameters
    # ---------------
    parser.add_argument(
        "--dataset",
        help="input data file path. "
        "If it has a split column, it will be used for splitting "
        "(0: train, 1: validation, 2: test), "
        "otherwise the dataset will be randomly split",
    )
    parser.add_argument("--training_set", help="input train data file path")
    parser.add_argument("--validation_set", help="input validation data file path")
    parser.add_argument("--test_set", help="input test data file path")
    parser.add_argument(
        "--training_set_metadata",
        help="input metadata JSON file path. An intermediate preprocessed file "
        "containing the mappings of the input file created "
        "the first time a file is used, in the same directory "
        "with the same name and a .json extension",
    )
    parser.add_argument(
        "--data_format",
        help="format of the input data",
        default="auto",
        choices=[
            "auto",
            "csv",
            "excel",
            "feather",
            "fwf",
            "hdf5",
            "html",
            "tables",
            "json",
            "jsonl",
            "parquet",
            "pickle",
            "sas",
            "spss",
            "stata",
            "tsv",
        ],
    )
    parser.add_argument(
        "-es",
        "--eval_split",
        default=TEST,
        choices=[TRAINING, VALIDATION, TEST, FULL],
        help="the split to evaluate the model on",
    )
    parser.add_argument(
        "-sspi",
        "--skip_save_processed_input",
        help="skips saving intermediate HDF5 and JSON files",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "-ssuo",
        "--skip_save_unprocessed_output",
        help="skips saving intermediate NPY output files",
        action="store_true",
        default=False,
    )

    # -----------------
    # K-fold parameters
    # -----------------
    parser.add_argument(
        "-kf", "--k_fold", type=int, default=None, help="number of folds for a k-fold cross validation run"
    )
    parser.add_argument(
        "-skfsi",
        "--skip_save_k_fold_split_indices",
        action="store_true",
        default=False,
        help="disables saving indices generated to split training data set "
        "for the k-fold cross validation run, but if it is not needed "
        "turning it off can slightly increase the overall speed",
    )

    # ----------------
    # Model parameters
    # ----------------
    config = parser.add_mutually_exclusive_group(required=True)
    config.add_argument(
        "-c", "--config", type=load_config_from_str, help="JSON or YAML serialized string of the model configuration"
    )
    config.add_argument(
        "-cf",
        "--config_file",
        dest="config",
        type=load_yaml,
        help="Path to the YAML file containing the model configuration",
    )
    parser.add_argument("-mlp", "--model_load_path", help="path of a pretrained model to load as initialization")
    parser.add_argument("-mrp", "--model_resume_path", help="path of the model directory to resume training of")
    parser.add_argument(
        "-sstd",
        "--skip_save_training_description",
        action="store_true",
        default=False,
        help="disables saving the description JSON file",
    )
    parser.add_argument(
        "-ssts",
        "--skip_save_training_statistics",
        action="store_true",
        default=False,
        help="disables saving training statistics JSON file",
    )
    parser.add_argument(
        "-sstp",
        "--skip_save_predictions",
        help="skips saving test predictions CSV files",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "-sstes",
        "--skip_save_eval_stats",
        help="skips saving eval statistics JSON file",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "-ssm",
        "--skip_save_model",
        action="store_true",
        default=False,
        help="disables saving model weights and hyperparameters each time "
        "the model improves. "
        "By default Ludwig saves model weights after each epoch "
        "the validation metric improves, but if the model is really big "
        "that can be time consuming. If you do not want to keep "
        "the weights and just find out what performance a model can get "
        "with a set of hyperparameters, use this parameter to skip it, "
        "but the model will not be loadable later on",
    )
    parser.add_argument(
        "-ssp",
        "--skip_save_progress",
        action="store_true",
        default=False,
        help="disables saving progress each epoch. By default Ludwig saves "
        "weights and stats after each epoch for enabling resuming "
        "of training, but if the model is really big that can be "
        "time consuming and will use twice as much space, use "
        "this parameter to skip it, but training cannot be resumed "
        "later on",
    )
    parser.add_argument(
        "-ssl",
        "--skip_save_log",
        action="store_true",
        default=False,
        help="disables saving TensorBoard logs. By default Ludwig saves "
        "logs for the TensorBoard, but if it is not needed turning it off "
        "can slightly increase the overall speed",
    )

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument(
        "-rs",
        "--random_seed",
        type=int,
        default=42,
        help="a random seed that is going to be used anywhere there is a call "
        "to a random number generator: data splitting, parameter "
        "initialization and training set shuffling",
    )
    parser.add_argument("-g", "--gpus", nargs="+", type=int, default=None, help="list of GPUs to use")
    parser.add_argument(
        "-gml", "--gpu_memory_limit", type=int, default=None, help="maximum memory in MB to allocate per GPU device"
    )
    parser.add_argument(
        "-dpt",
        "--disable_parallel_threads",
        action="store_false",
        dest="allow_parallel_threads",
        help="disable TensorFlow from using multithreading for reproducibility",
    )
    parser.add_argument(
        "-b",
        "--backend",
        help="specifies backend to use for parallel / distributed execution, "
        "defaults to local execution or Horovod if called using horovodrun",
        choices=ALL_BACKENDS,
    )
    parser.add_argument("-dbg", "--debug", action="store_true", default=False, help="enables debugging mode")
    parser.add_argument(
        "-l",
        "--logging_level",
        default="info",
        help="the level of logging to use",
        choices=["critical", "error", "warning", "info", "debug", "notset"],
    )

    add_contrib_callback_args(parser)
    args = parser.parse_args(sys_argv)

    args.callbacks = args.callbacks or []
    for callback in args.callbacks:
        callback.on_cmdline("experiment", *sys_argv)

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger("ludwig").setLevel(args.logging_level)
    global logger
    logger = logging.getLogger("ludwig.experiment")

    args.backend = initialize_backend(args.backend or args.config.get("backend"))
    if args.backend.is_coordinator():
        print_ludwig("Experiment", LUDWIG_VERSION)

    if args.k_fold is None:
        experiment_cli(**vars(args))
    else:
        kfold_cross_validate_cli(**vars(args))
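
# Usage sketch (hypothetical paths): run a 5-fold cross-validation experiment;
# setting --k_fold routes execution to kfold_cross_validate_cli instead of
# experiment_cli:
#
#   cli(["--dataset", "data.csv", "-cf", "config.yaml", "-kf", "5"])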
def cli(sys_argv):
    parser = argparse.ArgumentParser(
        description="This script preprocesses a dataset", prog="ludwig preprocess", usage="%(prog)s [options]"
    )

    # ---------------
    # Data parameters
    # ---------------
    parser.add_argument(
        "--dataset",
        help="input data file path. "
        "If it has a split column, it will be used for splitting "
        "(0: train, 1: validation, 2: test), "
        "otherwise the dataset will be randomly split",
    )
    parser.add_argument("--training_set", help="input train data file path")
    parser.add_argument("--validation_set", help="input validation data file path")
    parser.add_argument("--test_set", help="input test data file path")
    parser.add_argument(
        "--training_set_metadata",
        help="input metadata JSON file path. An intermediate preprocessed file "
        "containing the mappings of the input file created "
        "the first time a file is used, in the same directory "
        "with the same name and a .json extension",
    )
    parser.add_argument(
        "--data_format",
        help="format of the input data",
        default="auto",
        choices=[
            "auto",
            "csv",
            "excel",
            "feather",
            "fwf",
            "hdf5",
            "html",
            "tables",
            "json",
            "jsonl",
            "parquet",
            "pickle",
            "sas",
            "spss",
            "stata",
            "tsv",
        ],
    )

    # ----------------
    # Model parameters
    # ----------------
    preprocessing_def = parser.add_mutually_exclusive_group(required=True)
    preprocessing_def.add_argument(
        "-pc",
        "--preprocessing_config",
        dest="preprocessing_config",
        type=load_yaml,
        help="YAML file describing the preprocessing. "
        "Ignores --preprocessing_config_str. "
        "Uses the same format as the config, "
        "but ignores encoder specific parameters, "
        "decoder specific parameters, combiner and training parameters",
    )
    preprocessing_def.add_argument(
        "-pcs",
        "--preprocessing_config_str",
        type=yaml.safe_load,
        help="preprocessing config. "
        "Uses the same format as the config, "
        "but ignores encoder specific parameters, "
        "decoder specific parameters, combiner and training parameters",
    )

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument(
        "-rs",
        "--random_seed",
        type=int,
        default=42,
        help="a random seed that is going to be used anywhere there is a call "
        "to a random number generator: data splitting, parameter "
        "initialization and training set shuffling",
    )
    parser.add_argument(
        "-b",
        "--backend",
        help="specifies backend to use for parallel / distributed execution, "
        "defaults to local execution or Horovod if called using horovodrun",
        choices=ALL_BACKENDS,
    )
    parser.add_argument(
        "-l",
        "--logging_level",
        default="info",
        help="the level of logging to use",
        choices=["critical", "error", "warning", "info", "debug", "notset"],
    )

    add_contrib_callback_args(parser)
    args = parser.parse_args(sys_argv)

    args.callbacks = args.callbacks or []
    for callback in args.callbacks:
        callback.on_cmdline("preprocess", *sys_argv)

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger("ludwig").setLevel(args.logging_level)
    global logger
    logger = logging.getLogger("ludwig.preprocess")

    args.backend = initialize_backend(args.backend)
    if args.backend.is_coordinator():
        print_ludwig("Preprocess", LUDWIG_VERSION)

    preprocess_cli(**vars(args))
def cli_export_torchscript(sys_argv):
    parser = argparse.ArgumentParser(
        description="This script loads a pretrained model and saves it as torchscript.",
        prog="ludwig export_torchscript",
        usage="%(prog)s [options]",
    )

    # ----------------
    # Model parameters
    # ----------------
    parser.add_argument("-m", "--model_path", help="model to load", required=True)
    parser.add_argument(
        "-mo",
        "--model_only",
        help="Script and export the model only.",
        action="store_true",
    )
    parser.add_argument(
        "-d",
        "--device",
        type=str,
        help=(
            'Device to use for torchscript tracing (e.g. "cuda" or "cpu"). Ideally, this is the same as the device '
            "used when the model is loaded."
        ),
        default=None,
    )

    # -----------------
    # Output parameters
    # -----------------
    parser.add_argument("-od", "--output_path", type=str, help="path where to save the exported model", required=True)

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument(
        "-l",
        "--logging_level",
        default="info",
        help="the level of logging to use",
        choices=["critical", "error", "warning", "info", "debug", "notset"],
    )

    add_contrib_callback_args(parser)
    args = parser.parse_args(sys_argv)

    args.callbacks = args.callbacks or []
    for callback in args.callbacks:
        callback.on_cmdline("export_torchscript", *sys_argv)

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger("ludwig").setLevel(args.logging_level)
    global logger
    logger = logging.getLogger("ludwig.export")

    print_ludwig("Export Torchscript", LUDWIG_VERSION)
    export_torchscript(**vars(args))
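
# Usage sketch (hypothetical paths): export a trained model to torchscript for
# CPU inference:
#
#   cli_export_torchscript(["-m", "results/experiment_run/model",
#                           "-od", "torchscript_model", "-d", "cpu"])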