def __init__(self, model_definition=None, model_definition_file=None,
             logging_level=logging.ERROR):
    """Initialize the model wrapper from exactly one definition source.

    :param model_definition: (dict) in-memory model definition. It is
        deep-copied and the copy is merged with the defaults.
    :param model_definition_file: (string) path to a YAML file holding
        the model definition.
    :param logging_level: level handed to `set_logging_level`.
    :raises ValueError: if neither or both of `model_definition` and
        `model_definition_file` are provided.
    """
    # Exactly one of the two sources must be given.
    if model_definition is None and model_definition_file is None:
        raise ValueError(
            'Either model_definition or model_definition_file has to be '
            'not None to initialize a LudwigModel')
    if model_definition is not None and model_definition_file is not None:
        raise ValueError('Only one between model_definition and '
                         'model_definition_file can be provided')

    self.set_logging_level(logging_level)

    if model_definition_file is not None:
        with open(model_definition_file, 'r') as def_file:
            self.model_definition = merge_with_defaults(
                yaml.safe_load(def_file))
    else:
        # merge_with_defaults receives a private copy of the caller's dict
        model_definition_copy = copy.deepcopy(model_definition)
        self.model_definition = merge_with_defaults(model_definition_copy)

    self.train_set_metadata = None
    self.model = None
    self.exp_dir_name = ''
def test_invalid_trainer_type(model_type):
    """Merging a config with an unknown trainer type raises ValidationError."""
    trainer_section = {"type": "invalid_trainer"}
    config = {
        INPUT_FEATURES: [category_feature()],
        OUTPUT_FEATURES: [category_feature()],
        MODEL_TYPE: model_type,
        "trainer": trainer_section,
    }

    with pytest.raises(ValidationError):
        merge_with_defaults(config)
def __init__(self, model_definition, model_definition_file=None,
             logging_level=logging.ERROR):
    """Initialize the model wrapper from a dict or a YAML file.

    :param model_definition: (dict) in-memory model definition. Ignored
        when `model_definition_file` is given.
    :param model_definition_file: (string) path to a YAML file holding
        the model definition; takes precedence over `model_definition`.
    :param logging_level: level set on the root logger.
    """
    # NOTE: this configures the root logger, not a package-specific one.
    logging.getLogger().setLevel(logging_level)

    if model_definition_file is not None:
        with open(model_definition_file, 'r') as def_file:
            # safe_load instead of bare yaml.load: the bare call can
            # construct arbitrary Python objects from the file and, on
            # PyYAML >= 6, fails because the Loader argument is required.
            self.model_definition = merge_with_defaults(
                yaml.safe_load(def_file))
    else:
        self.model_definition = merge_with_defaults(model_definition)

    self.train_set_metadata = None
    self.model = None
def __init__(self, model_definition, model_definition_file=None,
             logging_level=logging.ERROR):
    """Initialize the model wrapper from a dict or a YAML file.

    :param model_definition: (dict) in-memory model definition; a deep
        copy of it is merged with the defaults. Ignored when
        `model_definition_file` is given.
    :param model_definition_file: (string) path to a YAML model
        definition; takes precedence over `model_definition`.
    :param logging_level: level set on the 'ludwig' logger.
    """
    ludwig_logger = logging.getLogger('ludwig')
    ludwig_logger.setLevel(logging_level)

    if model_definition_file is None:
        raw_definition = copy.deepcopy(model_definition)
    else:
        with open(model_definition_file, 'r') as def_file:
            raw_definition = yaml.safe_load(def_file)
    self.model_definition = merge_with_defaults(raw_definition)

    self.train_set_metadata = None
    self.model = None
    self.exp_dir_name = None
def test_global_default_parameters_merge_with_defaults(csv_filename):
    """Check the DEFAULTS section produced by merge_with_defaults."""
    config, _ = _prepare_data(csv_filename)
    merged = merge_with_defaults(config)

    assert DEFAULTS in merged
    defaults_section = merged[DEFAULTS]

    # The global preprocessing section must not be keyed by feature type.
    known_input_types = set(input_type_registry)
    for key in merged[PREPROCESSING]:
        assert key not in known_input_types

    # One defaults entry with preprocessing per registered input type.
    types_with_preprocessing = [
        feature_type
        for feature_type in defaults_section
        if PREPROCESSING in defaults_section[feature_type]
    ]
    assert len(types_with_preprocessing) == len(known_input_types)

    # Each input feature's encoder must equal its type's default encoder.
    for input_feature in merged[INPUT_FEATURES]:
        type_defaults = defaults_section[input_feature[TYPE]]
        assert input_feature[ENCODER] == type_defaults[ENCODER][TYPE]

    # Likewise for the decoder of the first output feature.
    first_output = merged[OUTPUT_FEATURES][0]
    output_defaults = defaults_section[first_output[TYPE]]
    assert first_output[DECODER] == output_defaults[DECODER][TYPE]
def _setup_ludwig_config(dataset_fp: str) -> Tuple[Dict, str]:
    """Generate a dataset at `dataset_fp` and return (merged config, path).

    The config has a text and a category input, one category output, a
    concat combiner and a two-epoch trainer section.
    """
    utterance = text_feature(name="utterance", reduce_output="sum")
    input_features = [utterance, category_feature(vocab_size=3)]
    output_features = [category_feature(vocab_size=3)]

    rel_path = generate_data(input_features, output_features, dataset_fp)

    merged_config = merge_with_defaults({
        INPUT_FEATURES: input_features,
        OUTPUT_FEATURES: output_features,
        COMBINER: {"type": "concat", "num_fc_layers": 2},
        TRAINER: {"epochs": 2, "learning_rate": 0.001},
    })
    return merged_config, rel_path
def test_merge_with_defaults_early_stop(use_train, use_hyperopt_scheduler):
    """early_stop is -1 with a hyperopt scheduler, the trainer default otherwise."""
    all_input_features = [
        binary_feature(),
        category_feature(),
        number_feature(),
        text_feature(),
    ]
    all_output_features = [
        category_feature(),
        sequence_feature(),
        vector_feature(),
    ]

    # Deep copy so the edits below never touch the shared HYPEROPT_CONFIG.
    config = copy.deepcopy({
        INPUT_FEATURES: all_input_features,
        OUTPUT_FEATURES: all_output_features,
        HYPEROPT: HYPEROPT_CONFIG,
    })

    if use_train:
        config[TRAINER] = {"batch_size": 42}

    if use_hyperopt_scheduler:
        # hyperopt scheduler cannot be used with early stopping
        config[HYPEROPT][EXECUTOR][SCHEDULER] = SCHEDULER_DICT

    merged_config = merge_with_defaults(config)

    if use_hyperopt_scheduler:
        expected = -1
    else:
        expected = ECDTrainerConfig().early_stop
    assert merged_config[TRAINER]["early_stop"] == expected
def test_validate_with_preprocessing_defaults():
    """A config built from each mixin's preprocessing defaults validates
    both before and after merge_with_defaults."""
    destination = "/tmp/destination_folder"
    input_features = [
        audio_feature(destination, preprocessing=AudioFeatureMixin.preprocessing_defaults),
        bag_feature(preprocessing=BagFeatureMixin.preprocessing_defaults),
        binary_feature(preprocessing=BinaryFeatureMixin.preprocessing_defaults),
        category_feature(preprocessing=CategoryFeatureMixin.preprocessing_defaults),
        date_feature(preprocessing=DateFeatureMixin.preprocessing_defaults),
        h3_feature(preprocessing=H3FeatureMixin.preprocessing_defaults),
        image_feature(destination, preprocessing=ImageFeatureMixin.preprocessing_defaults),
        numerical_feature(preprocessing=NumericalFeatureMixin.preprocessing_defaults),
        sequence_feature(preprocessing=SequenceFeatureMixin.preprocessing_defaults),
        set_feature(preprocessing=SetFeatureMixin.preprocessing_defaults),
        text_feature(preprocessing=TextFeatureMixin.preprocessing_defaults),
        timeseries_feature(preprocessing=TimeseriesFeatureMixin.preprocessing_defaults),
        vector_feature(preprocessing=VectorFeatureMixin.preprocessing_defaults),
    ]
    training_section = {
        "decay": True,
        "learning_rate": 0.001,
        "validation_field": "target",
        "validation_metric": "accuracy",
    }
    config = {
        "input_features": input_features,
        "output_features": [{"name": "target", "type": "category"}],
        "training": training_section,
    }

    # The raw config and its merged form must both pass validation.
    validate_config(config)
    config = merge_with_defaults(config)
    validate_config(config)
def test_merge_with_defaults_early_stop(use_train, use_hyperopt_scheduler):
    """early_stop is -1 with a hyperopt scheduler, default_early_stop otherwise."""
    all_input_features = [
        binary_feature(),
        category_feature(),
        numerical_feature(),
        text_feature(),
    ]
    all_output_features = [
        category_feature(),
        sequence_feature(),
        vector_feature(),
    ]

    # Deep copy so the edits below never touch the shared HYPEROPT_CONFIG.
    config = copy.deepcopy({
        "input_features": all_input_features,
        "output_features": all_output_features,
        HYPEROPT: HYPEROPT_CONFIG,
    })

    if use_train:
        config[TRAINING] = {"batch_size": "42"}

    if use_hyperopt_scheduler:
        # hyperopt scheduler cannot be used with early stopping
        config[HYPEROPT]["sampler"]["scheduler"] = SCHEDULER

    merged_config = merge_with_defaults(config)

    if use_hyperopt_scheduler:
        expected = -1
    else:
        expected = default_early_stop
    assert merged_config[TRAINING]["early_stop"] == expected
def test_missing_outputs_drop_rows():
    """A category-wide FILL_WITH_MODE default applies to the input feature,
    while the output feature still resolves to DROP_ROW."""
    category_defaults = {PREPROCESSING: {MISSING_VALUE_STRATEGY: FILL_WITH_MODE}}
    config = {
        INPUT_FEATURES: [category_feature()],
        OUTPUT_FEATURES: [category_feature()],
        DEFAULTS: {CATEGORY: category_defaults},
    }

    merged_config = merge_with_defaults(config)

    defaults_section = merged_config[DEFAULTS]
    input_cfg = merged_config[INPUT_FEATURES][0]
    output_cfg = merged_config[OUTPUT_FEATURES][0]

    # The output feature itself carries DROP_ROW ...
    assert output_cfg[PREPROCESSING][MISSING_VALUE_STRATEGY] == DROP_ROW

    # ... and keeps it once layered over its type defaults.
    output_preprocessing = merge_dict(
        defaults_section[output_cfg[TYPE]][PREPROCESSING],
        output_cfg[PREPROCESSING],
    )
    assert output_preprocessing[MISSING_VALUE_STRATEGY] == DROP_ROW

    # The type defaults used for the input keep the configured strategy.
    input_preprocessing = defaults_section[input_cfg[TYPE]][PREPROCESSING]
    assert input_preprocessing[MISSING_VALUE_STRATEGY] == FILL_WITH_MODE
def test_deprecated_field_aliases():
    """Deprecated names ('numerical', 'training') are rewritten by the merge."""
    learning_rate_space = {
        "space": "loguniform",
        "lower": 0.001,
        "upper": 0.1,
    }
    config = {
        "input_features": [{"name": "num_in", "type": "numerical"}],
        "output_features": [{"name": "num_out", "type": "numerical"}],
        "training": {
            "epochs": 2,
        },
        "hyperopt": {
            "parameters": {
                "training.learning_rate": learning_rate_space,
            },
            "goal": "minimize",
        },
    }

    merged_config = merge_with_defaults(config)

    # 'numerical' features come back typed as NUMBER.
    for section in ("input_features", "output_features"):
        assert merged_config[section][0][TYPE] == NUMBER

    # The 'training' section is moved under TRAINER.
    assert "training" not in merged_config
    assert merged_config[TRAINER]["epochs"] == 2

    # Hyperopt parameter keys follow the same rename.
    hparams = merged_config[HYPEROPT]["parameters"]
    assert "training.learning_rate" not in hparams
    assert "trainer.learning_rate" in hparams
def get_preprocessing_params(model_definition):
    """Return the preprocessing parameters that apply to each feature.

    The definition is merged with defaults, then its global
    'preprocessing' section is layered over
    `default_preprocessing_parameters`. A feature's own 'preprocessing'
    entry, when present, is layered over the entry for its type.

    :param model_definition: (dict) model definition.
    :return: list of `(name, type, preprocessing_parameters)` tuples,
        input features first, then output features.
    """
    merged_definition = merge_with_defaults(model_definition)
    configured_globals = merged_definition['preprocessing']
    all_features = (
        merged_definition['input_features']
        + merged_definition['output_features']
    )
    global_params = merge_dict(
        default_preprocessing_parameters, configured_globals)

    def _entry(feature):
        # Start from the parameters registered for the feature's type.
        params = global_params[feature['type']]
        if 'preprocessing' in feature:
            params = merge_dict(params, feature['preprocessing'])
        return feature['name'], feature['type'], params

    return [_entry(feature) for feature in all_features]
def test_train_online(data_csv, model_definition, batch_size=128, debug=False,
                      logging_level=logging.ERROR, **kwargs):
    """Train a model online for two passes over `data_csv`, then predict.

    :param data_csv: path of the CSV used for metadata, training and
        prediction.
    :param model_definition: (dict) model definition; merged with
        defaults before use.
    :param batch_size: batch size used for both online training passes
        and for prediction.
    :param debug: unused; kept for interface compatibility.
    :param logging_level: logging level passed to `LudwigModel`.
    :param kwargs: unused; kept for interface compatibility.
    """
    model_definition = merge_with_defaults(model_definition)
    all_features = (model_definition['input_features'] +
                    model_definition['output_features'])
    # Only the metadata is needed here; the built dataset is discarded.
    _, train_set_metadata = build_dataset(
        data_csv, all_features, model_definition['preprocessing'])

    ludwig_model = LudwigModel(model_definition, logging_level=logging_level)
    try:
        ludwig_model.initialize_model(train_set_metadata=train_set_metadata)

        # Two passes, honouring the caller's batch size (it used to be
        # hard-coded to 128, silently ignoring the parameter).
        for _ in range(2):
            ludwig_model.train_online(
                data_csv=data_csv,
                batch_size=batch_size,
            )

        # predict
        predictions = ludwig_model.predict(
            data_csv=data_csv,
            batch_size=batch_size,
        )
    finally:
        # Release the model even if training or prediction failed.
        ludwig_model.close()

    logger.critical(predictions)
def run_hyperopt_executor(
    sampler,
    executor,
    csv_filename,
    ray_mock_dir,
    validate_output_feature=False,
    validation_metric=None,
):
    """Run a mocked Ray Tune hyperopt execution on a generated dataset.

    The incoming `csv_filename` is replaced by `dataset.csv` inside
    `ray_mock_dir`; the data is preprocessed with a Ray backend before
    the executor is invoked.
    """
    with ray_start_4_cpus():
        config = _get_config(sampler, executor)

        # The dataset always lives in the mock directory.
        csv_filename = os.path.join(ray_mock_dir, "dataset.csv")
        dataset_csv = generate_data(
            config["input_features"],
            config["output_features"],
            csv_filename,
            num_examples=100,
        )
        dataset_parquet = create_data_set_to_use("parquet", dataset_csv)

        config = merge_with_defaults(config)
        hyperopt_config = config["hyperopt"]

        if validate_output_feature:
            first_output = config["output_features"][0]
            hyperopt_config["output_feature"] = first_output["name"]
        if validation_metric:
            hyperopt_config["validation_metric"] = validation_metric

        update_hyperopt_params_with_defaults(hyperopt_config)

        parameters = hyperopt_config["parameters"]
        search_alg_type = sampler.get("search_alg", {}).get("type", "")
        if search_alg_type == "bohb":
            # bohb does not support grid_search search space
            del parameters["combiner.num_steps"]

        split = hyperopt_config["split"]
        output_feature = hyperopt_config["output_feature"]
        metric = hyperopt_config["metric"]
        goal = hyperopt_config["goal"]

        build_sampler = get_build_hyperopt_sampler(sampler["type"])
        hyperopt_sampler = build_sampler(goal, parameters, **sampler)

        # preprocess
        backend = RayBackend(**RAY_BACKEND_KWARGS)
        model = LudwigModel(config=config, backend=backend)
        preprocessed = model.preprocess(
            dataset=dataset_parquet,
        )
        training_set, validation_set, test_set, training_set_metadata = preprocessed

        # hyperopt
        hyperopt_executor = MockRayTuneExecutor(
            hyperopt_sampler, output_feature, metric, split, **executor)
        hyperopt_executor.mock_path = os.path.join(ray_mock_dir, "bucket")

        hyperopt_executor.execute(
            config,
            training_set=training_set,
            validation_set=validation_set,
            test_set=test_set,
            training_set_metadata=training_set_metadata,
            backend=backend,
            output_directory=ray_mock_dir,
            skip_save_processed_input=True,
            skip_save_unprocessed_output=True,
        )
def test_hyperopt_executor(sampler, executor, csv_filename,
                           validate_output_feature=False,
                           validation_metric=None):
    """Build sampler and executor for the given settings and run them."""
    fiber_grid = executor['type'] == 'fiber' and sampler['type'] == 'grid'
    if fiber_grid:
        # This test is very slow and doesn't give us additional coverage
        pytest.skip('Skipping Fiber grid search')

    utterance = text_feature(
        name="utterance", cell_type="lstm", reduce_output="sum")
    input_features = [
        utterance,
        category_feature(vocab_size=2, reduce_input="sum"),
    ]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    rel_path = generate_data(input_features, output_features, csv_filename)

    config = merge_with_defaults({
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "num_fc_layers": 2},
        "training": {"epochs": 2, "learning_rate": 0.001},
    })

    # Work on a copy so the overrides below stay local to this run.
    hyperopt_config = HYPEROPT_CONFIG.copy()
    if validate_output_feature:
        hyperopt_config['output_feature'] = output_features[0]['name']
    if validation_metric:
        hyperopt_config['validation_metric'] = validation_metric

    update_hyperopt_params_with_defaults(hyperopt_config)

    parameters = hyperopt_config["parameters"]
    split = hyperopt_config["split"]
    output_feature = hyperopt_config["output_feature"]
    metric = hyperopt_config["metric"]
    goal = hyperopt_config["goal"]

    build_sampler = get_build_hyperopt_sampler(sampler["type"])
    hyperopt_sampler = build_sampler(goal, parameters, **sampler)

    build_executor = get_build_hyperopt_executor(executor["type"])
    hyperopt_executor = build_executor(
        hyperopt_sampler, output_feature, metric, split, **executor)

    hyperopt_executor.execute(config, dataset=rel_path,
                              gpus=get_available_gpus_cuda_string())
def test_default_model_type():
    """A config that names no model type is merged to MODEL_ECD."""
    features_only = {
        INPUT_FEATURES: [category_feature()],
        OUTPUT_FEATURES: [category_feature()],
    }

    merged_config = merge_with_defaults(features_only)

    assert merged_config[MODEL_TYPE] == MODEL_ECD
def memory_tune_config(config, dataset):
    """Shrink the hyperopt search space until the model fits in memory.

    Each round estimates memory use for the current search space. If it
    is below the machine's memory the search stops; otherwise the first
    remaining tunable parameter is narrowed (the last category of a
    "choice" space is dropped, any other space has its upper bound
    lowered by 10% of its range). A parameter is discarded once it
    cannot be narrowed without passing its minimum value.

    :param config: (dict) user config with a HYPEROPT "parameters"
        section. It is not modified; a deep copy is returned.
    :param dataset: dataset passed to `get_trainingset_metadata`.
    :return: tuple `(modified_config, fits_in_memory)`. `fits_in_memory`
        is False when every tunable parameter was exhausted without the
        estimate dropping below the available memory.
    """
    fits_in_memory = False
    raw_config = merge_with_defaults(config)
    training_set_metadata = get_trainingset_metadata(raw_config, dataset)
    search_space = copy.deepcopy(raw_config[HYPEROPT]["parameters"])
    # Maps tunable parameter name -> minimum value it may be reduced to.
    params_to_modify = RANKED_MODIFIABLE_PARAM_LIST[get_model_name(raw_config)]
    param_list = list(params_to_modify.keys())
    current_param_values = {}
    max_memory = get_machine_memory()

    while True:
        # compute memory utilization
        current_param_values = get_new_params(
            current_param_values, search_space, params_to_modify)
        temp_config = sub_new_params(raw_config, current_param_values)
        if compute_memory_usage(temp_config, training_set_metadata) < max_memory:
            fits_in_memory = True
            break

        if not param_list:
            # Nothing left to shrink. The former `while param_list is not
            # None` test never became false, so this case crashed with an
            # IndexError on param_list[0] instead of reporting failure.
            break

        # check if we have exhausted tuning of current param
        # (e.g. we can no longer reduce the param value)
        param = param_list[0]
        min_value = params_to_modify[param]

        if param not in search_space:
            param_list.pop(0)  # param not in hyperopt search space
            continue

        space = search_space[param]
        if space["space"] == "choice":
            categories = space["categories"]
            if len(categories) > 2 and categories[-2] > min_value:
                space["categories"] = categories[:-1]
            else:
                param_list.pop(0)  # exhausted reduction of this parameter
        else:
            # reduce by 10%
            upper_bound, lower_bound = space["upper"], space["lower"]
            reduction_val = (upper_bound - lower_bound) * 0.1
            new_upper_bound = upper_bound - reduction_val
            if new_upper_bound > lower_bound and new_upper_bound > min_value:
                space["upper"] = new_upper_bound
            else:
                param_list.pop(0)  # exhausted reduction of this parameter

    modified_config = copy.deepcopy(config)
    modified_config[HYPEROPT]["parameters"] = search_space
    return modified_config, fits_in_memory
def test_deprecated_field_aliases():
    """Deprecated feature types, sections and hyperopt fields are upgraded."""
    hyperopt_section = {
        "parameters": {
            "training.learning_rate": {
                "space": "loguniform",
                "lower": 0.001,
                "upper": 0.1,
            },
        },
        "goal": "minimize",
        "sampler": {
            "type": "grid",
            "num_samples": 2,
            "scheduler": {"type": "fifo"},
        },
        "executor": {
            "type": "grid",
            "search_alg": "bohb",
        },
    }
    config = {
        INPUT_FEATURES: [{"name": "num_in", "type": "numerical"}],
        OUTPUT_FEATURES: [{"name": "num_out", "type": "numerical"}],
        "training": {
            "epochs": 2,
            "eval_batch_size": 0,
        },
        HYPEROPT: hyperopt_section,
    }

    merged_config = merge_with_defaults(config)

    # 'numerical' features come back typed as NUMBER.
    for section in ("input_features", "output_features"):
        assert merged_config[section][0][TYPE] == NUMBER

    # 'training' is moved under TRAINER; eval_batch_size 0 becomes None.
    assert "training" not in merged_config
    trainer_section = merged_config[TRAINER]
    assert trainer_section["epochs"] == 2
    assert trainer_section[EVAL_BATCH_SIZE] is None

    # Hyperopt parameter keys follow the training -> trainer rename.
    merged_hyperopt = merged_config[HYPEROPT]
    hparams = merged_hyperopt["parameters"]
    assert "training.learning_rate" not in hparams
    assert "trainer.learning_rate" in hparams

    # The sampler section is folded into a 'ray' executor.
    assert "sampler" not in merged_hyperopt
    merged_executor = merged_hyperopt["executor"]
    assert merged_executor["type"] == "ray"
    assert "num_samples" in merged_executor
    assert "scheduler" in merged_executor
def __init__(self,
             model_definition,
             logging_level=logging.ERROR,
             use_horovod=None,
             gpus=None,
             gpu_memory_limit=None,
             allow_parallel_threads=True,
             random_seed=default_random_seed):
    """
    :param model_definition: (dict, string) in-memory representation of
           the model definition, or string path to a model definition
           file that is read with `yaml.safe_load`.
    :param logging_level: level handed to `set_logging_level`.
    :param use_horovod: (bool) use Horovod for distributed training.
           Passed to `configure_horovod`.
    :param gpus: (string, default: `None`) list of GPUs to use (it uses
           the same syntax of CUDA_VISIBLE_DEVICES)
    :param gpu_memory_limit: (int: default: `None`) maximum memory in MB
           to allocate per GPU device.
    :param allow_parallel_threads: (bool, default: `True`) allow
           TensorFlow to use multithreading parallelism to improve
           performance at the cost of determinism.
    :param random_seed: accepted but not used by this constructor yet
           (see the todo below).
    """
    # A string is taken to be a path; anything else is deep-copied.
    given_as_path = isinstance(model_definition, str)
    if given_as_path:
        with open(model_definition, 'r') as def_file:
            model_definition_dict = yaml.safe_load(def_file)
    else:
        model_definition_dict = copy.deepcopy(model_definition)
    self.model_definition_fp = model_definition if given_as_path else None

    # merge model definition with defaults
    self.model_definition = merge_with_defaults(model_definition_dict)

    # horovod first, then logging, then TensorFlow
    self._horovod = configure_horovod(use_horovod)
    self.set_logging_level(logging_level)
    initialize_tensorflow(
        gpus,
        gpu_memory_limit,
        allow_parallel_threads,
        self._horovod,
    )

    # todo refactoring: decide where to seed TensorFlow
    # (tf.random.set_seed(random_seed)), here or at the beginning of
    # training. Either way it must run before the model is initialized.

    # model state, filled in later
    self.model = None
    self.training_set_metadata = None

    # online training state
    self._online_trainer = None
def test_hyperopt_executor(sampler, executor, csv_filename,
                           validate_output_feature=False,
                           validation_metric=None):
    """Build sampler and executor for the given settings and run them."""
    utterance = text_feature(
        name="utterance", cell_type="lstm", reduce_output="sum")
    input_features = [
        utterance,
        category_feature(vocab_size=2, reduce_input="sum"),
    ]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    rel_path = generate_data(input_features, output_features, csv_filename)

    model_definition = merge_with_defaults({
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "num_fc_layers": 2},
        "training": {"epochs": 2, "learning_rate": 0.001},
    })

    # Work on a copy so the overrides below stay local to this run.
    hyperopt_config = HYPEROPT_CONFIG.copy()
    if validate_output_feature:
        hyperopt_config['output_feature'] = output_features[0]['name']
    if validation_metric:
        hyperopt_config['validation_metric'] = validation_metric

    update_hyperopt_params_with_defaults(hyperopt_config)

    parameters = hyperopt_config["parameters"]
    split = hyperopt_config["split"]
    output_feature = hyperopt_config["output_feature"]
    metric = hyperopt_config["metric"]
    goal = hyperopt_config["goal"]

    build_sampler = get_build_hyperopt_sampler(sampler["type"])
    hyperopt_sampler = build_sampler(goal, parameters, **sampler)

    build_executor = get_build_hyperopt_executor(executor["type"])
    hyperopt_executor = build_executor(
        hyperopt_sampler, output_feature, metric, split, **executor)

    hyperopt_executor.execute(model_definition, dataset=rel_path,
                              gpus=get_available_gpus_cuda_string())
def test_default_trainer_type(model_trainer_type):
    """Each model type is merged to its expected default trainer type.

    `model_trainer_type` is a `(model_type, expected_trainer_type)` pair.
    """
    model_type, expected_trainer_type = model_trainer_type
    config = {
        INPUT_FEATURES: [category_feature()],
        OUTPUT_FEATURES: [category_feature()],
        MODEL_TYPE: model_type,
    }

    trainer_section = merge_with_defaults(config)[TRAINER]

    assert trainer_section[TYPE] == expected_trainer_type
def test_hyperopt_executor(sampler, executor, csv_filename,
                           validate_output_feature=False,
                           validation_metric=None):
    """Build sampler and executor for the given settings and run them on
    every CUDA device torch reports."""
    utterance = text_feature(
        name="utterance", cell_type="lstm", reduce_output="sum")
    input_features = [
        utterance,
        category_feature(vocab_size=2, reduce_input="sum"),
    ]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    rel_path = generate_data(input_features, output_features, csv_filename)

    config = merge_with_defaults({
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "num_fc_layers": 2},
        TRAINER: {"epochs": 2, "learning_rate": 0.001},
    })

    # Work on a copy so the overrides below stay local to this run.
    hyperopt_config = HYPEROPT_CONFIG.copy()
    if validate_output_feature:
        hyperopt_config["output_feature"] = output_features[0]["name"]
    if validation_metric:
        hyperopt_config["validation_metric"] = validation_metric

    update_hyperopt_params_with_defaults(hyperopt_config)

    parameters = hyperopt_config["parameters"]
    split = hyperopt_config["split"]
    output_feature = hyperopt_config["output_feature"]
    metric = hyperopt_config["metric"]
    goal = hyperopt_config["goal"]

    build_sampler = get_build_hyperopt_sampler(sampler["type"])
    hyperopt_sampler = build_sampler(goal, parameters, **sampler)

    build_executor = get_build_hyperopt_executor(executor["type"])
    hyperopt_executor = build_executor(
        hyperopt_sampler, output_feature, metric, split, **executor)

    # Indices of all visible CUDA devices (empty list when there are none).
    gpus = list(range(torch.cuda.device_count()))
    hyperopt_executor.execute(config, dataset=rel_path, gpus=gpus)
def test_config_features():
    """Validate a config with every feature type, then check that each
    input-only type is rejected as an output feature."""
    destination = "/tmp/destination_folder"
    all_input_features = [
        audio_feature(destination),
        bag_feature(),
        binary_feature(),
        category_feature(),
        date_feature(),
        h3_feature(),
        image_feature(destination),
        number_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        timeseries_feature(),
        vector_feature(),
    ]
    all_output_features = [
        binary_feature(),
        category_feature(),
        number_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        vector_feature(),
    ]

    # validate config with all features
    full_config = {
        "input_features": all_input_features,
        "output_features": all_output_features,
    }
    validate_config(full_config)

    # make sure all defaults provided also registers as valid
    validate_config(merge_with_defaults(full_config))

    # test various invalid output features
    input_only_features = [
        candidate
        for candidate in all_input_features
        if candidate["type"] not in output_type_registry.keys()
    ]
    for input_feature in input_only_features:
        dtype = input_feature["type"]
        invalid_config = {
            "input_features": all_input_features,
            "output_features": [*all_output_features, input_feature],
        }
        expected_message = rf"^'{dtype}' is not one of .*"
        with pytest.raises(ValidationError, match=expected_message):
            validate_config(invalid_config)
def test_config_features():
    """Validate a config with every feature type, then check that each
    input-only type is rejected as an output feature."""
    destination = '/tmp/destination_folder'
    all_input_features = [
        audio_feature(destination),
        bag_feature(),
        binary_feature(),
        category_feature(),
        date_feature(),
        h3_feature(),
        image_feature(destination),
        numerical_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        timeseries_feature(),
        vector_feature(),
    ]
    all_output_features = [
        binary_feature(),
        category_feature(),
        numerical_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        vector_feature(),
    ]

    # validate config with all features
    full_config = {
        'input_features': all_input_features,
        'output_features': all_output_features,
    }
    validate_config(full_config)

    # make sure all defaults provided also registers as valid
    validate_config(merge_with_defaults(full_config))

    # test various invalid output features
    input_only_features = [
        candidate
        for candidate in all_input_features
        if candidate['type'] not in OUTPUT_FEATURE_TYPES
    ]
    for input_feature in input_only_features:
        dtype = input_feature['type']
        invalid_config = {
            'input_features': all_input_features,
            'output_features': [*all_output_features, input_feature],
        }
        expected_message = rf"^'{dtype}' is not one of .*"
        with pytest.raises(ValidationError, match=expected_message):
            validate_config(invalid_config)
def run_hyperopt_executor(sampler, executor, csv_filename,
                          validate_output_feature=False,
                          validation_metric=None):
    """Generate data, build the sampler/executor pair and execute it.

    The hyperopt section is HYPEROPT_CONFIG with the given `executor`
    and `sampler` layered on top.
    """
    utterance = text_feature(
        name="utterance", cell_type="lstm", reduce_output="sum")
    input_features = [
        utterance,
        category_feature(vocab_size=2, reduce_input="sum"),
    ]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    rel_path = generate_data(input_features, output_features, csv_filename)

    hyperopt_section = {**HYPEROPT_CONFIG}
    hyperopt_section["executor"] = executor
    hyperopt_section["sampler"] = sampler

    config = merge_with_defaults({
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "num_fc_layers": 2},
        "training": {"epochs": 2, "learning_rate": 0.001},
        "hyperopt": hyperopt_section,
    })

    hyperopt_config = config["hyperopt"]
    if validate_output_feature:
        hyperopt_config['output_feature'] = output_features[0]['name']
    if validation_metric:
        hyperopt_config['validation_metric'] = validation_metric

    update_hyperopt_params_with_defaults(hyperopt_config)

    parameters = hyperopt_config["parameters"]
    search_alg_type = sampler.get("search_alg", {}).get("type", "")
    if search_alg_type == 'bohb':
        # bohb does not support grid_search search space
        del parameters['utterance.cell_type']

    split = hyperopt_config["split"]
    output_feature = hyperopt_config["output_feature"]
    metric = hyperopt_config["metric"]
    goal = hyperopt_config["goal"]

    build_sampler = get_build_hyperopt_sampler(sampler["type"])
    hyperopt_sampler = build_sampler(goal, parameters, **sampler)

    build_executor = get_build_hyperopt_executor(executor["type"])
    hyperopt_executor = build_executor(
        hyperopt_sampler, output_feature, metric, split, **executor)

    hyperopt_executor.execute(config, dataset=rel_path)
def run_hyperopt_executor(
    search_alg,
    executor,
    csv_filename,
    tmpdir,
    validate_output_feature=False,
    validation_metric=None,
    use_split=True,
):
    """Generate data, build the executor and run it with the local backend.

    With `use_split=False` the generated CSV is rewritten with a "split"
    column set to 0 on every row.
    """
    config = _get_config(search_alg, executor)
    rel_path = generate_data(
        config["input_features"], config["output_features"], csv_filename)

    if not use_split:
        frame = pd.read_csv(rel_path)
        frame["split"] = 0
        frame.to_csv(rel_path)

    config = merge_with_defaults(config)
    hyperopt_config = config["hyperopt"]

    if validate_output_feature:
        first_output = config["output_features"][0]
        hyperopt_config["output_feature"] = first_output["name"]
    if validation_metric:
        hyperopt_config["validation_metric"] = validation_metric

    update_hyperopt_params_with_defaults(hyperopt_config)

    parameters = hyperopt_config["parameters"]
    uses_bohb = search_alg.get("type", "") == "bohb"
    if uses_bohb:
        # bohb does not support grid_search search space
        del parameters["utterance.cell_type"]
        hyperopt_config["parameters"] = parameters

    split = hyperopt_config["split"]
    output_feature = hyperopt_config["output_feature"]
    metric = hyperopt_config["metric"]
    goal = hyperopt_config["goal"]
    # From here on use the search_alg section of the merged config.
    search_alg = hyperopt_config["search_alg"]

    build_executor = get_build_hyperopt_executor(executor["type"])
    hyperopt_executor = build_executor(
        parameters,
        output_feature,
        metric,
        goal,
        split,
        search_alg=search_alg,
        **executor
    )

    hyperopt_executor.execute(
        config,
        dataset=rel_path,
        output_directory=tmpdir,
        backend="local",
    )
def test_overwrite_trainer_type():
    """An explicit trainer type in the config survives the merge."""
    expected_trainer_type = "ray_legacy_trainer"
    trainer_section = {"type": expected_trainer_type}
    config = {
        INPUT_FEATURES: [category_feature()],
        OUTPUT_FEATURES: [category_feature()],
        MODEL_TYPE: MODEL_ECD,
        "trainer": trainer_section,
    }

    merged_trainer = merge_with_defaults(config)[TRAINER]

    assert merged_trainer[TYPE] == expected_trainer_type
def run_hyperopt_executor(
    sampler,
    executor,
    csv_filename,
    ray_mock_dir,
    validate_output_feature=False,
    validation_metric=None,
):
    """Run a mocked Ray Tune hyperopt execution on a generated dataset.

    The incoming `csv_filename` is replaced by `dataset.csv` inside
    `ray_mock_dir`; the executor receives the parquet version of it.
    """
    config = _get_config(sampler, executor)

    # The dataset always lives in the mock directory.
    csv_filename = os.path.join(ray_mock_dir, 'dataset.csv')
    dataset_csv = generate_data(
        config['input_features'],
        config['output_features'],
        csv_filename,
        num_examples=100,
    )
    dataset_parquet = create_data_set_to_use('parquet', dataset_csv)

    config = merge_with_defaults(config)
    hyperopt_config = config["hyperopt"]

    if validate_output_feature:
        first_output = config['output_features'][0]
        hyperopt_config['output_feature'] = first_output['name']
    if validation_metric:
        hyperopt_config['validation_metric'] = validation_metric

    update_hyperopt_params_with_defaults(hyperopt_config)

    parameters = hyperopt_config["parameters"]
    search_alg_type = sampler.get("search_alg", {}).get("type", "")
    if search_alg_type == 'bohb':
        # bohb does not support grid_search search space
        del parameters['combiner.num_steps']

    split = hyperopt_config["split"]
    output_feature = hyperopt_config["output_feature"]
    metric = hyperopt_config["metric"]
    goal = hyperopt_config["goal"]

    build_sampler = get_build_hyperopt_sampler(sampler["type"])
    hyperopt_sampler = build_sampler(goal, parameters, **sampler)

    hyperopt_executor = MockRayTuneExecutor(
        hyperopt_sampler, output_feature, metric, split, **executor)
    hyperopt_executor.mock_path = os.path.join(ray_mock_dir, "bucket")

    ray_backend = RayBackend(processor={'parallelism': 4})
    hyperopt_executor.execute(
        config,
        dataset=dataset_parquet,
        backend=ray_backend,
        output_directory=ray_mock_dir,
        skip_save_processed_input=True,
        skip_save_unprocessed_output=True,
    )
def test_hyperopt_executor(csv_filename):
    """Run every executor in EXECUTORS with the strategy from HYPEROPT_CONFIG.

    :param csv_filename: path where the generated dataset is written.
    """
    # Local import: only this function needs it.
    import copy

    input_features = [
        text_feature(name="utterance", cell_type="lstm", reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum")
    ]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    rel_path = generate_data(input_features, output_features, csv_filename)

    model_definition = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "num_fc_layers": 2},
        "training": {"epochs": 2, "learning_rate": 0.001}
    }
    model_definition = merge_with_defaults(model_definition)

    # Fill defaults into a private copy: updating HYPEROPT_CONFIG itself
    # would leak the filled-in values into every other user of the
    # module-level constant.
    hyperopt_config = copy.deepcopy(HYPEROPT_CONFIG)
    update_hyperopt_params_with_defaults(hyperopt_config)

    strategy = hyperopt_config["strategy"]
    parameters = hyperopt_config["parameters"]
    split = hyperopt_config["split"]
    output_feature = hyperopt_config["output_feature"]
    metric = hyperopt_config["metric"]
    goal = hyperopt_config["goal"]

    hyperopt_strategy = get_build_hyperopt_strategy(strategy["type"])(
        goal, parameters, **strategy)

    # The same strategy object is handed to each executor in turn.
    for executor in EXECUTORS:
        hyperopt_executor = get_build_hyperopt_executor(executor["type"])(
            hyperopt_strategy, output_feature, metric, split, **executor)
        hyperopt_executor.execute(model_definition, data_csv=rel_path)
def run_hyperopt_executor(
    sampler,
    executor,
    csv_filename,
    validate_output_feature=False,
    validation_metric=None,
):
    """Generate data, build the sampler/executor pair and run it with the
    local backend."""
    config = _get_config(sampler, executor)
    rel_path = generate_data(
        config["input_features"], config["output_features"], csv_filename)

    config = merge_with_defaults(config)
    hyperopt_config = config["hyperopt"]

    if validate_output_feature:
        first_output = config["output_features"][0]
        hyperopt_config["output_feature"] = first_output["name"]
    if validation_metric:
        hyperopt_config["validation_metric"] = validation_metric

    update_hyperopt_params_with_defaults(hyperopt_config)

    parameters = hyperopt_config["parameters"]
    search_alg_type = sampler.get("search_alg", {}).get("type", "")
    if search_alg_type == "bohb":
        # bohb does not support grid_search search space
        del parameters["utterance.cell_type"]

    split = hyperopt_config["split"]
    output_feature = hyperopt_config["output_feature"]
    metric = hyperopt_config["metric"]
    goal = hyperopt_config["goal"]

    build_sampler = get_build_hyperopt_sampler(sampler["type"])
    hyperopt_sampler = build_sampler(goal, parameters, **sampler)

    build_executor = get_build_hyperopt_executor(executor["type"])
    hyperopt_executor = build_executor(
        hyperopt_sampler, output_feature, metric, split, **executor)

    hyperopt_executor.execute(
        config,
        dataset=rel_path,
        backend="local",
    )