def test_missing_values_drop_rows(csv_filename, tmpdir):
    data_csv_path = os.path.join(tmpdir, csv_filename)

    kwargs = {PREPROCESSING: {"missing_value_strategy": DROP_ROW}}
    input_features = [
        number_feature(),
        binary_feature(),
        category_feature(vocab_size=3),
    ]
    output_features = [
        binary_feature(**kwargs),
        number_feature(**kwargs),
        category_feature(vocab_size=3, **kwargs),
        sequence_feature(vocab_size=3, **kwargs),
        text_feature(vocab_size=3, **kwargs),
        set_feature(vocab_size=3, **kwargs),
        vector_feature(),
    ]
    backend = LocalTestBackend()
    config = {"input_features": input_features, "output_features": output_features, TRAINER: {"epochs": 2}}

    training_data_csv_path = generate_data(input_features, output_features, data_csv_path)
    df = read_csv_with_nan(training_data_csv_path, nan_percent=0.1)

    # run preprocessing
    ludwig_model = LudwigModel(config, backend=backend)
    ludwig_model.preprocess(dataset=df)

def test_torchscript_e2e_tabnet_combiner(csv_filename, tmpdir):
    data_csv_path = os.path.join(tmpdir, csv_filename)
    # Configure features to be tested:
    input_features = [
        binary_feature(),
        number_feature(),
        category_feature(vocab_size=3),
        bag_feature(vocab_size=3),
        set_feature(vocab_size=3),
    ]
    output_features = [
        binary_feature(),
        number_feature(),
        category_feature(vocab_size=3),
    ]
    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        COMBINER: {
            "type": "tabnet",
            "num_total_blocks": 2,
            "num_shared_blocks": 2,
        },
        TRAINER: {"epochs": 2},
    }

    # Generate training data
    training_data_csv_path = generate_data(input_features, output_features, data_csv_path)

    validate_torchscript_outputs(tmpdir, config, backend, training_data_csv_path)

def test_experiment_infer_image_metadata(tmpdir):
    # Image inputs
    image_dest_folder = os.path.join(tmpdir, "generated_images")

    # Stacked CNN encoder
    input_features = [
        image_feature(folder=image_dest_folder, encoder="stacked_cnn", output_size=16, num_filters=8),
        text_feature(encoder="embed", min_len=1),
        number_feature(normalization="zscore"),
    ]
    output_features = [category_feature(vocab_size=2, reduce_input="sum"), number_feature()]

    rel_path = generate_data(input_features, output_features, os.path.join(tmpdir, "dataset.csv"))

    # remove the image preprocessing section to force inferring image metadata
    input_features[0].pop("preprocessing")

    run_experiment(input_features, output_features, dataset=rel_path)

def test_custom_encoder_decoder():
    input_features = [
        sequence_feature(reduce_output="sum"),
        number_feature(encoder="custom_number_encoder"),
    ]
    output_features = [
        number_feature(decoder="custom_number_decoder"),
    ]
    _run_test(input_features=input_features, output_features=output_features)

def test_image_resizing_num_channel_handling(tmpdir):
    """Tests handling of the number of image channels, whether specified in the config or inferred.

    Creates two image datasets, one with 3 channels and one with 1 channel, and trains a model on their
    concatenation.
    """
    # Image inputs
    image_dest_folder = os.path.join(tmpdir, "generated_images")

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder="resnet",
            preprocessing={"in_memory": True, "height": 8, "width": 8, "num_channels": 3, "num_processes": 5},
            output_size=8,
            num_filters=8,
        ),
        text_feature(encoder="embed", min_len=1),
        number_feature(normalization="minmax"),
    ]
    output_features = [binary_feature(), number_feature()]

    rel_path = generate_data(input_features, output_features, os.path.join(tmpdir, "dataset1.csv"), num_examples=50)
    df1 = read_csv(rel_path)

    input_features[0]["preprocessing"]["num_channels"] = 1
    rel_path = generate_data(input_features, output_features, os.path.join(tmpdir, "dataset2.csv"), num_examples=50)
    df2 = read_csv(rel_path)

    df = concatenate_df(df1, df2, None, LOCAL_BACKEND)
    df.to_csv(rel_path, index=False)

    # Here the user specifies the number of channels. No exception should be thrown.
    run_experiment(input_features, output_features, dataset=rel_path)

    del input_features[0]["preprocessing"]["num_channels"]

    # The user doesn't specify num_channels, so it is inferred. No exception should be thrown.
    run_experiment(input_features, output_features, dataset=rel_path)

def test_torchscript_e2e_tabular(csv_filename, tmpdir):
    data_csv_path = os.path.join(tmpdir, csv_filename)
    # Configure features to be tested:
    bin_str_feature = binary_feature()
    transformed_number_features = [
        number_feature(preprocessing={"normalization": numeric_transformer})
        for numeric_transformer in numeric_transformation_registry.keys()
    ]
    input_features = [
        bin_str_feature,
        binary_feature(),
        *transformed_number_features,
        category_feature(vocab_size=3),
        bag_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature(),
        # TODO: future support
        # date_feature(),
        # h3_feature(),
    ]
    output_features = [
        bin_str_feature,
        binary_feature(),
        number_feature(),
        category_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature(),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
    ]
    backend = LocalTestBackend()
    config = {"input_features": input_features, "output_features": output_features, TRAINER: {"epochs": 2}}

    # Generate training data
    training_data_csv_path = generate_data(input_features, output_features, data_csv_path)

    # Convert bool values to strings, e.g., {'Yes', 'No'}
    df = pd.read_csv(training_data_csv_path)
    false_value, true_value = "No", "Yes"
    df[bin_str_feature[NAME]] = df[bin_str_feature[NAME]].map(lambda x: true_value if x else false_value)
    df.to_csv(training_data_csv_path)

    validate_torchscript_outputs(tmpdir, config, backend, training_data_csv_path)

def test_kfold_cv_api_from_file():
    # kfold_cross_validate api with a config file
    num_folds = 3

    # setup temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:
        # setup required data structures for test
        training_data_fp = os.path.join(tmpdir, "train.csv")
        config_fp = os.path.join(tmpdir, "config.yaml")

        # generate synthetic data for the test
        input_features = [number_feature(normalization="zscore"), number_feature(normalization="zscore")]
        output_features = [category_feature(vocab_size=3, reduce_input="sum")]

        generate_data(input_features, output_features, training_data_fp)

        # generate config file
        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {"type": "concat", "output_size": 14},
            TRAINER: {"epochs": 2},
        }
        with open(config_fp, "w") as f:
            yaml.dump(config, f)

        # test kfold_cross_validate api with config file
        # execute k-fold cross validation run
        (kfold_cv_stats, kfold_split_indices) = kfold_cross_validate(3, config=config_fp, dataset=training_data_fp)

        # correct structure for results from kfold cv
        for key in ["fold_" + str(i + 1) for i in range(num_folds)] + ["overall"]:
            assert key in kfold_cv_stats

        for key in ["fold_" + str(i + 1) for i in range(num_folds)]:
            assert key in kfold_split_indices

def test_config_features():
    all_input_features = [
        audio_feature("/tmp/destination_folder"),
        bag_feature(),
        binary_feature(),
        category_feature(),
        date_feature(),
        h3_feature(),
        image_feature("/tmp/destination_folder"),
        number_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        timeseries_feature(),
        vector_feature(),
    ]
    all_output_features = [
        binary_feature(),
        category_feature(),
        number_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        vector_feature(),
    ]

    # validate config with all features
    config = {
        "input_features": all_input_features,
        "output_features": all_output_features,
    }
    validate_config(config)

    # make sure the config with all defaults provided also registers as valid
    config = merge_with_defaults(config)
    validate_config(config)

    # test various invalid output features
    input_only_features = [
        feature for feature in all_input_features if feature["type"] not in output_type_registry.keys()
    ]
    for input_feature in input_only_features:
        config = {
            "input_features": all_input_features,
            "output_features": all_output_features + [input_feature],
        }

        dtype = input_feature["type"]
        with pytest.raises(ValidationError, match=rf"^'{dtype}' is not one of .*"):
            validate_config(config)

def test_kfold_cv_dataset_formats(data_format):
    # kfold_cross_validate api with an in-memory config
    num_folds = 3

    # setup temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:
        # setup required data structures for test
        training_data_fp = os.path.join(tmpdir, "train.csv")

        # generate synthetic data for the test
        input_features = [number_feature(normalization="zscore"), number_feature(normalization="zscore")]
        output_features = [number_feature()]

        generate_data(input_features, output_features, training_data_fp)
        dataset_to_use = create_data_set_to_use(data_format, training_data_fp)

        # generate config
        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {"type": "concat", "output_size": 14},
            TRAINER: {"epochs": 2},
        }

        # test kfold_cross_validate api with in-memory config
        # execute k-fold cross validation run
        (kfold_cv_stats, kfold_split_indices) = kfold_cross_validate(3, config=config, dataset=dataset_to_use)

        # correct structure for results from kfold cv
        for key in ["fold_" + str(i + 1) for i in range(num_folds)] + ["overall"]:
            assert key in kfold_cv_stats

        for key in ["fold_" + str(i + 1) for i in range(num_folds)]:
            assert key in kfold_split_indices

def _get_config(sampler, executor):
    input_features = [number_feature(), number_feature()]
    output_features = [binary_feature()]

    return {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "num_fc_layers": 2},
        TRAINER: {"epochs": 2, "learning_rate": 0.001},
        "hyperopt": {
            **HYPEROPT_CONFIG,
            "executor": executor,
            "sampler": sampler,
        },
    }

def test_merge_with_defaults_early_stop(use_train, use_hyperopt_scheduler):
    all_input_features = [
        binary_feature(),
        category_feature(),
        number_feature(),
        text_feature(),
    ]
    all_output_features = [
        category_feature(),
        sequence_feature(),
        vector_feature(),
    ]

    # validate config with all features
    config = {
        INPUT_FEATURES: all_input_features,
        OUTPUT_FEATURES: all_output_features,
        HYPEROPT: HYPEROPT_CONFIG,
    }
    config = copy.deepcopy(config)

    if use_train:
        config[TRAINER] = {"batch_size": 42}

    if use_hyperopt_scheduler:
        # hyperopt scheduler cannot be used with early stopping
        config[HYPEROPT][EXECUTOR][SCHEDULER] = SCHEDULER_DICT

    merged_config = merge_with_defaults(config)

    expected = -1 if use_hyperopt_scheduler else ECDTrainerConfig().early_stop
    assert merged_config[TRAINER]["early_stop"] == expected

def test_merge_with_defaults_early_stop(use_train, use_hyperopt_scheduler):
    all_input_features = [
        binary_feature(),
        category_feature(),
        number_feature(),
        text_feature(),
    ]
    all_output_features = [
        category_feature(),
        sequence_feature(),
        vector_feature(),
    ]

    # validate config with all features
    config = {
        "input_features": all_input_features,
        "output_features": all_output_features,
        HYPEROPT: HYPEROPT_CONFIG,
    }
    config = copy.deepcopy(config)

    if use_train:
        config[TRAINER] = {"batch_size": "42"}

    if use_hyperopt_scheduler:
        # hyperopt scheduler cannot be used with early stopping
        config[HYPEROPT]["sampler"]["scheduler"] = SCHEDULER

    merged_config = merge_with_defaults(config)

    expected = -1 if use_hyperopt_scheduler else default_early_stop
    assert merged_config[TRAINER]["early_stop"] == expected

def _run_test(input_features=None, output_features=None, combiner=None):
    with tempfile.TemporaryDirectory() as tmpdir:
        input_features = input_features or [
            sequence_feature(reduce_output="sum"),
            number_feature(),
        ]
        output_features = output_features or [category_feature(vocab_size=2, reduce_input="sum")]
        combiner = combiner or {"type": "concat"}

        csv_filename = os.path.join(tmpdir, "training.csv")
        data_csv = generate_data(input_features, output_features, csv_filename)

        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": combiner,
            TRAINER: {"epochs": 2},
        }

        model = LudwigModel(config, backend=LocalTestBackend())
        _, _, output_directory = model.train(
            dataset=data_csv,
            output_directory=tmpdir,
        )
        model.predict(dataset=data_csv, output_directory=output_directory)

def test_empty_split_error(backend, tmpdir):
    """Tests that an error is raised if one or more of the splits is empty after preprocessing."""
    data_csv_path = os.path.join(tmpdir, "data.csv")

    out_feat = binary_feature()
    input_features = [number_feature()]
    output_features = [out_feat]
    config = {"input_features": input_features, "output_features": output_features}

    training_data_csv_path = generate_data(input_features, output_features, data_csv_path)
    df = pd.read_csv(training_data_csv_path)

    # Convert all the output feature rows to null. Because the default missing value strategy is to drop empty output
    # rows, this will result in the dataset being empty after preprocessing.
    df[out_feat[COLUMN]] = None

    with init_backend(backend):
        ludwig_model = LudwigModel(config, backend=backend)
        with pytest.raises(ValueError, match="Dataset is empty following preprocessing"):
            ludwig_model.preprocess(dataset=df)

def test_config_trainer_bad_optimizer():
    config = {
        "input_features": [
            category_feature(vocab_size=2, reduce_input="sum"),
            number_feature(),
        ],
        "output_features": [binary_feature(weight_regularization=None)],
        "combiner": {
            "type": "tabnet",
        },
        TRAINER: {},
    }
    validate_config(config)

    # Test manually set-to-null optimizer vs unspecified:
    config[TRAINER]["optimizer"] = None
    with pytest.raises(ValidationError):
        validate_config(config)
    assert ECDTrainerConfig.Schema().load({}).optimizer is not None

    # Test that all types in optimizer_registry are supported:
    for key in optimizer_registry.keys():
        config[TRAINER]["optimizer"] = {"type": key}
        validate_config(config)

    # Test invalid optimizer types:
    config[TRAINER]["optimizer"] = {"type": 0}
    with pytest.raises(ValidationError):
        validate_config(config)
    config[TRAINER]["optimizer"] = {"type": {}}
    with pytest.raises(ValidationError):
        validate_config(config)
    config[TRAINER]["optimizer"] = {"type": "invalid"}
    with pytest.raises(ValidationError):
        validate_config(config)

def test_number_feature_wrong_dtype(csv_filename, tmpdir):
    """Tests that a number feature with all string values is treated as having missing values by default."""
    data_csv_path = os.path.join(tmpdir, csv_filename)

    num_feat = number_feature()
    input_features = [num_feat]
    output_features = [binary_feature()]
    config = {"input_features": input_features, "output_features": output_features}

    training_data_csv_path = generate_data(input_features, output_features, data_csv_path)
    df = pd.read_csv(training_data_csv_path)

    # convert numbers to random strings
    def random_string():
        letters = string.ascii_lowercase
        return "".join(random.choice(letters) for _ in range(10))

    df[num_feat[COLUMN]] = df[num_feat[COLUMN]].apply(lambda _: random_string())

    # run preprocessing
    backend = LocalTestBackend()
    ludwig_model = LudwigModel(config, backend=backend)
    train_ds, val_ds, test_ds, _ = ludwig_model.preprocess(dataset=df)

    concatenated_df = concatenate_df(train_ds.to_df(), val_ds.to_df(), test_ds.to_df(), backend)

    # check that train_ds had invalid values replaced with the missing value
    assert len(concatenated_df) == len(df)
    assert np.all(concatenated_df[num_feat[PROC_COLUMN]] == 0.0)

def test_multiple_dependencies(reduce_dependencies, hidden_shape, dependent_hidden_shape, dependent_hidden_shape2):
    # setup at least a single dependency
    hidden_layer = torch.randn(hidden_shape, dtype=torch.float32)
    other_hidden_layer = torch.randn(dependent_hidden_shape, dtype=torch.float32)
    other_dependencies = {
        "feature_name": other_hidden_layer,
    }

    # setup dummy output feature to be root of dependency list
    num_feature_defn = number_feature()
    num_feature_defn["loss"] = {"type": "mean_squared_error"}
    num_feature_defn["dependencies"] = ["feature_name"]
    if len(dependent_hidden_shape) > 2:
        num_feature_defn["reduce_dependencies"] = reduce_dependencies

    # based on the specification, calculate the expected hidden size with one dependency
    if reduce_dependencies == "concat" and len(hidden_shape) == 2 and len(dependent_hidden_shape) == 3:
        expected_hidden_size = HIDDEN_SIZE + OTHER_HIDDEN_SIZE * SEQ_SIZE
    else:
        expected_hidden_size = HIDDEN_SIZE + OTHER_HIDDEN_SIZE

    # if multiple dependencies are specified, set up the second dependent feature
    if dependent_hidden_shape2:
        other_hidden_layer2 = torch.randn(dependent_hidden_shape2, dtype=torch.float32)
        other_dependencies["feature_name2"] = other_hidden_layer2
        num_feature_defn["dependencies"].append("feature_name2")
        if len(dependent_hidden_shape2) > 2:
            num_feature_defn["reduce_dependencies"] = reduce_dependencies

        # based on the specification, calculate the marginal increase in hidden size from the second dependency
        if reduce_dependencies == "concat" and len(hidden_shape) == 2 and len(dependent_hidden_shape2) == 3:
            expected_hidden_size += dependent_hidden_shape2[-1] * SEQ_SIZE
        else:
            expected_hidden_size += dependent_hidden_shape2[-1]

    # Set up dependency reducers.
    dependency_reducers = torch.nn.ModuleDict()
    for feature_name in other_dependencies.keys():
        dependency_reducers[feature_name] = SequenceReducer(reduce_mode=reduce_dependencies)

    # test dependency concatenation
    num_feature_defn["input_size"] = expected_hidden_size
    results = output_feature_utils.concat_dependencies(
        "num_feature", num_feature_defn["dependencies"], dependency_reducers, hidden_layer, other_dependencies
    )

    # confirm the shape of the result of the concat_dependencies() call
    if len(hidden_shape) > 2:
        assert results.shape == (BATCH_SIZE, SEQ_SIZE, expected_hidden_size)
    else:
        assert results.shape == (BATCH_SIZE, expected_hidden_size)

def run_test_gbm_multiple_outputs(tmpdir, backend_config):
    """Test that an error is raised when the model is trained with multiple outputs."""
    input_features = [number_feature(), category_feature(reduce_output="sum")]
    output_features = [
        category_feature(vocab_size=3),
        binary_feature(),
        category_feature(vocab_size=3),
    ]

    csv_filename = os.path.join(tmpdir, "training.csv")
    dataset_filename = generate_data(input_features, output_features, csv_filename, num_examples=100)

    config = {
        MODEL_TYPE: "gbm",
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {"num_boost_round": 2},
    }

    model = LudwigModel(config, backend=backend_config)
    with pytest.raises(ValueError, match="Only single task currently supported"):
        model.train(dataset=dataset_filename, output_directory=tmpdir)

def test_model_weights_match_training(tmpdir, csv_filename):
    np.random.seed(1)

    input_features = [number_feature()]
    output_features = [number_feature()]
    output_feature_name = output_features[0][NAME]

    # Generate test data
    data_csv_path = generate_data(input_features, output_features, os.path.join(tmpdir, csv_filename), num_examples=100)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "trainer": {"epochs": 5, "batch_size": 32},
    }

    model = LudwigModel(config=config)
    training_stats, _, _ = model.train(training_set=data_csv_path, random_seed=1919)

    # generate predictions from training data
    df = pd.read_csv(data_csv_path)
    predictions = model.predict(df)

    # compute loss on predictions from training data
    loss_function = MSELoss()
    loss = loss_function(
        torch.tensor(predictions[0][output_feature_name + "_predictions"].values),  # predictions
        torch.tensor(df[output_feature_name].values),  # target
    ).type(torch.float32)

    # get last loss value from training
    last_training_loss = torch.tensor(training_stats[TRAINING][output_feature_name][LOSS][-1])

    # loss from predictions should match last loss value recorded during training
    assert torch.isclose(loss, last_training_loss), (
        "Model predictions on training set did not generate same loss value as in training. "
        "Need to confirm that weights were correctly captured in model."
    )

def run_test_gbm_number(tmpdir, backend_config):
    """Test that the GBM model can train and predict a numerical output (regression)."""
    # Given a dataset with number and category input features and a single number output feature,
    input_features = [number_feature(), category_feature(reduce_output="sum")]
    output_feature = number_feature()
    output_features = [output_feature]

    csv_filename = os.path.join(tmpdir, "training.csv")
    dataset_filename = generate_data(input_features, output_features, csv_filename, num_examples=100)

    config = {
        MODEL_TYPE: "gbm",
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {"num_boost_round": 2},
    }

    # When I train a model on the dataset, load the model from the output directory, and
    # predict on the dataset
    model = LudwigModel(config, backend=backend_config)
    model.train(
        dataset=dataset_filename,
        output_directory=tmpdir,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True,
        skip_save_log=True,
    )
    model.load(os.path.join(tmpdir, "api_experiment_run", "model"))
    preds, _ = model.predict(
        dataset=dataset_filename,
        output_directory=os.path.join(tmpdir, "predictions"),
    )

    # Then the predictions should be included in the output
    pred_col = preds[output_feature["name"] + "_predictions"]
    if backend_config["type"] == "ray":
        pred_col = pred_col.compute()
    assert pred_col.dtype == float

def test_train_gpu_load_cpu():
    input_features = [
        category_feature(vocab_size=2, reduce_input="sum"),
        number_feature(normalization="zscore"),
    ]
    output_features = [
        binary_feature(),
    ]
    run_test_with_features(input_features, output_features, run_fn=_run_train_gpu_load_cpu, num_gpus=1)

def test_config_input_output_features():
    config = {
        "input_features": [
            category_feature(),
            number_feature(),
        ],
        "output_features": [binary_feature()],
    }

    validate_config(config)

def test_experiment_dataset_formats(data_format, csv_filename):
    # primary focus of this test is to determine if exceptions are
    # raised for different data set formats and the in_memory setting
    input_features = [number_feature(), category_feature()]
    output_features = [category_feature(), number_feature()]

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "output_size": 14},
        "preprocessing": {},
        TRAINER: {"epochs": 2},
    }

    # setup training data format to test
    raw_data = generate_data(input_features, output_features, csv_filename)

    training_set_metadata = None

    if data_format == "hdf5":
        # hdf5 format
        training_set, _, _, training_set_metadata = preprocess_for_training(config, dataset=raw_data)
        dataset_to_use = training_set.data_hdf5_fp
    else:
        dataset_to_use = create_data_set_to_use(data_format, raw_data)

    # define Ludwig model
    model = LudwigModel(config=config)
    model.train(dataset=dataset_to_use, training_set_metadata=training_set_metadata, random_seed=default_random_seed)

    # run functions with the specified data format
    model.evaluate(dataset=dataset_to_use)
    model.predict(dataset=dataset_to_use)

def test_ray_tabular(df_engine):
    if df_engine == "modin" and sys.version_info < (3, 7):
        pytest.skip("Modin is not supported with Python 3.6 at this time")

    input_features = [
        sequence_feature(reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum"),
        number_feature(normalization="zscore"),
        set_feature(),
        binary_feature(),
        bag_feature(),
        vector_feature(),
        h3_feature(),
        date_feature(),
    ]
    output_features = [
        binary_feature(),
        number_feature(normalization="zscore"),
    ]
    run_test_parquet(input_features, output_features, df_engine=df_engine)

def test_missing_values_drop_rows(csv_filename, tmpdir):
    data_csv_path = os.path.join(tmpdir, csv_filename)

    kwargs = {PREPROCESSING: {"missing_value_strategy": DROP_ROW}}
    input_features = [
        number_feature(),
        binary_feature(),
        category_feature(vocab_size=3),
    ]
    output_features = [
        binary_feature(**kwargs),
        number_feature(**kwargs),
        category_feature(vocab_size=3, **kwargs),
        sequence_feature(vocab_size=3, **kwargs),
        text_feature(vocab_size=3, **kwargs),
        set_feature(vocab_size=3, **kwargs),
        vector_feature(),
    ]
    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {"epochs": 2},
    }

    training_data_csv_path = generate_data(input_features, output_features, data_csv_path)
    df = pd.read_csv(training_data_csv_path)

    # set 10% of values to NaN
    nan_percent = 0.1
    ix = [(row, col) for row in range(df.shape[0]) for col in range(df.shape[1])]
    for row, col in random.sample(ix, int(round(nan_percent * len(ix)))):
        df.iat[row, col] = np.nan

    # run preprocessing
    ludwig_model = LudwigModel(config, backend=backend)
    ludwig_model.preprocess(dataset=df)

def test_experiment_multiple_seq_seq(csv_filename, output_features):
    input_features = [
        text_feature(vocab_size=100, min_len=1, encoder="stacked_cnn"),
        number_feature(normalization="zscore"),
        category_feature(vocab_size=10, embedding_size=5),
        set_feature(),
        sequence_feature(vocab_size=10, max_len=10, encoder="embed"),
    ]
    output_features = output_features

    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, dataset=rel_path)

def test_incorrect_output_features_config():
    config = {
        "input_features": [
            number_feature(),
        ],
        "output_features": [binary_feature(decoder="classifier")],
    }

    # Invalid decoder for binary output feature
    with pytest.raises(ValidationError):
        validate_config(config)

def test_ray_split():
    input_features = [
        number_feature(normalization="zscore"),
        set_feature(),
        binary_feature(),
    ]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]
    run_test_with_features(
        input_features,
        output_features,
        run_fn=run_split_api_experiment,
        num_cpus=4,
    )

def test_ray_tabular(df_engine):
    input_features = [
        sequence_feature(reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum"),
        number_feature(normalization="zscore"),
        set_feature(),
        binary_feature(),
        bag_feature(),
        vector_feature(),
        h3_feature(),
        date_feature(),
    ]
    output_features = [
        binary_feature(bool2str=["No", "Yes"]),
        binary_feature(),
        number_feature(normalization="zscore"),
    ]
    run_test_with_features(
        input_features,
        output_features,
        df_engine=df_engine,
    )

def test_experiment_image_inputs(image_params: ImageParams, tmpdir):
    # Image inputs
    image_dest_folder = os.path.join(tmpdir, "generated_images")

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder="resnet",
            preprocessing={"in_memory": True, "height": 12, "width": 12, "num_channels": 3, "num_processes": 5},
            output_size=16,
            num_filters=8,
        ),
        text_feature(encoder="embed", min_len=1),
        number_feature(normalization="zscore"),
    ]
    output_features = [category_feature(vocab_size=2, reduce_input="sum"), number_feature()]

    input_features[0]["encoder"] = image_params.image_encoder
    input_features[0]["preprocessing"]["in_memory"] = image_params.in_memory_flag
    rel_path = generate_data(input_features, output_features, os.path.join(tmpdir, "dataset.csv"))

    run_experiment(
        input_features,
        output_features,
        dataset=rel_path,
        skip_save_processed_input=image_params.skip_save_processed_input,
    )