def test_image_resizing_num_channel_handling(csv_filename):
    """
    This test creates two image datasets with 3 channels and 1 channel.
    The combination of this data is used to train a model. This checks
    the cases where the user may or may not specify a number of channels
    in the config.
    :param csv_filename:
    :return:
    """
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder='resnet',
            preprocessing={
                'in_memory': True,
                'height': 8,
                'width': 8,
                'num_channels': 3,
                'num_processes': 5
            },
            fc_size=8,
            num_filters=8
        ),
        text_feature(encoder='embed', min_len=1),
        numerical_feature(normalization='minmax')
    ]
    output_features = [binary_feature(), numerical_feature()]
    rel_path = generate_data(
        input_features, output_features, csv_filename, num_examples=50
    )
    df1 = read_csv(rel_path)

    input_features[0]['preprocessing']['num_channels'] = 1
    rel_path = generate_data(
        input_features, output_features, csv_filename, num_examples=50
    )
    df2 = read_csv(rel_path)

    df = concatenate_df(df1, df2, None, LOCAL_BACKEND)
    df.to_csv(rel_path, index=False)

    # Here the user specifies the number of channels. Exception shouldn't be thrown
    run_experiment(input_features, output_features, dataset=rel_path)

    del input_features[0]['preprocessing']['num_channels']

    # User now doesn't specify num channels. Should throw exception
    with pytest.raises(ValueError):
        run_experiment(input_features, output_features, dataset=rel_path)

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
def test_custom_encoder_decoder():
    input_features = [
        sequence_feature(reduce_output="sum"),
        numerical_feature(encoder="custom_numerical_encoder"),
    ]
    output_features = [
        numerical_feature(decoder="custom_numerical_decoder"),
    ]
    _run_test(input_features=input_features, output_features=output_features)
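
# The names "custom_numerical_encoder" and "custom_numerical_decoder" above are
# not built in; they must be registered with Ludwig before this test can run.
# Below is a minimal sketch of what such an encoder might look like. The registry
# import, decorator signature, and output-dict convention are assumptions based
# on Ludwig's registration pattern, not verified API.
import torch

from ludwig.encoders.base import Encoder  # assumed base-class location
from ludwig.encoders.registry import register_encoder  # assumed registry helper


@register_encoder("custom_numerical_encoder", "numerical")  # assumed signature
class CustomNumericalEncoder(Encoder):
    """Hypothetical encoder projecting a scalar input to a small dense vector."""

    def __init__(self, input_size: int = 1, fc_size: int = 16, **kwargs):
        super().__init__()
        self.fc = torch.nn.Linear(input_size, fc_size)

    def forward(self, inputs, mask=None):
        # Ludwig encoders conventionally return a dict keyed by "encoder_output".
        return {"encoder_output": self.fc(inputs)}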
def test_image_resizing_num_channel_handling(csv_filename):
    """This test creates two image datasets with 3 channels and 1 channel.

    The combination of this data is used to train a model. This checks the cases where the user may or may not specify
    a number of channels in the config.

    :param csv_filename:
    :return:
    """
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), "generated_images")

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder="resnet",
            preprocessing={"in_memory": True, "height": 8, "width": 8, "num_channels": 3, "num_processes": 5},
            fc_size=8,
            num_filters=8,
        ),
        text_feature(encoder="embed", min_len=1),
        numerical_feature(normalization="minmax"),
    ]
    output_features = [binary_feature(), numerical_feature()]
    rel_path = generate_data(input_features, output_features, csv_filename, num_examples=50)
    df1 = read_csv(rel_path)

    input_features[0]["preprocessing"]["num_channels"] = 1
    rel_path = generate_data(input_features, output_features, csv_filename, num_examples=50)
    df2 = read_csv(rel_path)

    df = concatenate_df(df1, df2, None, LOCAL_BACKEND)
    df.to_csv(rel_path, index=False)

    # Here the user specifies the number of channels. Exception shouldn't be thrown
    run_experiment(input_features, output_features, dataset=rel_path)

    del input_features[0]["preprocessing"]["num_channels"]

    # User doesn't specify num channels, but num channels is inferred. Exception shouldn't be thrown
    run_experiment(input_features, output_features, dataset=rel_path)

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
def test_kfold_cv_api_from_file():
    # k-fold_cross_validate api with config_file
    num_folds = 3

    # setup temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:
        # setup required data structures for test
        training_data_fp = os.path.join(tmpdir, 'train.csv')
        config_fp = os.path.join(tmpdir, 'config.yaml')

        # generate synthetic data for the test
        input_features = [
            numerical_feature(normalization='zscore'),
            numerical_feature(normalization='zscore')
        ]
        output_features = [
            category_feature(vocab_size=2, reduce_input='sum')
        ]

        generate_data(input_features, output_features, training_data_fp)

        # generate config file
        config = {
            'input_features': input_features,
            'output_features': output_features,
            'combiner': {'type': 'concat', 'fc_size': 14},
            'training': {'epochs': 2}
        }

        with open(config_fp, 'w') as f:
            yaml.dump(config, f)

        # test kfold_cross_validate api with config file

        # execute k-fold cross validation run
        (
            kfold_cv_stats,
            kfold_split_indices
        ) = kfold_cross_validate(
            3,
            config=config_fp,
            dataset=training_data_fp
        )

        # correct structure for results from kfold cv
        for key in ['fold_' + str(i + 1) for i in range(num_folds)] + ['overall']:
            assert key in kfold_cv_stats

        for key in ['fold_' + str(i + 1) for i in range(num_folds)]:
            assert key in kfold_split_indices
def test_config_features():
    all_input_features = [
        audio_feature('/tmp/destination_folder'),
        bag_feature(),
        binary_feature(),
        category_feature(),
        date_feature(),
        h3_feature(),
        image_feature('/tmp/destination_folder'),
        numerical_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        timeseries_feature(),
        vector_feature(),
    ]
    all_output_features = [
        binary_feature(),
        category_feature(),
        numerical_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        vector_feature(),
    ]

    # validate config with all features
    config = {
        'input_features': all_input_features,
        'output_features': all_output_features,
    }
    validate_config(config)

    # make sure all defaults provided also registers as valid
    config = merge_with_defaults(config)
    validate_config(config)

    # test various invalid output features
    input_only_features = [
        feature for feature in all_input_features
        if feature['type'] not in OUTPUT_FEATURE_TYPES
    ]
    for input_feature in input_only_features:
        config = {
            'input_features': all_input_features,
            'output_features': all_output_features + [input_feature],
        }

        dtype = input_feature['type']
        with pytest.raises(ValidationError, match=rf"^'{dtype}' is not one of .*"):
            validate_config(config)
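
# OUTPUT_FEATURE_TYPES is a module-level constant referenced above. Given the
# all_output_features list in the test, a plausible definition would be the
# following (an assumption, not the verified original):
OUTPUT_FEATURE_TYPES = {"binary", "category", "numerical", "sequence", "set", "text", "vector"}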
def test_kfold_cv_dataset_formats(data_format):
    # k-fold_cross_validate api with in-memory model definition
    num_folds = 3

    # setup temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:
        # setup required data structures for test
        training_data_fp = os.path.join(tmpdir, 'train.csv')

        # generate synthetic data for the test
        input_features = [
            numerical_feature(normalization='zscore'),
            numerical_feature(normalization='zscore')
        ]
        output_features = [
            numerical_feature()
        ]

        generate_data(input_features, output_features, training_data_fp)
        dataset_to_use = create_data_set_to_use(data_format, training_data_fp)

        # generate model definition file
        model_definition = {
            'input_features': input_features,
            'output_features': output_features,
            'combiner': {'type': 'concat', 'fc_size': 14},
            'training': {'epochs': 2}
        }

        # test kfold_cross_validate api with model definition in-memory

        # execute k-fold cross validation run
        (
            kfold_cv_stats,
            kfold_split_indices
        ) = kfold_cross_validate(
            3,
            model_definition=model_definition,
            dataset=dataset_to_use
        )

        # correct structure for results from kfold cv
        for key in ['fold_' + str(i + 1) for i in range(num_folds)] + ['overall']:
            assert key in kfold_cv_stats

        for key in ['fold_' + str(i + 1) for i in range(num_folds)]:
            assert key in kfold_split_indices
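
# Tests in this section that take a data_format argument are driven by a pytest
# parametrization. A sketch of the driver follows; the exact format grid is an
# assumption tied to what create_data_set_to_use supports, and the ellipsis
# stands in for the test body above.
import pytest


@pytest.mark.parametrize("data_format", ["csv", "df", "dict", "parquet"])
def test_kfold_cv_dataset_formats(data_format):
    ...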
def test_hyperopt_run_hyperopt(csv_filename, ray_start_4_cpus, ray_mock_dir):
    input_features = [
        numerical_feature(),
        numerical_feature()
    ]
    output_features = [
        binary_feature()
    ]

    csv_filename = os.path.join(ray_mock_dir, 'dataset.csv')
    dataset_csv = generate_data(
        input_features, output_features, csv_filename, num_examples=100)
    dataset_parquet = create_data_set_to_use('parquet', dataset_csv)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "num_fc_layers": 2},
        "training": {"epochs": 4, "learning_rate": 0.001}
    }

    output_feature_name = output_features[0]['name']

    hyperopt_configs = {
        "parameters": {
            "training.learning_rate": {
                "space": "loguniform",
                "lower": 0.001,
                "upper": 0.1,
            },
            output_feature_name + ".fc_size": {
                "space": "randint",
                "lower": 32,
                "upper": 256
            },
            output_feature_name + ".num_fc_layers": {
                "space": "randint",
                "lower": 2,
                "upper": 6
            }
        },
        "goal": "minimize",
        'output_feature': output_feature_name,
        'validation_metrics': 'loss',
        'executor': {'type': 'ray'},
        'sampler': {'type': 'ray', 'num_samples': 2},
        'backend': {'type': 'ray', 'processor': {'parallelism': 4}}
    }

    # add hyperopt parameter space to the config
    config['hyperopt'] = hyperopt_configs
    run_hyperopt(config, dataset_parquet, ray_mock_dir)
def test_server_integration(csv_filename):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            preprocessing={
                'in_memory': True,
                'height': 8,
                'width': 8,
                'num_channels': 3
            },
            fc_size=16,
            num_filters=8
        ),
        text_feature(encoder='embed', min_len=1),
        numerical_feature(normalization='zscore')
    ]
    output_features = [category_feature(vocab_size=2), numerical_feature()]

    rel_path = generate_data(input_features, output_features, csv_filename)
    model, output_dir = train_model(input_features, output_features, data_csv=rel_path)

    app = server(model)
    client = TestClient(app)

    response = client.get('/')
    assert response.status_code == 200

    response = client.post('/predict')
    assert response.json() == ALL_FEATURES_PRESENT_ERROR

    data_df = read_csv(rel_path)
    first_entry = data_df.T.to_dict()[0]
    data, files = convert_to_form(first_entry)
    server_response = client.post('/predict', data=data, files=files)
    server_response = server_response.json()

    server_response_keys = sorted(list(server_response.keys()))
    assert server_response_keys == sorted(output_keys_for(output_features))

    model_output, _ = model.predict(dataset=[first_entry], data_format=dict)
    model_output = model_output.to_dict('records')[0]
    assert model_output == server_response

    shutil.rmtree(output_dir, ignore_errors=True)
    shutil.rmtree(image_dest_folder)
def test_experiment_dataset_formats(data_format):
    # primary focus of this test is to determine if exceptions are
    # raised for different data set formats and in_memory setting

    input_features = [numerical_feature(), category_feature()]
    output_features = [category_feature(), numerical_feature()]

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat',
            'fc_size': 14
        },
        'preprocessing': {},
        'training': {'epochs': 2}
    }

    # create temporary name for train and test data sets
    csv_filename = 'train_' + uuid.uuid4().hex[:10].upper() + '.csv'

    # setup training data format to test
    raw_data = generate_data(input_features, output_features, csv_filename)

    training_set_metadata = None

    if data_format == 'hdf5':
        # hdf5 format
        training_set, _, _, training_set_metadata = preprocess_for_training(
            config,
            dataset=raw_data
        )
        dataset_to_use = training_set.data_hdf5_fp
    else:
        dataset_to_use = create_data_set_to_use(data_format, raw_data)

    # define Ludwig model
    model = LudwigModel(config=config)
    model.train(
        dataset=dataset_to_use,
        training_set_metadata=training_set_metadata,
        random_seed=default_random_seed
    )

    # run functions with the specified data format
    model.evaluate(dataset=dataset_to_use)
    model.predict(dataset=dataset_to_use)

    # Delete the temporary data created
    delete_temporary_data(csv_filename)
def test_confidence_thresholding_2thresholds_3d_vis_api(csv_filename):
    """Ensure pdf and png figures can be saved via visualization API call.

    :param csv_filename: csv fixture from tests.fixtures.filenames.csv_filename
    :return: None
    """
    input_features = [
        text_feature(vocab_size=10, min_len=1, encoder='stacked_cnn'),
        numerical_feature(),
        category_feature(vocab_size=10, embedding_size=5),
        set_feature(),
        sequence_feature(vocab_size=10, max_len=10, encoder='embed')
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
        category_feature(vocab_size=2, reduce_input='sum')
    ]
    encoder = 'parallel_cnn'

    # Generate test data
    data_csv = generate_data(input_features, output_features, csv_filename)
    input_features[0]['encoder'] = encoder
    model = run_api_experiment(input_features, output_features)
    test_df, train_df, val_df = obtain_df_splits(data_csv)
    _, _, output_dir = model.train(
        training_set=train_df,
        validation_set=val_df
    )
    test_stats, predictions, _ = model.evaluate(
        dataset=test_df,
        collect_predictions=True,
        output_directory=output_dir
    )

    output_feature_name1 = output_features[0]['name']
    output_feature_name2 = output_features[1]['name']

    # probabilities need to be list of lists containing each row data from the
    # probability columns
    # ref: https://ludwig-ai.github.io/ludwig-docs/api/#test - Return
    probability1 = predictions.iloc[:, [2, 3, 4]].values
    probability2 = predictions.iloc[:, [7, 8, 9]].values

    ground_truth_metadata = model.training_set_metadata
    target_predictions1 = test_df[output_feature_name1]
    target_predictions2 = test_df[output_feature_name2]
    ground_truth1 = np.asarray([
        ground_truth_metadata[output_feature_name1]['str2idx'][prediction]
        for prediction in target_predictions1
    ])
    ground_truth2 = np.asarray([
        ground_truth_metadata[output_feature_name2]['str2idx'][prediction]
        for prediction in target_predictions2
    ])

    viz_outputs = ('pdf', 'png')
    for viz_output in viz_outputs:
        vis_output_pattern_pdf = os.path.join(
            output_dir, '*.{}'.format(viz_output)
        )
        visualize.confidence_thresholding_2thresholds_3d(
            [probability1, probability2],
            [ground_truth1, ground_truth2],
            [output_feature_name1, output_feature_name2],
            labels_limit=0,
            output_directory=output_dir,
            file_format=viz_output
        )
        figure_cnt = glob.glob(vis_output_pattern_pdf)
        assert 1 == len(figure_cnt)

    shutil.rmtree(output_dir, ignore_errors=True)
def _run_test(input_features=None, output_features=None, combiner=None):
    with tempfile.TemporaryDirectory() as tmpdir:
        input_features = input_features or [
            sequence_feature(reduce_output="sum"),
            numerical_feature(),
        ]
        output_features = output_features or [
            category_feature(vocab_size=2, reduce_input="sum")
        ]
        combiner = combiner or {"type": "concat"}

        csv_filename = os.path.join(tmpdir, "training.csv")
        data_csv = generate_data(input_features, output_features, csv_filename)

        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": combiner,
            "training": {"epochs": 2},
        }

        model = LudwigModel(config, backend=LocalTestBackend())
        _, _, output_directory = model.train(
            dataset=data_csv,
            output_directory=tmpdir,
        )
        model.predict(dataset=data_csv, output_directory=output_directory)
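
# Usage note: test_custom_encoder_decoder above passes explicit features into
# this harness; called with no arguments it falls back to the sequence/numerical
# defaults. A small sketch (the fc_size value below is illustrative):
def test_run_test_defaults():
    # Exercise the harness defaults, then an alternative combiner.
    _run_test()
    _run_test(combiner={"type": "concat", "fc_size": 32})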
def generate_output_features_with_dependencies(main_feature, dependencies):
    # helper function to generate multiple output features specifications
    # with dependencies, support for 'test_experiment_multiple_seq_seq' unit test
    # Parameters:
    #   main_feature: feature identifier, valid values 'feat1', 'feat2', 'feat3'
    #   dependencies: list of dependencies for 'main_feature'; do not
    #     include 'main_feature' in this list
    # Example:
    #   generate_output_features_with_dependencies('feat2', ['feat1', 'feat3'])

    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
        sequence_feature(vocab_size=10, max_len=5),
        numerical_feature()
    ]

    # value portion of dictionary is a tuple: (position, feature_name)
    #   position: location of output feature in the above output_features list
    #   feature_name: Ludwig generated feature name
    feature_names = {
        'feat1': (0, output_features[0]['name']),
        'feat2': (1, output_features[1]['name']),
        'feat3': (2, output_features[2]['name'])
    }

    # generate list of dependencies with real feature names
    generated_dependencies = [
        feature_names[feat_name][1] for feat_name in dependencies
    ]

    # specify dependencies for the main_feature
    output_features[feature_names[main_feature][0]]['dependencies'] = \
        generated_dependencies

    return output_features
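
# This helper is typically consumed through a pytest parametrization of
# test_experiment_multiple_seq_seq; a sketch under that assumption (the
# particular combinations shown are illustrative, and the ellipsis stands in
# for the test body defined elsewhere in this section):
import pytest


@pytest.mark.parametrize(
    "output_features",
    [
        generate_output_features_with_dependencies("feat3", ["feat1", "feat2"]),
        generate_output_features_with_dependencies("feat2", ["feat1", "feat3"]),
    ],
)
def test_experiment_multiple_seq_seq(csv_filename, output_features):
    ...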
def test_merge_with_defaults_early_stop(use_train, use_hyperopt_scheduler):
    all_input_features = [
        binary_feature(),
        category_feature(),
        numerical_feature(),
        text_feature(),
    ]
    all_output_features = [
        category_feature(),
        sequence_feature(),
        vector_feature(),
    ]

    # validate config with all features
    config = {
        "input_features": all_input_features,
        "output_features": all_output_features,
        HYPEROPT: HYPEROPT_CONFIG,
    }
    # deep copy so the mutations below don't leak into the shared
    # HYPEROPT_CONFIG across parametrized runs
    config = copy.deepcopy(config)

    if use_train:
        config[TRAINING] = {"batch_size": "42"}

    if use_hyperopt_scheduler:
        # hyperopt scheduler cannot be used with early stopping
        config[HYPEROPT]["sampler"]["scheduler"] = SCHEDULER

    merged_config = merge_with_defaults(config)

    expected = -1 if use_hyperopt_scheduler else default_early_stop
    assert merged_config[TRAINING]["early_stop"] == expected
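
# HYPEROPT_CONFIG and SCHEDULER are module-level constants that the test above
# mutates. Plausible definitions consistent with its access patterns (these are
# assumptions, not the verified originals); note the test indexes
# config[HYPEROPT]["sampler"]["scheduler"], so HYPEROPT_CONFIG must at least
# carry a "sampler" dict:
SCHEDULER = {"type": "async_hyperband", "time_attr": "time_total_s"}

HYPEROPT_CONFIG = {
    "parameters": {
        "training.learning_rate": {"space": "loguniform", "lower": 0.001, "upper": 0.1},
    },
    "goal": "minimize",
    "sampler": {"type": "ray", "num_samples": 2},
    "executor": {"type": "ray"},
}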
def test_validate_with_preprocessing_defaults():
    config = {
        "input_features": [
            audio_feature("/tmp/destination_folder", preprocessing=AudioFeatureMixin.preprocessing_defaults),
            bag_feature(preprocessing=BagFeatureMixin.preprocessing_defaults),
            binary_feature(preprocessing=BinaryFeatureMixin.preprocessing_defaults),
            category_feature(preprocessing=CategoryFeatureMixin.preprocessing_defaults),
            date_feature(preprocessing=DateFeatureMixin.preprocessing_defaults),
            h3_feature(preprocessing=H3FeatureMixin.preprocessing_defaults),
            image_feature("/tmp/destination_folder", preprocessing=ImageFeatureMixin.preprocessing_defaults),
            numerical_feature(preprocessing=NumericalFeatureMixin.preprocessing_defaults),
            sequence_feature(preprocessing=SequenceFeatureMixin.preprocessing_defaults),
            set_feature(preprocessing=SetFeatureMixin.preprocessing_defaults),
            text_feature(preprocessing=TextFeatureMixin.preprocessing_defaults),
            timeseries_feature(preprocessing=TimeseriesFeatureMixin.preprocessing_defaults),
            vector_feature(preprocessing=VectorFeatureMixin.preprocessing_defaults),
        ],
        "output_features": [{"name": "target", "type": "category"}],
        "training": {
            "decay": True,
            "learning_rate": 0.001,
            "validation_field": "target",
            "validation_metric": "accuracy",
        },
    }

    validate_config(config)
    config = merge_with_defaults(config)
    validate_config(config)
def test_experiment_image_inputs(csv_filename):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder='resnet',
            preprocessing={
                'in_memory': True,
                'height': 12,
                'width': 12,
                'num_channels': 3,
                'num_processes': 5
            },
            fc_size=16,
            num_filters=8
        ),
        text_feature(encoder='embed', min_len=1),
        numerical_feature(normalization='zscore')
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
        numerical_feature()
    ]

    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)

    # Stacked CNN encoder
    input_features[0]['encoder'] = 'stacked_cnn'
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)

    # Stacked CNN encoder, in_memory = False
    input_features[0]['preprocessing']['in_memory'] = False
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(
        input_features,
        output_features,
        data_csv=rel_path,
        skip_save_processed_input=False,
    )

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
def test_ray_tabular():
    input_features = [
        sequence_feature(reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum"),
        numerical_feature(normalization="zscore"),
        set_feature(),
        binary_feature(),
        bag_feature(),
        vector_feature(),
        h3_feature(),
        date_feature(),
    ]
    output_features = [
        binary_feature(),
        numerical_feature(normalization="zscore"),
    ]
    run_test_parquet(input_features, output_features)
def test_multiple_dependencies(reduce_dependencies, hidden_shape,
                               dependent_hidden_shape, dependent_hidden_shape2):
    # setup at least for a single dependency
    hidden_layer = tf.random.normal(hidden_shape, dtype=tf.float32)
    other_hidden_layer = tf.random.normal(dependent_hidden_shape,
                                          dtype=tf.float32)
    other_dependencies = {
        'feature_name': other_hidden_layer,
    }

    # setup dummy output feature to be root of dependency list
    num_feature_defn = numerical_feature()
    num_feature_defn['loss'] = {'type': 'mean_squared_error'}
    num_feature_defn['dependencies'] = ['feature_name']
    if len(dependent_hidden_shape) > 2:
        num_feature_defn['reduce_dependencies'] = reduce_dependencies

    # Based on specification calculate expected resulting hidden size
    # with one dependency
    if reduce_dependencies == 'concat' and len(hidden_shape) == 2 and \
            len(dependent_hidden_shape) == 3:
        expected_hidden_size = HIDDEN_SIZE + OTHER_HIDDEN_SIZE * SEQ_SIZE
    else:
        expected_hidden_size = HIDDEN_SIZE + OTHER_HIDDEN_SIZE

    # if multiple dependencies specified, setup second dependent feature
    if dependent_hidden_shape2:
        other_hidden_layer2 = tf.random.normal(dependent_hidden_shape2,
                                               dtype=tf.float32)
        other_dependencies['feature_name2'] = other_hidden_layer2
        num_feature_defn['dependencies'].append('feature_name2')
        if len(dependent_hidden_shape2) > 2:
            num_feature_defn['reduce_dependencies'] = reduce_dependencies

        # Based on specification calculate marginal increase in resulting
        # hidden size with two dependencies
        if reduce_dependencies == 'concat' and len(hidden_shape) == 2 and \
                len(dependent_hidden_shape2) == 3:
            expected_hidden_size += dependent_hidden_shape2[-1] * SEQ_SIZE
        else:
            expected_hidden_size += dependent_hidden_shape2[-1]

    # test dependency concatenation
    out_feature = NumericalOutputFeature(num_feature_defn)
    results = out_feature.concat_dependencies(hidden_layer, other_dependencies)

    # confirm size of resulting concat_dependencies() call
    if len(hidden_shape) > 2:
        assert results.shape.as_list() == \
               [BATCH_SIZE, SEQ_SIZE, expected_hidden_size]
    else:
        assert results.shape.as_list() == [BATCH_SIZE, expected_hidden_size]

    del out_feature
def test_ray_tabular():
    input_features = [
        sequence_feature(reduce_output='sum'),
        numerical_feature(normalization='zscore'),
        set_feature(),
        binary_feature(),
        bag_feature(),
        vector_feature(),
        h3_feature(),
        date_feature(),
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
        binary_feature(),
        set_feature(max_len=3, vocab_size=5),
        numerical_feature(normalization='zscore'),
        vector_feature(),
    ]
    run_test_parquet(input_features, output_features)
def test_dask_split():
    input_features = [
        numerical_feature(normalization='zscore'),
        set_feature(),
        binary_feature(),
    ]
    output_features = [category_feature(vocab_size=2, reduce_input='sum')]
    run_test_parquet(
        input_features,
        output_features,
        run_fn=run_split_api_experiment
    )
def _get_config(sampler, executor):
    input_features = [
        numerical_feature(),
        numerical_feature()
    ]
    output_features = [
        binary_feature()
    ]

    return {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "num_fc_layers": 2},
        "training": {"epochs": 2, "learning_rate": 0.001},
        "hyperopt": {
            **HYPEROPT_CONFIG,
            "executor": executor,
            "sampler": sampler,
        },
    }
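
# A hedged usage sketch: a hyperopt test would build its config by swapping in
# the sampler and executor under test. The specific sampler/executor dicts
# below are illustrative assumptions, not the verified test parameters.
def test_get_config_example():
    config = _get_config(
        sampler={"type": "random", "num_samples": 2},
        executor={"type": "serial"},
    )
    assert config["hyperopt"]["sampler"]["type"] == "random"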
def test_server_integration(csv_filename):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder='resnet',
            preprocessing={
                'in_memory': True,
                'height': 8,
                'width': 8,
                'num_channels': 3
            },
            fc_size=16,
            num_filters=8
        ),
        text_feature(encoder='embed', min_len=1),
        numerical_feature(normalization='zscore')
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
        numerical_feature()
    ]

    rel_path = generate_data(input_features, output_features, csv_filename)
    model = train_model(input_features, output_features, data_csv=rel_path)

    app = server(model)
    client = TestClient(app)

    response = client.post('/predict')
    assert response.json() == ALL_FEATURES_PRESENT_ERROR

    data_df = read_csv(rel_path)
    data, files = convert_to_form(data_df.T.to_dict()[0])
    response = client.post('/predict', data=data, files=files)

    response_keys = sorted(list(response.json().keys()))
    assert response_keys == sorted(output_keys_for(output_features))

    shutil.rmtree(model.exp_dir_name, ignore_errors=True)
    shutil.rmtree(image_dest_folder)
def test_multiple_dependencies(reduce_dependencies, hidden_shape,
                               dependent_hidden_shape, dependent_hidden_shape2):
    # setup at least for a single dependency
    hidden_layer = torch.randn(hidden_shape, dtype=torch.float32)
    other_hidden_layer = torch.randn(dependent_hidden_shape, dtype=torch.float32)
    other_dependencies = {
        "feature_name": other_hidden_layer,
    }

    # setup dummy output feature to be root of dependency list
    num_feature_defn = numerical_feature()
    num_feature_defn["loss"] = {"type": "mean_squared_error"}
    num_feature_defn["dependencies"] = ["feature_name"]
    if len(dependent_hidden_shape) > 2:
        num_feature_defn["reduce_dependencies"] = reduce_dependencies

    # Based on specification calculate expected resulting hidden size
    # with one dependency
    if reduce_dependencies == "concat" and len(hidden_shape) == 2 and len(
            dependent_hidden_shape) == 3:
        expected_hidden_size = HIDDEN_SIZE + OTHER_HIDDEN_SIZE * SEQ_SIZE
    else:
        expected_hidden_size = HIDDEN_SIZE + OTHER_HIDDEN_SIZE

    # if multiple dependencies specified, setup second dependent feature
    if dependent_hidden_shape2:
        other_hidden_layer2 = torch.randn(dependent_hidden_shape2, dtype=torch.float32)
        other_dependencies["feature_name2"] = other_hidden_layer2
        num_feature_defn["dependencies"].append("feature_name2")
        if len(dependent_hidden_shape2) > 2:
            num_feature_defn["reduce_dependencies"] = reduce_dependencies

        # Based on specification calculate marginal increase in resulting
        # hidden size with two dependencies
        if reduce_dependencies == "concat" and len(hidden_shape) == 2 and len(
                dependent_hidden_shape2) == 3:
            expected_hidden_size += dependent_hidden_shape2[-1] * SEQ_SIZE
        else:
            expected_hidden_size += dependent_hidden_shape2[-1]

    # test dependency concatenation
    num_feature_defn["input_size"] = expected_hidden_size
    out_feature = NumericalOutputFeature(num_feature_defn)
    results = out_feature.concat_dependencies(hidden_layer, other_dependencies)

    # confirm size of resulting concat_dependencies() call
    if len(hidden_shape) > 2:
        assert results.shape == (BATCH_SIZE, SEQ_SIZE, expected_hidden_size)
    else:
        assert results.shape == (BATCH_SIZE, expected_hidden_size)

    del out_feature
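
# Both variants of test_multiple_dependencies lean on module-level size
# constants and a parametrized grid of shapes. One plausible setup follows;
# all values and shape combinations are assumptions, and the ellipsis stands
# in for the test body above.
import pytest

BATCH_SIZE = 16
SEQ_SIZE = 12
HIDDEN_SIZE = 128
OTHER_HIDDEN_SIZE = 32


@pytest.mark.parametrize("reduce_dependencies", ["concat", "sum"])
@pytest.mark.parametrize(
    "hidden_shape, dependent_hidden_shape, dependent_hidden_shape2",
    [
        # 2D root with a 2D dependency, no second dependency
        ((BATCH_SIZE, HIDDEN_SIZE), (BATCH_SIZE, OTHER_HIDDEN_SIZE), None),
        # 2D root with a 3D (sequence) dependency plus a 2D second dependency
        ((BATCH_SIZE, HIDDEN_SIZE), (BATCH_SIZE, SEQ_SIZE, OTHER_HIDDEN_SIZE), (BATCH_SIZE, OTHER_HIDDEN_SIZE)),
    ],
)
def test_multiple_dependencies(reduce_dependencies, hidden_shape, dependent_hidden_shape, dependent_hidden_shape2):
    ...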
def test_validation_metrics(test_case: TestCase, csv_filename: str):
    # setup test scenarios
    test_scenarios = []
    for output_feature in test_case.output_features:
        # single output feature capture feature specific metrics
        of_name = output_feature[NAME]
        for metric in test_case.validation_metrics:
            test_scenarios.append((of_name, metric))

            if len(test_case.output_features) == 1:
                # it should work when there's only one output feature
                # and the metric applies to the output feature type;
                # the output feature name should replace 'combined'
                # and a warning should be printed about the substitution
                test_scenarios.append(('combined', metric))

    # add standard test for combined
    test_scenarios.append(('combined', 'loss'))

    # setup features for the test
    input_features = [numerical_feature(), category_feature(), binary_feature()]
    output_features = test_case.output_features

    # generate training data
    training_data = generate_data(
        input_features, output_features, filename=csv_filename
    )

    # loop through scenarios
    for validation_field, validation_metric in test_scenarios:
        # setup config
        config = {
            'input_features': input_features,
            'output_features': output_features,
            'training': {
                'epochs': 3,
                'validation_field': validation_field,
                'validation_metric': validation_metric
            }
        }

        model = LudwigModel(config)
        model.train(
            dataset=training_data,
            skip_save_training_description=True,
            skip_save_training_statistics=True,
            skip_save_log=True,
            skip_save_model=True,
            skip_save_processed_input=True,
            skip_save_progress=True
        )
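
# TestCase is the parametrized container feeding the test above. A minimal
# sketch consistent with the attribute access it performs (an assumption, not
# the verified original), plus an illustrative scenario:
from collections import namedtuple

TestCase = namedtuple("TestCase", ["output_features", "validation_metrics"])

# Example scenario: one binary output validated on accuracy, then loss.
example_case = TestCase(
    output_features=[binary_feature()],
    validation_metrics=["accuracy", "loss"],
)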
def test_experiment_multiple_seq_seq(csv_filename, output_features):
    input_features = [
        text_feature(vocab_size=100, min_len=1, encoder='stacked_cnn'),
        numerical_feature(normalization='zscore'),
        category_feature(vocab_size=10, embedding_size=5),
        set_feature(),
        sequence_feature(vocab_size=10, max_len=10, encoder='embed')
    ]

    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, dataset=rel_path)
def test_experiment_infer_image_metadata(csv_filename: str):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), "generated_images")

    # Stacked CNN encoder
    input_features = [
        image_feature(folder=image_dest_folder, encoder="stacked_cnn", fc_size=16, num_filters=8),
        text_feature(encoder="embed", min_len=1),
        numerical_feature(normalization="zscore"),
    ]
    output_features = [category_feature(vocab_size=2, reduce_input="sum"), numerical_feature()]

    rel_path = generate_data(input_features, output_features, csv_filename)

    # remove image preprocessing section to force inferring image meta data
    input_features[0].pop("preprocessing")

    run_experiment(input_features, output_features, dataset=rel_path)

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
def test_kfold_cv_api_from_file():
    # k-fold_cross_validate api with config_file
    num_folds = 3

    # setup temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:
        # setup required data structures for test
        training_data_fp = os.path.join(tmpdir, "train.csv")
        config_fp = os.path.join(tmpdir, "config.yaml")

        # generate synthetic data for the test
        input_features = [numerical_feature(normalization="zscore"), numerical_feature(normalization="zscore")]
        output_features = [category_feature(vocab_size=3, reduce_input="sum")]

        generate_data(input_features, output_features, training_data_fp)

        # generate config file
        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {"type": "concat", "fc_size": 14},
            "training": {"epochs": 2},
        }

        with open(config_fp, "w") as f:
            yaml.dump(config, f)

        # test kfold_cross_validate api with config file

        # execute k-fold cross validation run
        (kfold_cv_stats, kfold_split_indices) = kfold_cross_validate(3, config=config_fp, dataset=training_data_fp)

        # correct structure for results from kfold cv
        for key in ["fold_" + str(i + 1) for i in range(num_folds)] + ["overall"]:
            assert key in kfold_cv_stats

        for key in ["fold_" + str(i + 1) for i in range(num_folds)]:
            assert key in kfold_split_indices
def test_experiment_image_inputs(image_params: ImageParams, csv_filename: str):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), "generated_images")

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder="resnet",
            preprocessing={"in_memory": True, "height": 12, "width": 12, "num_channels": 3, "num_processes": 5},
            fc_size=16,
            num_filters=8,
        ),
        text_feature(encoder="embed", min_len=1),
        numerical_feature(normalization="zscore"),
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input="sum"),
        numerical_feature()
    ]

    input_features[0]["encoder"] = image_params.image_encoder
    input_features[0]["preprocessing"]["in_memory"] = image_params.in_memory_flag

    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(
        input_features,
        output_features,
        dataset=rel_path,
        skip_save_processed_input=image_params.skip_save_processed_input,
    )

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
def test_ray_split():
    input_features = [
        numerical_feature(normalization="zscore"),
        set_feature(),
        binary_feature(),
    ]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]
    run_test_parquet(
        input_features,
        output_features,
        run_fn=run_split_api_experiment,
        num_cpus=4,
    )
def test_kfold_cv_dataset_formats(data_format):
    # k-fold_cross_validate api with in-memory config
    num_folds = 3

    # setup temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:
        # setup required data structures for test
        training_data_fp = os.path.join(tmpdir, "train.csv")

        # generate synthetic data for the test
        input_features = [numerical_feature(normalization="zscore"), numerical_feature(normalization="zscore")]
        output_features = [numerical_feature()]

        generate_data(input_features, output_features, training_data_fp)
        dataset_to_use = create_data_set_to_use(data_format, training_data_fp)

        # generate config file
        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {"type": "concat", "fc_size": 14},
            "training": {"epochs": 2},
        }

        # test kfold_cross_validate api with config in-memory

        # execute k-fold cross validation run
        (kfold_cv_stats, kfold_split_indices) = kfold_cross_validate(3, config=config, dataset=dataset_to_use)

        # correct structure for results from kfold cv
        for key in ["fold_" + str(i + 1) for i in range(num_folds)] + ["overall"]:
            assert key in kfold_cv_stats

        for key in ["fold_" + str(i + 1) for i in range(num_folds)]:
            assert key in kfold_split_indices
def test_experiment_multiple_seq_seq(csv_filename):
    # Multiple inputs, multiple outputs
    input_features = [
        text_feature(vocab_size=100, min_len=1, encoder='stacked_cnn'),
        numerical_feature(),
        categorical_feature(vocab_size=10, embedding_size=5),
        set_feature(),
        sequence_feature(vocab_size=10, max_len=10, encoder='embed')
    ]
    output_features = [
        categorical_feature(vocab_size=2, reduce_input='sum'),
        sequence_feature(vocab_size=10, max_len=5),
        numerical_feature()
    ]
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)

    # Use generator as decoder
    output_features = [
        categorical_feature(vocab_size=2, reduce_input='sum'),
        sequence_feature(vocab_size=10, max_len=5, decoder='generator'),
        numerical_feature()
    ]
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)

    # Generator decoder and reduce_input = None
    output_features = [
        categorical_feature(vocab_size=2, reduce_input='sum'),
        sequence_feature(max_len=5, decoder='generator', reduce_input=None),
        numerical_feature()
    ]
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)