Example #1
def test_image_resizing_num_channel_handling(csv_filename):
    """
    This test creates two image datasets with 3 channels and 1 channel. The
    combination of this data is used to train a model. This checks the cases
    where the user may or may not specify the number of channels in the
    config.
    :param csv_filename:
    :return:
    """
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder='resnet',
            preprocessing={
                'in_memory': True,
                'height': 8,
                'width': 8,
                'num_channels': 3,
                'num_processes': 5
            },
            fc_size=8,
            num_filters=8
        ),
        text_feature(encoder='embed', min_len=1),
        numerical_feature(normalization='minmax')
    ]
    output_features = [binary_feature(), numerical_feature()]
    rel_path = generate_data(
        input_features, output_features, csv_filename, num_examples=50
    )

    df1 = read_csv(rel_path)

    input_features[0]['preprocessing']['num_channels'] = 1
    rel_path = generate_data(
        input_features, output_features, csv_filename, num_examples=50
    )
    df2 = read_csv(rel_path)

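    # combine the 3-channel and 1-channel datasets into a single training CSV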
    df = concatenate_df(df1, df2, None, LOCAL_BACKEND)
    df.to_csv(rel_path, index=False)

    # Here the user specifies the number of channels. No exception should be thrown
    run_experiment(input_features, output_features, dataset=rel_path)

    del input_features[0]['preprocessing']['num_channels']

    # User now doesn't specify num channels. Should throw exception
    with pytest.raises(ValueError):
        run_experiment(input_features, output_features, dataset=rel_path)

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
def test_custom_encoder_decoder():
    input_features = [
        sequence_feature(reduce_output="sum"),
        numerical_feature(encoder="custom_numerical_encoder"),
    ]
    output_features = [
        numerical_feature(decoder="custom_numerical_decoder"),
    ]
    _run_test(input_features=input_features, output_features=output_features)
def test_image_resizing_num_channel_handling(csv_filename):
    """This test creates two image datasets with 3 channels and 1 channel. The combination of this data is used to
    train a model. This checks the cases where the user may or may not specify a number of channels in the config.

    :param csv_filename:
    :return:
    """
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), "generated_images")

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder="resnet",
            preprocessing={
                "in_memory": True,
                "height": 8,
                "width": 8,
                "num_channels": 3,
                "num_processes": 5
            },
            fc_size=8,
            num_filters=8,
        ),
        text_feature(encoder="embed", min_len=1),
        numerical_feature(normalization="minmax"),
    ]
    output_features = [binary_feature(), numerical_feature()]
    rel_path = generate_data(input_features,
                             output_features,
                             csv_filename,
                             num_examples=50)

    df1 = read_csv(rel_path)

    input_features[0]["preprocessing"]["num_channels"] = 1
    rel_path = generate_data(input_features,
                             output_features,
                             csv_filename,
                             num_examples=50)
    df2 = read_csv(rel_path)

    df = concatenate_df(df1, df2, None, LOCAL_BACKEND)
    df.to_csv(rel_path, index=False)

    # Here the user specifies number of channels. Exception shouldn't be thrown
    run_experiment(input_features, output_features, dataset=rel_path)

    del input_features[0]["preprocessing"]["num_channels"]

    # User doesn't specify num channels, but num channels is inferred. Exception shouldn't be thrown
    run_experiment(input_features, output_features, dataset=rel_path)

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
Example #4
def test_kfold_cv_api_from_file():
    # k-fold_cross_validate api with config_file
    num_folds = 3

    # setup temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:

        # setup required data structures for test
        training_data_fp = os.path.join(tmpdir, 'train.csv')
        config_fp = os.path.join(tmpdir, 'config.yaml')

        # generate synthetic data for the test
        input_features = [
            numerical_feature(normalization='zscore'),
            numerical_feature(normalization='zscore')
        ]

        output_features = [
            category_feature(vocab_size=2, reduce_input='sum')
        ]

        generate_data(input_features, output_features, training_data_fp)

        # generate config file
        config = {
            'input_features': input_features,
            'output_features': output_features,
            'combiner': {'type': 'concat', 'fc_size': 14},
            'training': {'epochs': 2}
        }

        with open(config_fp, 'w') as f:
            yaml.dump(config, f)

        # test kfold_cross_validate api with config file

        # execute k-fold cross validation run
        (
            kfold_cv_stats,
            kfold_split_indices
        ) = kfold_cross_validate(
            3,
            config=config_fp,
            dataset=training_data_fp
        )

        # correct structure for results from kfold cv
        for key in ['fold_' + str(i + 1)
                    for i in range(num_folds)] + ['overall']:
            assert key in kfold_cv_stats

        for key in ['fold_' + str(i + 1) for i in range(num_folds)]:
            assert key in kfold_split_indices
Example #5
def test_config_features():
    all_input_features = [
        audio_feature('/tmp/destination_folder'),
        bag_feature(),
        binary_feature(),
        category_feature(),
        date_feature(),
        h3_feature(),
        image_feature('/tmp/destination_folder'),
        numerical_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        timeseries_feature(),
        vector_feature(),
    ]
    all_output_features = [
        binary_feature(),
        category_feature(),
        numerical_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        vector_feature(),
    ]

    # validate config with all features
    config = {
        'input_features': all_input_features,
        'output_features': all_output_features,
    }
    validate_config(config)

    # make sure all defaults provided also registers as valid
    config = merge_with_defaults(config)
    validate_config(config)

    # test various invalid output features
    input_only_features = [
        feature for feature in all_input_features
        if feature['type'] not in OUTPUT_FEATURE_TYPES
    ]
    for input_feature in input_only_features:
        config = {
            'input_features': all_input_features,
            'output_features': all_output_features + [input_feature],
        }

        dtype = input_feature['type']
        with pytest.raises(ValidationError,
                           match=rf"^'{dtype}' is not one of .*"):
            validate_config(config)
Example #6
def test_kfold_cv_dataset_formats(data_format):

    # k-fold_cross_validate api with in-memory model definition
    num_folds = 3

    # setup temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:

        # setup required data structures for test
        training_data_fp = os.path.join(tmpdir, 'train.csv')

        # generate synthetic data for the test
        input_features = [
            numerical_feature(normalization='zscore'),
            numerical_feature(normalization='zscore')
        ]

        output_features = [
            numerical_feature()
        ]

        generate_data(input_features, output_features, training_data_fp)
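        # convert the generated CSV into the dataset format under test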
        dataset_to_use = create_data_set_to_use(data_format, training_data_fp)

        # generate model definition file
        model_definition = {
            'input_features': input_features,
            'output_features': output_features,
            'combiner': {'type': 'concat', 'fc_size': 14},
            'training': {'epochs': 2}
        }

        # test kfold_cross_validate api with model definition in-memory

        # execute k-fold cross validation run
        (
            kfold_cv_stats,
            kfold_split_indices
        ) = kfold_cross_validate(
            3,
            model_definition=model_definition,
            dataset=dataset_to_use
        )

        # correct structure for results from kfold cv
        for key in ['fold_' + str(i + 1)
                    for i in range(num_folds)] + ['overall']:
            assert key in kfold_cv_stats

        for key in ['fold_' + str(i + 1) for i in range(num_folds)]:
            assert key in kfold_split_indices
Example #7
def test_hyperopt_run_hyperopt(csv_filename, ray_start_4_cpus, ray_mock_dir):
    input_features = [
        numerical_feature(), numerical_feature()
    ]
    output_features = [
        binary_feature()
    ]

    csv_filename = os.path.join(ray_mock_dir, 'dataset.csv')
    dataset_csv = generate_data(
        input_features, output_features, csv_filename, num_examples=100)
    dataset_parquet = create_data_set_to_use('parquet', dataset_csv)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "num_fc_layers": 2},
        "training": {"epochs": 4, "learning_rate": 0.001}
    }

    output_feature_name = output_features[0]['name']

    hyperopt_configs = {
        "parameters": {
            "training.learning_rate": {
                "space": "loguniform",
                "lower": 0.001,
                "upper": 0.1,
            },
            output_feature_name + ".fc_size": {
                "space": "randint",
                "lower": 32,
                "upper": 256
            },
            output_feature_name + ".num_fc_layers": {
                "space": "randint",
                "lower": 2,
                "upper": 6
            }
        },
        "goal": "minimize",
        'output_feature': output_feature_name,
        'validation_metrics': 'loss',
        'executor': {'type': 'ray'},
        'sampler': {'type': 'ray', 'num_samples': 2},
        'backend': {'type': 'ray', 'processor': {'parallelism': 4}}
    }

    # add hyperopt parameter space to the config
    config['hyperopt'] = hyperopt_configs
    run_hyperopt(config, dataset_parquet, ray_mock_dir)
Example #8
def test_server_integration(csv_filename):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    # Resnet encoder
    input_features = [
        image_feature(folder=image_dest_folder,
                      preprocessing={
                          'in_memory': True,
                          'height': 8,
                          'width': 8,
                          'num_channels': 3
                      },
                      fc_size=16,
                      num_filters=8),
        text_feature(encoder='embed', min_len=1),
        numerical_feature(normalization='zscore')
    ]
    output_features = [category_feature(vocab_size=2), numerical_feature()]

    rel_path = generate_data(input_features, output_features, csv_filename)
    model, output_dir = train_model(input_features,
                                    output_features,
                                    data_csv=rel_path)

    app = server(model)
    client = TestClient(app)
    response = client.get('/')
    assert response.status_code == 200

    response = client.post('/predict')
    assert response.json() == ALL_FEATURES_PRESENT_ERROR

    data_df = read_csv(rel_path)
    first_entry = data_df.T.to_dict()[0]
    data, files = convert_to_form(first_entry)
    server_response = client.post('/predict', data=data, files=files)
    server_response = server_response.json()

    server_response_keys = sorted(list(server_response.keys()))
    assert server_response_keys == sorted(output_keys_for(output_features))

    model_output, _ = model.predict(dataset=[first_entry], data_format=dict)
    model_output = model_output.to_dict('records')[0]
    assert model_output == server_response

    shutil.rmtree(output_dir, ignore_errors=True)
    shutil.rmtree(image_dest_folder)
def test_experiment_dataset_formats(data_format):
    # primary focus of this test is to determine if exceptions are
    # raised for different data set formats and in_memory setting

    input_features = [numerical_feature(), category_feature()]
    output_features = [category_feature(), numerical_feature()]

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat',
            'fc_size': 14
        },
        'preprocessing': {},
        'training': {
            'epochs': 2
        }
    }

    # create temporary name for train and test data sets
    csv_filename = 'train_' + uuid.uuid4().hex[:10].upper() + '.csv'

    # setup training data format to test
    raw_data = generate_data(input_features, output_features, csv_filename)

    training_set_metadata = None

    if data_format == 'hdf5':
        # hdf5 format
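        # preprocess the raw CSV once and reuse the cached HDF5 file it produces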
        training_set, _, _, training_set_metadata = preprocess_for_training(
            config, dataset=raw_data)
        dataset_to_use = training_set.data_hdf5_fp
    else:
        dataset_to_use = create_data_set_to_use(data_format, raw_data)

    # define Ludwig model
    model = LudwigModel(config=config)
    model.train(dataset=dataset_to_use,
                training_set_metadata=training_set_metadata,
                random_seed=default_random_seed)

    # run functions with the specified data format
    model.evaluate(dataset=dataset_to_use)
    model.predict(dataset=dataset_to_use)

    # Delete the temporary data created
    delete_temporary_data(csv_filename)
Example #10
def test_confidence_thresholding_2thresholds_3d_vis_api(csv_filename):
    """Ensure pdf and png figures can be saved via visualization API call.

    :param csv_filename: csv fixture from tests.fixtures.filenames.csv_filename
    :return: None
    """
    input_features = [
        text_feature(vocab_size=10, min_len=1, encoder='stacked_cnn'),
        numerical_feature(),
        category_feature(vocab_size=10, embedding_size=5),
        set_feature(),
        sequence_feature(vocab_size=10, max_len=10, encoder='embed')
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
        category_feature(vocab_size=2, reduce_input='sum')
    ]
    encoder = 'parallel_cnn'
    # Generate test data
    data_csv = generate_data(input_features, output_features, csv_filename)
    input_features[0]['encoder'] = encoder
    model = run_api_experiment(input_features, output_features)
    test_df, train_df, val_df = obtain_df_splits(data_csv)
    _, _, output_dir = model.train(training_set=train_df,
                                   validation_set=val_df)
    test_stats, predictions, _ = model.evaluate(dataset=test_df,
                                                collect_predictions=True,
                                                output_directory=output_dir)

    output_feature_name1 = output_features[0]['name']
    output_feature_name2 = output_features[1]['name']
    # probabilities need to be list of lists containing each row data from the
    # probability columns ref: https://ludwig-ai.github.io/ludwig-docs/api/#test - Return
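    # columns [2, 3, 4] hold the per-class probabilities for the first output
    # feature, and columns [7, 8, 9] the probabilities for the second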
    probability1 = predictions.iloc[:, [2, 3, 4]].values
    probability2 = predictions.iloc[:, [7, 8, 9]].values

    ground_truth_metadata = model.training_set_metadata
    target_predictions1 = test_df[output_feature_name1]
    target_predictions2 = test_df[output_feature_name2]
    ground_truth1 = np.asarray([
        ground_truth_metadata[output_feature_name1]['str2idx'][prediction]
        for prediction in target_predictions1
    ])
    ground_truth2 = np.asarray([
        ground_truth_metadata[output_feature_name2]['str2idx'][prediction]
        for prediction in target_predictions2
    ])
    viz_outputs = ('pdf', 'png')
    for viz_output in viz_outputs:
        vis_output_pattern_pdf = os.path.join(output_dir,
                                              '*.{}'.format(viz_output))
        visualize.confidence_thresholding_2thresholds_3d(
            [probability1, probability2], [ground_truth1, ground_truth2],
            [output_feature_name1, output_feature_name2],
            labels_limit=0,
            output_directory=output_dir,
            file_format=viz_output)
        figure_cnt = glob.glob(vis_output_pattern_pdf)
        assert 1 == len(figure_cnt)
    shutil.rmtree(output_dir, ignore_errors=True)
def _run_test(input_features=None, output_features=None, combiner=None):
    with tempfile.TemporaryDirectory() as tmpdir:
        input_features = input_features or [
            sequence_feature(reduce_output="sum"),
            numerical_feature(),
        ]
        output_features = output_features or [category_feature(vocab_size=2, reduce_input="sum")]
        combiner = combiner or {"type": "concat"}

        csv_filename = os.path.join(tmpdir, "training.csv")
        data_csv = generate_data(input_features, output_features, csv_filename)

        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": combiner,
            "training": {"epochs": 2},
        }

        model = LudwigModel(config, backend=LocalTestBackend())
        _, _, output_directory = model.train(
            dataset=data_csv,
            output_directory=tmpdir,
        )
        model.predict(dataset=data_csv, output_directory=output_directory)
Example #12
def generate_output_features_with_dependencies(main_feature, dependencies):
    # helper function to generate multiple output features specifications
    # with dependencies, support for 'test_experiment_multiple_seq_seq' unit test
    # Parameters:
    # main_feature: feature identifier, valid values 'feat1', 'feat2', 'feat3'
    # dependencies: list of dependencies for 'main_feature'; do not list 'main_feature' itself
    # Example:
    #  generate_output_features_with_dependencies('feat2', ['feat1', 'feat3'])

    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
        sequence_feature(vocab_size=10, max_len=5),
        numerical_feature()
    ]

    # value portion of dictionary is a tuple: (position, feature_name)
    #   position: location of output feature in the above output_features list
    #   feature_name: Ludwig generated feature name
    feature_names = {
        'feat1': (0, output_features[0]['name']),
        'feat2': (1, output_features[1]['name']),
        'feat3': (2, output_features[2]['name'])
    }

    # generate list of dependencies with real feature names
    generated_dependencies = [
        feature_names[feat_name][1] for feat_name in dependencies
    ]

    # specify dependencies for the main_feature
    output_features[feature_names[main_feature][0]]['dependencies'] = \
        generated_dependencies

    return output_features
Example #13
def test_merge_with_defaults_early_stop(use_train, use_hyperopt_scheduler):
    all_input_features = [
        binary_feature(),
        category_feature(),
        numerical_feature(),
        text_feature(),
    ]
    all_output_features = [
        category_feature(),
        sequence_feature(),
        vector_feature(),
    ]

    # validate config with all features
    config = {
        "input_features": all_input_features,
        "output_features": all_output_features,
        HYPEROPT: HYPEROPT_CONFIG,
    }
    config = copy.deepcopy(config)

    if use_train:
        config[TRAINING] = {"batch_size": "42"}

    if use_hyperopt_scheduler:
        # hyperopt scheduler cannot be used with early stopping
        config[HYPEROPT]["sampler"]["scheduler"] = SCHEDULER

    merged_config = merge_with_defaults(config)

    expected = -1 if use_hyperopt_scheduler else default_early_stop
    assert merged_config[TRAINING]["early_stop"] == expected
Example #14
def test_validate_with_preprocessing_defaults():
    config = {
        "input_features": [
            audio_feature("/tmp/destination_folder", preprocessing=AudioFeatureMixin.preprocessing_defaults),
            bag_feature(preprocessing=BagFeatureMixin.preprocessing_defaults),
            binary_feature(preprocessing=BinaryFeatureMixin.preprocessing_defaults),
            category_feature(preprocessing=CategoryFeatureMixin.preprocessing_defaults),
            date_feature(preprocessing=DateFeatureMixin.preprocessing_defaults),
            h3_feature(preprocessing=H3FeatureMixin.preprocessing_defaults),
            image_feature("/tmp/destination_folder", preprocessing=ImageFeatureMixin.preprocessing_defaults),
            numerical_feature(preprocessing=NumericalFeatureMixin.preprocessing_defaults),
            sequence_feature(preprocessing=SequenceFeatureMixin.preprocessing_defaults),
            set_feature(preprocessing=SetFeatureMixin.preprocessing_defaults),
            text_feature(preprocessing=TextFeatureMixin.preprocessing_defaults),
            timeseries_feature(preprocessing=TimeseriesFeatureMixin.preprocessing_defaults),
            vector_feature(preprocessing=VectorFeatureMixin.preprocessing_defaults),
        ],
        "output_features": [{"name": "target", "type": "category"}],
        "training": {
            "decay": True,
            "learning_rate": 0.001,
            "validation_field": "target",
            "validation_metric": "accuracy",
        },
    }

    validate_config(config)
    config = merge_with_defaults(config)
    validate_config(config)
Example #15
def test_experiment_image_inputs(csv_filename):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder='resnet',
            preprocessing={
                'in_memory': True,
                'height': 12,
                'width': 12,
                'num_channels': 3,
                'num_processes': 5
            },
            fc_size=16,
            num_filters=8
        ),
        text_feature(encoder='embed', min_len=1),
        numerical_feature(normalization='zscore')
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
        numerical_feature()
    ]

    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)

    # Stacked CNN encoder
    input_features[0]['encoder'] = 'stacked_cnn'
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)

    # Stacked CNN encoder, in_memory = False
    input_features[0]['preprocessing']['in_memory'] = False
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(
        input_features,
        output_features,
        data_csv=rel_path,
        skip_save_processed_input=False,
    )

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
Example #16
def test_ray_tabular():
    input_features = [
        sequence_feature(reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum"),
        numerical_feature(normalization="zscore"),
        set_feature(),
        binary_feature(),
        bag_feature(),
        vector_feature(),
        h3_feature(),
        date_feature(),
    ]
    output_features = [
        binary_feature(),
        numerical_feature(normalization="zscore"),
    ]
    run_test_parquet(input_features, output_features)
Example #17
def test_multiple_dependencies(reduce_dependencies, hidden_shape,
                               dependent_hidden_shape,
                               dependent_hidden_shape2):

    # setup at least for a single dependency
    hidden_layer = tf.random.normal(hidden_shape, dtype=tf.float32)
    other_hidden_layer = tf.random.normal(dependent_hidden_shape,
                                          dtype=tf.float32)
    other_dependencies = {
        'feature_name': other_hidden_layer,
    }

    # setup dummy output feature to be root of dependency list
    num_feature_defn = numerical_feature()
    num_feature_defn['loss'] = {'type': 'mean_squared_error'}
    num_feature_defn['dependencies'] = ['feature_name']
    if len(dependent_hidden_shape) > 2:
        num_feature_defn['reduce_dependencies'] = reduce_dependencies

    # Based on the specification, calculate the expected resulting hidden
    # size with one dependency
    if reduce_dependencies == 'concat' and len(hidden_shape) == 2 and \
            len(dependent_hidden_shape) == 3:
        expected_hidden_size = HIDDEN_SIZE + OTHER_HIDDEN_SIZE * SEQ_SIZE
    else:
        expected_hidden_size = HIDDEN_SIZE + OTHER_HIDDEN_SIZE

    # if multiple dependencies are specified, set up a second dependent feature
    if dependent_hidden_shape2:
        other_hidden_layer2 = tf.random.normal(dependent_hidden_shape2,
                                               dtype=tf.float32)
        other_dependencies['feature_name2'] = other_hidden_layer2
        num_feature_defn['dependencies'].append('feature_name2')
        if len(dependent_hidden_shape2) > 2:
            num_feature_defn['reduce_dependencies'] = reduce_dependencies

        # Based on specification calculate marginal increase in resulting
        # hidden size with two dependencies
        if reduce_dependencies == 'concat' and len(hidden_shape) == 2 and \
                len(dependent_hidden_shape2) == 3:
            expected_hidden_size += dependent_hidden_shape2[-1] * SEQ_SIZE
        else:
            expected_hidden_size += dependent_hidden_shape2[-1]

    # test dependency concatenation
    out_feature = NumericalOutputFeature(num_feature_defn)
    results = out_feature.concat_dependencies(hidden_layer, other_dependencies)

    # confirm the size of the resulting concat_dependencies() output
    if len(hidden_shape) > 2:
        assert results.shape.as_list() == \
               [BATCH_SIZE, SEQ_SIZE, expected_hidden_size]
    else:
        assert results.shape.as_list() == [BATCH_SIZE, expected_hidden_size]

    del out_feature
Example #18
def test_ray_tabular():
    input_features = [
        sequence_feature(reduce_output='sum'),
        numerical_feature(normalization='zscore'),
        set_feature(),
        binary_feature(),
        bag_feature(),
        vector_feature(),
        h3_feature(),
        date_feature(),
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
        binary_feature(),
        set_feature(max_len=3, vocab_size=5),
        numerical_feature(normalization='zscore'),
        vector_feature(),
    ]
    run_test_parquet(input_features, output_features)
Example #19
def test_dask_split():
    input_features = [
        numerical_feature(normalization='zscore'),
        set_feature(),
        binary_feature(),
    ]
    output_features = [category_feature(vocab_size=2, reduce_input='sum')]
    run_test_parquet(input_features,
                     output_features,
                     run_fn=run_split_api_experiment)
Example #20
def _get_config(sampler, executor):
    input_features = [
        numerical_feature(), numerical_feature()
    ]
    output_features = [
        binary_feature()
    ]

    return {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "num_fc_layers": 2},
        "training": {"epochs": 2, "learning_rate": 0.001},
        "hyperopt": {
            **HYPEROPT_CONFIG,
            "executor": executor,
            "sampler": sampler,
        },
    }
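# Illustrative usage (not part of the original listing; the sampler and executor
# dicts here are assumptions that mirror the Ray settings used in Example #7):
#   config = _get_config(
#       sampler={"type": "ray", "num_samples": 2},
#       executor={"type": "ray"},
#   )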
Example #21
def test_server_integration(csv_filename):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder='resnet',
            preprocessing={
                'in_memory': True,
                'height': 8,
                'width': 8,
                'num_channels': 3
            },
            fc_size=16,
            num_filters=8
        ),
        text_feature(encoder='embed', min_len=1),
        numerical_feature(normalization='zscore')
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
        numerical_feature()
    ]

    rel_path = generate_data(input_features, output_features, csv_filename)
    model = train_model(input_features, output_features, data_csv=rel_path)

    app = server(model)
    client = TestClient(app)
    response = client.post('/predict')
    assert response.json() == ALL_FEATURES_PRESENT_ERROR

    data_df = read_csv(rel_path)
    data, files = convert_to_form(data_df.T.to_dict()[0])
    response = client.post('/predict', data=data, files=files)

    response_keys = sorted(list(response.json().keys()))
    assert response_keys == sorted(output_keys_for(output_features))

    shutil.rmtree(model.exp_dir_name, ignore_errors=True)
    shutil.rmtree(image_dest_folder)
Example #22
def test_multiple_dependencies(reduce_dependencies, hidden_shape,
                               dependent_hidden_shape,
                               dependent_hidden_shape2):
    # setup at least for a single dependency
    hidden_layer = torch.randn(hidden_shape, dtype=torch.float32)
    other_hidden_layer = torch.randn(dependent_hidden_shape,
                                     dtype=torch.float32)
    other_dependencies = {
        "feature_name": other_hidden_layer,
    }

    # setup dummy output feature to be root of dependency list
    num_feature_defn = numerical_feature()
    num_feature_defn["loss"] = {"type": "mean_squared_error"}
    num_feature_defn["dependencies"] = ["feature_name"]
    if len(dependent_hidden_shape) > 2:
        num_feature_defn["reduce_dependencies"] = reduce_dependencies

    # Based on the specification, calculate the expected resulting hidden
    # size with one dependency
    if reduce_dependencies == "concat" and len(hidden_shape) == 2 and len(
            dependent_hidden_shape) == 3:
        expected_hidden_size = HIDDEN_SIZE + OTHER_HIDDEN_SIZE * SEQ_SIZE
    else:
        expected_hidden_size = HIDDEN_SIZE + OTHER_HIDDEN_SIZE

    # if multiple dependencies are specified, set up a second dependent feature
    if dependent_hidden_shape2:
        other_hidden_layer2 = torch.randn(dependent_hidden_shape2,
                                          dtype=torch.float32)
        other_dependencies["feature_name2"] = other_hidden_layer2
        num_feature_defn["dependencies"].append("feature_name2")
        if len(dependent_hidden_shape2) > 2:
            num_feature_defn["reduce_dependencies"] = reduce_dependencies

        # Based on specification calculate marginal increase in resulting
        # hidden size with two dependencies
        if reduce_dependencies == "concat" and len(hidden_shape) == 2 and len(
                dependent_hidden_shape2) == 3:
            expected_hidden_size += dependent_hidden_shape2[-1] * SEQ_SIZE
        else:
            expected_hidden_size += dependent_hidden_shape2[-1]

    # test dependency concatenation
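    # the output feature's input_size must match the concatenated hidden size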
    num_feature_defn["input_size"] = expected_hidden_size
    out_feature = NumericalOutputFeature(num_feature_defn)
    results = out_feature.concat_dependencies(hidden_layer, other_dependencies)

    # confirm the size of the resulting concat_dependencies() output
    if len(hidden_shape) > 2:
        assert results.shape == (BATCH_SIZE, SEQ_SIZE, expected_hidden_size)
    else:
        assert results.shape == (BATCH_SIZE, expected_hidden_size)

    del out_feature
def test_validation_metrics(test_case: TestCase, csv_filename: str):
    # setup test scenarios
    test_scenarios = []
    for output_feature in test_case.output_features:
        # capture feature-specific metrics for each output feature
        of_name = output_feature[NAME]
        for metric in test_case.validation_metrics:
            test_scenarios.append((of_name, metric))
            if len(test_case.output_features) == 1:
                # this should work when there's only one output feature
                # and the metric applies to that output feature's type;
                # the output feature name should replace 'combined'
                # and a warning should be printed about the substitution
                test_scenarios.append(('combined', metric))

    # add standard test for combined
    test_scenarios.append(('combined', 'loss'))

    # setup features for the test
    input_features = [numerical_feature(),
                      category_feature(),
                      binary_feature()]
    output_features = test_case.output_features

    # generate training data
    training_data = generate_data(
        input_features,
        output_features,
        filename=csv_filename
    )

    # loop through scenarios
    for validation_field, validation_metric in test_scenarios:
        # setup config
        config = {
            'input_features': input_features,
            'output_features': output_features,
            'training': {
                'epochs': 3,
                'validation_field': validation_field,
                'validation_metric': validation_metric
            }
        }

        model = LudwigModel(config)
        model.train(
            dataset=training_data,
            skip_save_training_description=True,
            skip_save_training_statistics=True,
            skip_save_log=True,
            skip_save_model=True,
            skip_save_processed_input=True,
            skip_save_progress=True
        )
def test_experiment_multiple_seq_seq(csv_filename, output_features):
    input_features = [
        text_feature(vocab_size=100, min_len=1, encoder='stacked_cnn'),
        numerical_feature(normalization='zscore'),
        category_feature(vocab_size=10, embedding_size=5),
        set_feature(),
        sequence_feature(vocab_size=10, max_len=10, encoder='embed')
    ]
    output_features = output_features

    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, dataset=rel_path)
Example #25
def test_experiment_infer_image_metadata(csv_filename: str):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), "generated_images")

    # Resnet encoder
    input_features = [
        image_feature(folder=image_dest_folder, encoder="stacked_cnn", fc_size=16, num_filters=8),
        text_feature(encoder="embed", min_len=1),
        numerical_feature(normalization="zscore"),
    ]
    output_features = [category_feature(vocab_size=2, reduce_input="sum"), numerical_feature()]

    rel_path = generate_data(input_features, output_features, csv_filename)

    # remove image preprocessing section to force inferring image meta data
    input_features[0].pop("preprocessing")

    run_experiment(input_features, output_features, dataset=rel_path)

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
def test_kfold_cv_api_from_file():
    # k-fold_cross_validate api with config_file
    num_folds = 3

    # setup temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:

        # setup required data structures for test
        training_data_fp = os.path.join(tmpdir, "train.csv")
        config_fp = os.path.join(tmpdir, "config.yaml")

        # generate synthetic data for the test
        input_features = [numerical_feature(normalization="zscore"), numerical_feature(normalization="zscore")]

        output_features = [category_feature(vocab_size=3, reduce_input="sum")]

        generate_data(input_features, output_features, training_data_fp)

        # generate config file
        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {"type": "concat", "fc_size": 14},
            "training": {"epochs": 2},
        }

        with open(config_fp, "w") as f:
            yaml.dump(config, f)

        # test kfold_cross_validate api with config file

        # execute k-fold cross validation run
        (kfold_cv_stats, kfold_split_indices) = kfold_cross_validate(3, config=config_fp, dataset=training_data_fp)

        # correct structure for results from kfold cv
        for key in ["fold_" + str(i + 1) for i in range(num_folds)] + ["overall"]:
            assert key in kfold_cv_stats

        for key in ["fold_" + str(i + 1) for i in range(num_folds)]:
            assert key in kfold_split_indices
def test_experiment_image_inputs(image_params: ImageParams, csv_filename: str):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), "generated_images")

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder="resnet",
            preprocessing={
                "in_memory": True,
                "height": 12,
                "width": 12,
                "num_channels": 3,
                "num_processes": 5
            },
            fc_size=16,
            num_filters=8,
        ),
        text_feature(encoder="embed", min_len=1),
        numerical_feature(normalization="zscore"),
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input="sum"),
        numerical_feature()
    ]

    input_features[0]["encoder"] = image_params.image_encoder
    input_features[0]["preprocessing"]["in_memory"] = image_params.in_memory_flag
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(
        input_features,
        output_features,
        dataset=rel_path,
        skip_save_processed_input=image_params.skip_save_processed_input,
    )

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
Example #28
def test_ray_split():
    input_features = [
        numerical_feature(normalization="zscore"),
        set_feature(),
        binary_feature(),
    ]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]
    run_test_parquet(
        input_features,
        output_features,
        run_fn=run_split_api_experiment,
        num_cpus=4,
    )
def test_kfold_cv_dataset_formats(data_format):
    # k-fold_cross_validate api with in-memory config
    num_folds = 3

    # setup temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:

        # setup required data structures for test
        training_data_fp = os.path.join(tmpdir, "train.csv")

        # generate synthetic data for the test
        input_features = [numerical_feature(normalization="zscore"), numerical_feature(normalization="zscore")]

        output_features = [numerical_feature()]

        generate_data(input_features, output_features, training_data_fp)
        dataset_to_use = create_data_set_to_use(data_format, training_data_fp)

        # generate config file
        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {"type": "concat", "fc_size": 14},
            "training": {"epochs": 2},
        }

        # test kfold_cross_validate api with config in-memory

        # execute k-fold cross validation run
        (kfold_cv_stats, kfold_split_indices) = kfold_cross_validate(3, config=config, dataset=dataset_to_use)

        # correct structure for results from kfold cv
        for key in ["fold_" + str(i + 1) for i in range(num_folds)] + ["overall"]:
            assert key in kfold_cv_stats

        for key in ["fold_" + str(i + 1) for i in range(num_folds)]:
            assert key in kfold_split_indices
Example #30
def test_experiment_multiple_seq_seq(csv_filename):
    # Multiple inputs, Multiple outputs
    input_features = [
        text_feature(vocab_size=100, min_len=1, encoder='stacked_cnn'),
        numerical_feature(),
        categorical_feature(vocab_size=10, embedding_size=5),
        set_feature(),
        sequence_feature(vocab_size=10, max_len=10, encoder='embed')
    ]
    output_features = [
        categorical_feature(vocab_size=2, reduce_input='sum'),
        sequence_feature(vocab_size=10, max_len=5),
        numerical_feature()
    ]

    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)

    # Use generator as decoder
    output_features = [
        categorical_feature(vocab_size=2, reduce_input='sum'),
        sequence_feature(vocab_size=10, max_len=5, decoder='generator'),
        numerical_feature()
    ]

    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)

    # Generator decoder and reduce_input = None
    output_features = [
        categorical_feature(vocab_size=2, reduce_input='sum'),
        sequence_feature(max_len=5, decoder='generator', reduce_input=None),
        numerical_feature()
    ]
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)