Example #1
def test_missing_values_drop_rows(csv_filename, tmpdir):
    data_csv_path = os.path.join(tmpdir, csv_filename)

    kwargs = {PREPROCESSING: {"missing_value_strategy": DROP_ROW}}
    input_features = [
        number_feature(),
        binary_feature(),
        category_feature(vocab_size=3),
    ]
    output_features = [
        binary_feature(**kwargs),
        number_feature(**kwargs),
        category_feature(vocab_size=3, **kwargs),
        sequence_feature(vocab_size=3, **kwargs),
        text_feature(vocab_size=3, **kwargs),
        set_feature(vocab_size=3, **kwargs),
        vector_feature(),
    ]
    backend = LocalTestBackend()
    config = {"input_features": input_features, "output_features": output_features, TRAINER: {"epochs": 2}}

    training_data_csv_path = generate_data(input_features, output_features, data_csv_path)
    df = read_csv_with_nan(training_data_csv_path, nan_percent=0.1)

    # run preprocessing
    ludwig_model = LudwigModel(config, backend=backend)
    ludwig_model.preprocess(dataset=df)
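Note: read_csv_with_nan is a test helper, not a pandas function. Below is a minimal sketch of what it plausibly does, mirroring the inline NaN-injection logic in Example #25 further down; the exact signature is an assumption.

import random

import numpy as np
import pandas as pd


def read_csv_with_nan(csv_path, nan_percent=0.0):
    """Hypothetical sketch: read a CSV and set roughly nan_percent of all cells to NaN."""
    df = pd.read_csv(csv_path)
    # enumerate every (row, column) cell position
    ix = [(row, col) for row in range(df.shape[0]) for col in range(df.shape[1])]
    # overwrite a random sample of cells with NaN
    for row, col in random.sample(ix, int(round(nan_percent * len(ix)))):
        df.iat[row, col] = np.nan
    return df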
Example #2
def test_torchscript_e2e_tabnet_combiner(csv_filename, tmpdir):
    data_csv_path = os.path.join(tmpdir, csv_filename)
    # Configure features to be tested:
    input_features = [
        binary_feature(),
        number_feature(),
        category_feature(vocab_size=3),
        bag_feature(vocab_size=3),
        set_feature(vocab_size=3),
    ]
    output_features = [
        binary_feature(),
        number_feature(),
        category_feature(vocab_size=3),
    ]
    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        COMBINER: {
            "type": "tabnet",
            "num_total_blocks": 2,
            "num_shared_blocks": 2,
        },
        TRAINER: {
            "epochs": 2
        },
    }

    # Generate training data
    training_data_csv_path = generate_data(input_features, output_features,
                                           data_csv_path)

    validate_torchscript_outputs(tmpdir, config, backend,
                                 training_data_csv_path)
Example #3
def test_experiment_infer_image_metadata(tmpdir):
    # Image Inputs
    image_dest_folder = os.path.join(tmpdir, "generated_images")

    # Stacked CNN encoder
    input_features = [
        image_feature(folder=image_dest_folder,
                      encoder="stacked_cnn",
                      output_size=16,
                      num_filters=8),
        text_feature(encoder="embed", min_len=1),
        number_feature(normalization="zscore"),
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input="sum"),
        number_feature()
    ]

    rel_path = generate_data(input_features, output_features,
                             os.path.join(tmpdir, "dataset.csv"))

    # remove the image preprocessing section to force inferring image metadata
    input_features[0].pop("preprocessing")

    run_experiment(input_features, output_features, dataset=rel_path)
Example #4
def test_custom_encoder_decoder():
    input_features = [
        sequence_feature(reduce_output="sum"),
        number_feature(encoder="custom_number_encoder"),
    ]
    output_features = [
        number_feature(decoder="custom_number_decoder"),
    ]
    _run_test(input_features=input_features, output_features=output_features)
Example #5
def test_image_resizing_num_channel_handling(tmpdir):
    """This test creates two image datasets with 3 channels and 1 channel. The combination of this data is used to
    train a model. This checks the cases where the user may or may not specify a number of channels in the config.

    :param csv_filename:
    :return:
    """
    # Image Inputs
    image_dest_folder = os.path.join(tmpdir, "generated_images")

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder="resnet",
            preprocessing={
                "in_memory": True,
                "height": 8,
                "width": 8,
                "num_channels": 3,
                "num_processes": 5
            },
            output_size=8,
            num_filters=8,
        ),
        text_feature(encoder="embed", min_len=1),
        number_feature(normalization="minmax"),
    ]
    output_features = [binary_feature(), number_feature()]
    rel_path = generate_data(input_features,
                             output_features,
                             os.path.join(tmpdir, "dataset1.csv"),
                             num_examples=50)

    df1 = read_csv(rel_path)

    input_features[0]["preprocessing"]["num_channels"] = 1
    rel_path = generate_data(input_features,
                             output_features,
                             os.path.join(tmpdir, "dataset2.csv"),
                             num_examples=50)
    df2 = read_csv(rel_path)

    df = concatenate_df(df1, df2, None, LOCAL_BACKEND)
    df.to_csv(rel_path, index=False)

    # Here the user specifies number of channels. Exception shouldn't be thrown
    run_experiment(input_features, output_features, dataset=rel_path)

    del input_features[0]["preprocessing"]["num_channels"]

    # User doesn't specify num channels, but num channels is inferred. Exception shouldn't be thrown
    run_experiment(input_features, output_features, dataset=rel_path)
Example #6
def test_torchscript_e2e_tabular(csv_filename, tmpdir):
    data_csv_path = os.path.join(tmpdir, csv_filename)
    # Configure features to be tested:
    bin_str_feature = binary_feature()
    transformed_number_features = [
        number_feature(preprocessing={"normalization": numeric_transformer})
        for numeric_transformer in numeric_transformation_registry.keys()
    ]
    input_features = [
        bin_str_feature,
        binary_feature(),
        *transformed_number_features,
        category_feature(vocab_size=3),
        bag_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature(),
        # TODO: future support
        # date_feature(),
        # h3_feature(),
    ]
    output_features = [
        bin_str_feature,
        binary_feature(),
        number_feature(),
        category_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature(),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
    ]
    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {
            "epochs": 2
        }
    }

    # Generate training data
    training_data_csv_path = generate_data(input_features, output_features,
                                           data_csv_path)

    # Convert bool values to strings, e.g., {'Yes', 'No'}
    df = pd.read_csv(training_data_csv_path)
    false_value, true_value = "No", "Yes"
    df[bin_str_feature[NAME]] = df[bin_str_feature[NAME]].map(
        lambda x: true_value if x else false_value)
    df.to_csv(training_data_csv_path)

    validate_torchscript_outputs(tmpdir, config, backend,
                                 training_data_csv_path)
Example #7
def test_kfold_cv_api_from_file():
    # kfold_cross_validate api with a config file
    num_folds = 3

    # set up temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:

        # set up required data structures for test
        training_data_fp = os.path.join(tmpdir, "train.csv")
        config_fp = os.path.join(tmpdir, "config.yaml")

        # generate synthetic data for the test
        input_features = [
            number_feature(normalization="zscore"),
            number_feature(normalization="zscore")
        ]

        output_features = [category_feature(vocab_size=3, reduce_input="sum")]

        generate_data(input_features, output_features, training_data_fp)

        # generate config file
        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {
                "type": "concat",
                "output_size": 14
            },
            TRAINER: {
                "epochs": 2
            },
        }

        with open(config_fp, "w") as f:
            yaml.dump(config, f)

        # test kfold_cross_validate api with a config file

        # execute the k-fold cross-validation run
        (kfold_cv_stats,
         kfold_split_indices) = kfold_cross_validate(num_folds,
                                                     config=config_fp,
                                                     dataset=training_data_fp)

        # verify the structure of the k-fold CV results
        for key in ["fold_" + str(i + 1)
                    for i in range(num_folds)] + ["overall"]:
            assert key in kfold_cv_stats

        for key in ["fold_" + str(i + 1) for i in range(num_folds)]:
            assert key in kfold_split_indices
Example #8
def test_config_features():
    all_input_features = [
        audio_feature("/tmp/destination_folder"),
        bag_feature(),
        binary_feature(),
        category_feature(),
        date_feature(),
        h3_feature(),
        image_feature("/tmp/destination_folder"),
        number_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        timeseries_feature(),
        vector_feature(),
    ]
    all_output_features = [
        binary_feature(),
        category_feature(),
        number_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        vector_feature(),
    ]

    # validate config with all features
    config = {
        "input_features": all_input_features,
        "output_features": all_output_features,
    }
    validate_config(config)

    # make sure the config merged with all defaults also validates
    config = merge_with_defaults(config)
    validate_config(config)

    # test various invalid output features
    input_only_features = [
        feature for feature in all_input_features
        if feature["type"] not in output_type_registry.keys()
    ]
    for input_feature in input_only_features:
        config = {
            "input_features": all_input_features,
            "output_features": all_output_features + [input_feature],
        }

        dtype = input_feature["type"]
        with pytest.raises(ValidationError,
                           match=rf"^'{dtype}' is not one of .*"):
            validate_config(config)
Example #9
def test_kfold_cv_dataset_formats(data_format):
    # kfold_cross_validate api with an in-memory config
    num_folds = 3

    # set up temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:

        # set up required data structures for test
        training_data_fp = os.path.join(tmpdir, "train.csv")

        # generate synthetic data for the test
        input_features = [
            number_feature(normalization="zscore"),
            number_feature(normalization="zscore")
        ]

        output_features = [number_feature()]

        generate_data(input_features, output_features, training_data_fp)
        dataset_to_use = create_data_set_to_use(data_format, training_data_fp)

        # generate config file
        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {
                "type": "concat",
                "output_size": 14
            },
            TRAINER: {
                "epochs": 2
            },
        }

        # test kfold_cross_validate api with an in-memory config

        # execute the k-fold cross-validation run
        (kfold_cv_stats,
         kfold_split_indices) = kfold_cross_validate(num_folds,
                                                     config=config,
                                                     dataset=dataset_to_use)

        # verify the structure of the k-fold CV results
        for key in ["fold_" + str(i + 1)
                    for i in range(num_folds)] + ["overall"]:
            assert key in kfold_cv_stats

        for key in ["fold_" + str(i + 1) for i in range(num_folds)]:
            assert key in kfold_split_indices
Example #10
def _get_config(sampler, executor):
    input_features = [number_feature(), number_feature()]
    output_features = [binary_feature()]

    return {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "num_fc_layers": 2},
        TRAINER: {"epochs": 2, "learning_rate": 0.001},
        "hyperopt": {
            **HYPEROPT_CONFIG,
            "executor": executor,
            "sampler": sampler,
        },
    }
Example #11
def test_merge_with_defaults_early_stop(use_train, use_hyperopt_scheduler):
    all_input_features = [
        binary_feature(),
        category_feature(),
        number_feature(),
        text_feature(),
    ]
    all_output_features = [
        category_feature(),
        sequence_feature(),
        vector_feature(),
    ]

    # build a config with all features
    config = {
        INPUT_FEATURES: all_input_features,
        OUTPUT_FEATURES: all_output_features,
        HYPEROPT: HYPEROPT_CONFIG,
    }
    config = copy.deepcopy(config)

    if use_train:
        config[TRAINER] = {"batch_size": 42}

    if use_hyperopt_scheduler:
        # hyperopt scheduler cannot be used with early stopping
        config[HYPEROPT][EXECUTOR][SCHEDULER] = SCHEDULER_DICT

    merged_config = merge_with_defaults(config)

    expected = -1 if use_hyperopt_scheduler else ECDTrainerConfig().early_stop
    assert merged_config[TRAINER]["early_stop"] == expected
Example #12
def test_merge_with_defaults_early_stop(use_train, use_hyperopt_scheduler):
    all_input_features = [
        binary_feature(),
        category_feature(),
        number_feature(),
        text_feature(),
    ]
    all_output_features = [
        category_feature(),
        sequence_feature(),
        vector_feature(),
    ]

    # build a config with all features
    config = {
        "input_features": all_input_features,
        "output_features": all_output_features,
        HYPEROPT: HYPEROPT_CONFIG,
    }
    config = copy.deepcopy(config)

    if use_train:
        config[TRAINER] = {"batch_size": "42"}

    if use_hyperopt_scheduler:
        # hyperopt scheduler cannot be used with early stopping
        config[HYPEROPT]["sampler"]["scheduler"] = SCHEDULER

    merged_config = merge_with_defaults(config)

    expected = -1 if use_hyperopt_scheduler else default_early_stop
    assert merged_config[TRAINER]["early_stop"] == expected
Example #13
def _run_test(input_features=None, output_features=None, combiner=None):
    with tempfile.TemporaryDirectory() as tmpdir:
        input_features = input_features or [
            sequence_feature(reduce_output="sum"),
            number_feature(),
        ]
        output_features = output_features or [
            category_feature(vocab_size=2, reduce_input="sum")
        ]
        combiner = combiner or {"type": "concat"}

        csv_filename = os.path.join(tmpdir, "training.csv")
        data_csv = generate_data(input_features, output_features, csv_filename)

        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": combiner,
            TRAINER: {
                "epochs": 2
            },
        }

        model = LudwigModel(config, backend=LocalTestBackend())
        _, _, output_directory = model.train(
            dataset=data_csv,
            output_directory=tmpdir,
        )
        model.predict(dataset=data_csv, output_directory=output_directory)
Example #14
def test_empty_split_error(backend, tmpdir):
    """Tests that an error is raised if one or more of the splits is empty after preprocessing."""
    data_csv_path = os.path.join(tmpdir, "data.csv")

    out_feat = binary_feature()
    input_features = [number_feature()]
    output_features = [out_feat]
    config = {
        "input_features": input_features,
        "output_features": output_features
    }

    training_data_csv_path = generate_data(input_features, output_features,
                                           data_csv_path)
    df = pd.read_csv(training_data_csv_path)

    # Convert all rows of the output feature to null. Because the default missing value strategy for output
    # features is to drop rows with missing values, the dataset will be empty after preprocessing.
    df[out_feat[COLUMN]] = None

    with init_backend(backend):
        ludwig_model = LudwigModel(config, backend=backend)
        with pytest.raises(ValueError,
                           match="Dataset is empty following preprocessing"):
            ludwig_model.preprocess(dataset=df)
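A minimal pure-pandas sketch of the failure mode this test exercises (the column names are hypothetical): dropping rows wherever the output feature is missing leaves nothing behind once the whole output column is null.

import pandas as pd

# With the default DROP_ROW strategy for output features, an all-null output
# column leaves zero rows after preprocessing.
df = pd.DataFrame({"in_feat": [1.0, 2.0, 3.0], "out_feat": [None, None, None]})
remaining = df.dropna(subset=["out_feat"])  # roughly what DROP_ROW amounts to
assert len(remaining) == 0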
Example #15
def test_config_trainer_bad_optimizer():
    config = {
        "input_features": [
            category_feature(vocab_size=2, reduce_input="sum"),
            number_feature(),
        ],
        "output_features": [binary_feature(weight_regularization=None)],
        "combiner": {
            "type": "tabnet",
        },
        TRAINER: {},
    }
    validate_config(config)

    # Test manually set-to-null optimizer vs unspecified:
    config[TRAINER]["optimizer"] = None
    with pytest.raises(ValidationError):
        validate_config(config)
    assert ECDTrainerConfig.Schema().load({}).optimizer is not None

    # Test that all types in optimizer_registry are supported:
    for key in optimizer_registry.keys():
        config[TRAINER]["optimizer"] = {"type": key}
        validate_config(config)

    # Test invalid optimizer type:
    config[TRAINER]["optimizer"] = {"type": 0}
    with pytest.raises(ValidationError):
        validate_config(config)
    config[TRAINER]["optimizer"] = {"type": {}}
    with pytest.raises(ValidationError):
        validate_config(config)
    config[TRAINER]["optimizer"] = {"type": "invalid"}
    with pytest.raises(ValidationError):
        validate_config(config)
Example #16
def test_number_feature_wrong_dtype(csv_filename, tmpdir):
    """Tests that a number feature with all string values is treated as having missing values by default."""
    data_csv_path = os.path.join(tmpdir, csv_filename)

    num_feat = number_feature()
    input_features = [num_feat]
    output_features = [binary_feature()]
    config = {
        "input_features": input_features,
        "output_features": output_features
    }

    training_data_csv_path = generate_data(input_features, output_features,
                                           data_csv_path)
    df = pd.read_csv(training_data_csv_path)

    # convert numbers to random strings
    def random_string():
        letters = string.ascii_lowercase
        return "".join(random.choice(letters) for _ in range(10))

    df[num_feat[COLUMN]] = df[num_feat[COLUMN]].apply(
        lambda _: random_string())

    # run preprocessing
    backend = LocalTestBackend()
    ludwig_model = LudwigModel(config, backend=backend)
    train_ds, val_ds, test_ds, _ = ludwig_model.preprocess(dataset=df)

    concatenated_df = concatenate_df(train_ds.to_df(), val_ds.to_df(),
                                     test_ds.to_df(), backend)

    # check that the invalid values were replaced with the missing value constant (0.0)
    assert len(concatenated_df) == len(df)
    assert np.all(concatenated_df[num_feat[PROC_COLUMN]] == 0.0)
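A minimal pure-pandas sketch of the behavior asserted above; the fill constant 0.0 comes from the test's own assertion, while the coercion step is an assumption about what the preprocessing does internally.

import pandas as pd

raw = pd.Series(["abcdefghij", "qwertyuiop", "zxcvbnmasd"])
coerced = pd.to_numeric(raw, errors="coerce")  # non-numeric strings become NaN
filled = coerced.fillna(0.0)                   # missing values filled with a constant
assert (filled == 0.0).all()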
Example #17
def test_multiple_dependencies(reduce_dependencies, hidden_shape,
                               dependent_hidden_shape,
                               dependent_hidden_shape2):
    # set up at least a single dependency
    hidden_layer = torch.randn(hidden_shape, dtype=torch.float32)
    other_hidden_layer = torch.randn(dependent_hidden_shape,
                                     dtype=torch.float32)
    other_dependencies = {
        "feature_name": other_hidden_layer,
    }

    # set up a dummy output feature to be the root of the dependency list
    num_feature_defn = number_feature()
    num_feature_defn["loss"] = {"type": "mean_squared_error"}
    num_feature_defn["dependencies"] = ["feature_name"]
    if len(dependent_hidden_shape) > 2:
        num_feature_defn["reduce_dependencies"] = reduce_dependencies

    # Based on the specification, calculate the expected hidden size with
    # one dependency
    if reduce_dependencies == "concat" and len(hidden_shape) == 2 and len(
            dependent_hidden_shape) == 3:
        expected_hidden_size = HIDDEN_SIZE + OTHER_HIDDEN_SIZE * SEQ_SIZE
    else:
        expected_hidden_size = HIDDEN_SIZE + OTHER_HIDDEN_SIZE

    # if multiple dependencies are specified, set up the second dependent feature
    if dependent_hidden_shape2:
        other_hidden_layer2 = torch.randn(dependent_hidden_shape2,
                                          dtype=torch.float32)
        other_dependencies["feature_name2"] = other_hidden_layer2
        num_feature_defn["dependencies"].append("feature_name2")
        if len(dependent_hidden_shape2) > 2:
            num_feature_defn["reduce_dependencies"] = reduce_dependencies

        # Based on the specification, calculate the marginal increase in
        # hidden size with two dependencies
        if reduce_dependencies == "concat" and len(hidden_shape) == 2 and len(
                dependent_hidden_shape2) == 3:
            expected_hidden_size += dependent_hidden_shape2[-1] * SEQ_SIZE
        else:
            expected_hidden_size += dependent_hidden_shape2[-1]

    # Set up dependency reducers.
    dependency_reducers = torch.nn.ModuleDict()
    for feature_name in other_dependencies.keys():
        dependency_reducers[feature_name] = SequenceReducer(
            reduce_mode=reduce_dependencies)

    # test dependency concatenation
    num_feature_defn["input_size"] = expected_hidden_size
    results = output_feature_utils.concat_dependencies(
        "num_feature", num_feature_defn["dependencies"], dependency_reducers,
        hidden_layer, other_dependencies)

    # confirm the shape of the concat_dependencies() result
    if len(hidden_shape) > 2:
        assert results.shape == (BATCH_SIZE, SEQ_SIZE, expected_hidden_size)
    else:
        assert results.shape == (BATCH_SIZE, expected_hidden_size)
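The expected-size arithmetic in this test can be checked directly with plain torch.cat. A minimal sketch; the constants below are hypothetical stand-ins for the module-level BATCH_SIZE, SEQ_SIZE, HIDDEN_SIZE, and OTHER_HIDDEN_SIZE.

import torch

# Hypothetical stand-ins for the module-level constants used in the test.
BATCH_SIZE, SEQ_SIZE, HIDDEN_SIZE, OTHER_HIDDEN_SIZE = 16, 10, 32, 24

# "concat" reduction with a 2D hidden state and a 3D dependency: the sequence
# dimension is flattened into the feature dimension.
hidden = torch.randn(BATCH_SIZE, HIDDEN_SIZE)
dep = torch.randn(BATCH_SIZE, SEQ_SIZE, OTHER_HIDDEN_SIZE)
flat = dep.reshape(BATCH_SIZE, SEQ_SIZE * OTHER_HIDDEN_SIZE)
out = torch.cat([hidden, flat], dim=-1)
assert out.shape == (BATCH_SIZE, HIDDEN_SIZE + OTHER_HIDDEN_SIZE * SEQ_SIZE)

# Any reducing mode (e.g. "sum") collapses the sequence dimension first.
reduced = dep.sum(dim=1)
out = torch.cat([hidden, reduced], dim=-1)
assert out.shape == (BATCH_SIZE, HIDDEN_SIZE + OTHER_HIDDEN_SIZE)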
Example #18
def run_test_gbm_multiple_outputs(tmpdir, backend_config):
    """Test that an error is raised when the model is trained with multiple outputs."""
    input_features = [number_feature(), category_feature(reduce_output="sum")]
    output_features = [
        category_feature(vocab_size=3),
        binary_feature(),
        category_feature(vocab_size=3),
    ]

    csv_filename = os.path.join(tmpdir, "training.csv")
    dataset_filename = generate_data(input_features,
                                     output_features,
                                     csv_filename,
                                     num_examples=100)

    config = {
        MODEL_TYPE: "gbm",
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {
            "num_boost_round": 2
        },
    }

    model = LudwigModel(config, backend=backend_config)
    with pytest.raises(ValueError,
                       match="Only single task currently supported"):
        model.train(dataset=dataset_filename, output_directory=tmpdir)
Example #19
def test_model_weights_match_training(tmpdir, csv_filename):
    np.random.seed(1)

    input_features = [number_feature()]
    output_features = [number_feature()]
    output_feature_name = output_features[0][NAME]

    # Generate test data
    data_csv_path = generate_data(input_features,
                                  output_features,
                                  os.path.join(tmpdir, csv_filename),
                                  num_examples=100)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "trainer": {
            "epochs": 5,
            "batch_size": 32
        },
    }

    model = LudwigModel(config=config)

    training_stats, _, _ = model.train(training_set=data_csv_path,
                                       random_seed=1919)

    # generate predictions from the training data
    df = pd.read_csv(data_csv_path)
    predictions = model.predict(df)

    # compute loss on predictions from training data
    loss_function = MSELoss()
    loss = loss_function(
        torch.tensor(predictions[0][output_feature_name +
                                    "_predictions"].values),  # predictions
        torch.tensor(df[output_feature_name].values),  # target
    ).type(torch.float32)

    # get last loss value from training
    last_training_loss = torch.tensor(
        training_stats[TRAINING][output_feature_name][LOSS][-1])

    # loss from predictions should match last loss value recorded during training
    assert torch.isclose(loss, last_training_loss), (
        "Model predictions on training set did not generate same loss value as in training. "
        "Need to confirm that weights were correctly captured in model.")
Example #20
def run_test_gbm_number(tmpdir, backend_config):
    """Test that the GBM model can train and predict a numerical output (regression)."""
    # Given a dataset with a single input feature and a single output feature,
    input_features = [number_feature(), category_feature(reduce_output="sum")]
    output_feature = number_feature()
    output_features = [output_feature]

    csv_filename = os.path.join(tmpdir, "training.csv")
    dataset_filename = generate_data(input_features,
                                     output_features,
                                     csv_filename,
                                     num_examples=100)

    config = {
        MODEL_TYPE: "gbm",
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {
            "num_boost_round": 2
        },
    }

    # When I train a model on the dataset, load the model from the output directory, and
    # predict on the dataset
    model = LudwigModel(config, backend=backend_config)

    model.train(
        dataset=dataset_filename,
        output_directory=tmpdir,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True,
        skip_save_log=True,
    )
    model = LudwigModel.load(os.path.join(tmpdir, "api_experiment_run", "model"))
    preds, _ = model.predict(
        dataset=dataset_filename,
        output_directory=os.path.join(tmpdir, "predictions"),
    )

    # Then the predictions should be included in the output
    pred_col = preds[output_feature["name"] + "_predictions"]
    if backend_config["type"] == "ray":
        pred_col = pred_col.compute()
    assert pred_col.dtype == float
Example #21
def test_train_gpu_load_cpu():
    input_features = [
        category_feature(vocab_size=2, reduce_input="sum"),
        number_feature(normalization="zscore"),
    ]
    output_features = [
        binary_feature(),
    ]
    run_test_with_features(input_features, output_features, run_fn=_run_train_gpu_load_cpu, num_gpus=1)
Example #22
def test_config_input_output_features():
    config = {
        "input_features": [
            category_feature(),
            number_feature(),
        ],
        "output_features": [binary_feature()],
    }

    validate_config(config)
Example #23
def test_experiment_dataset_formats(data_format, csv_filename):
    # the primary focus of this test is to determine whether exceptions are
    # raised for different dataset formats and the in_memory setting

    input_features = [number_feature(), category_feature()]
    output_features = [category_feature(), number_feature()]

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "output_size": 14
        },
        "preprocessing": {},
        TRAINER: {
            "epochs": 2
        },
    }

    # set up the training data format to test
    raw_data = generate_data(input_features, output_features, csv_filename)

    training_set_metadata = None

    if data_format == "hdf5":
        # hdf5 format
        training_set, _, _, training_set_metadata = preprocess_for_training(
            config, dataset=raw_data)
        dataset_to_use = training_set.data_hdf5_fp
    else:
        dataset_to_use = create_data_set_to_use(data_format, raw_data)

    # define Ludwig model
    model = LudwigModel(config=config)
    model.train(dataset=dataset_to_use,
                training_set_metadata=training_set_metadata,
                random_seed=default_random_seed)

    # run evaluate and predict with the specified data format
    model.evaluate(dataset=dataset_to_use)
    model.predict(dataset=dataset_to_use)
Example #24
def test_ray_tabular(df_engine):
    if df_engine == "modin" and sys.version_info < (3, 7):
        pytest.skip("Modin is not supported with Python 3.6 at this time")

    input_features = [
        sequence_feature(reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum"),
        number_feature(normalization="zscore"),
        set_feature(),
        binary_feature(),
        bag_feature(),
        vector_feature(),
        h3_feature(),
        date_feature(),
    ]
    output_features = [
        binary_feature(),
        number_feature(normalization="zscore"),
    ]
    run_test_parquet(input_features, output_features, df_engine=df_engine)
Example #25
def test_missing_values_drop_rows(csv_filename, tmpdir):
    data_csv_path = os.path.join(tmpdir, csv_filename)

    kwargs = {PREPROCESSING: {"missing_value_strategy": DROP_ROW}}
    input_features = [
        number_feature(),
        binary_feature(),
        category_feature(vocab_size=3),
    ]
    output_features = [
        binary_feature(**kwargs),
        number_feature(**kwargs),
        category_feature(vocab_size=3, **kwargs),
        sequence_feature(vocab_size=3, **kwargs),
        text_feature(vocab_size=3, **kwargs),
        set_feature(vocab_size=3, **kwargs),
        vector_feature(),
    ]
    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {
            "epochs": 2
        }
    }

    training_data_csv_path = generate_data(input_features, output_features,
                                           data_csv_path)
    df = pd.read_csv(training_data_csv_path)

    # set 10% of values to NaN
    nan_percent = 0.1
    ix = [(row, col) for row in range(df.shape[0])
          for col in range(df.shape[1])]
    for row, col in random.sample(ix, int(round(nan_percent * len(ix)))):
        df.iat[row, col] = np.nan

    # run preprocessing
    ludwig_model = LudwigModel(config, backend=backend)
    ludwig_model.preprocess(dataset=df)
Example #26
def test_experiment_multiple_seq_seq(csv_filename, output_features):
    input_features = [
        text_feature(vocab_size=100, min_len=1, encoder="stacked_cnn"),
        number_feature(normalization="zscore"),
        category_feature(vocab_size=10, embedding_size=5),
        set_feature(),
        sequence_feature(vocab_size=10, max_len=10, encoder="embed"),
    ]

    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, dataset=rel_path)
Example #27
def test_incorrect_output_features_config():

    config = {
        "input_features": [
            number_feature(),
        ],
        "output_features": [binary_feature(decoder="classifier")],
    }

    # Invalid decoder for binary output feature
    with pytest.raises(ValidationError):
        validate_config(config)
Example #28
def test_ray_split():
    input_features = [
        number_feature(normalization="zscore"),
        set_feature(),
        binary_feature(),
    ]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]
    run_test_with_features(
        input_features,
        output_features,
        run_fn=run_split_api_experiment,
        num_cpus=4,
    )
Example #29
def test_ray_tabular(df_engine):
    input_features = [
        sequence_feature(reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum"),
        number_feature(normalization="zscore"),
        set_feature(),
        binary_feature(),
        bag_feature(),
        vector_feature(),
        h3_feature(),
        date_feature(),
    ]
    output_features = [
        binary_feature(bool2str=["No", "Yes"]),
        binary_feature(),
        number_feature(normalization="zscore"),
    ]
    run_test_with_features(
        input_features,
        output_features,
        df_engine=df_engine,
    )
Example #30
def test_experiment_image_inputs(image_params: ImageParams, tmpdir):
    # Image Inputs
    image_dest_folder = os.path.join(tmpdir, "generated_images")

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder="resnet",
            preprocessing={
                "in_memory": True,
                "height": 12,
                "width": 12,
                "num_channels": 3,
                "num_processes": 5
            },
            output_size=16,
            num_filters=8,
        ),
        text_feature(encoder="embed", min_len=1),
        number_feature(normalization="zscore"),
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input="sum"),
        number_feature()
    ]

    input_features[0]["encoder"] = image_params.image_encoder
    input_features[0]["preprocessing"][
        "in_memory"] = image_params.in_memory_flag
    rel_path = generate_data(input_features, output_features,
                             os.path.join(tmpdir, "dataset.csv"))

    run_experiment(
        input_features,
        output_features,
        dataset=rel_path,
        skip_save_processed_input=image_params.skip_save_processed_input,
    )