Example #1
def predict_with_backend(tmpdir,
                         config,
                         data_csv_path,
                         backend,
                         patch_args=None):
    """Train a model, reload it from disk, and return (preds_df, model)."""
    with init_backend(backend):
        if backend == "ray":
            # Expand the "ray" alias into the full backend config and use Dask
            # as the distributed dataframe processor. (Note: this mutates the
            # shared RAY_BACKEND_CONFIG dict in place.)
            backend = RAY_BACKEND_CONFIG
            backend["processor"]["type"] = "dask"

        ludwig_model = LudwigModel(config, backend=backend)
        _, _, output_directory = ludwig_model.train(
            dataset=data_csv_path,
            output_directory=os.path.join(tmpdir, "output"),
        )
        # Check that metadata JSON saves and loads correctly
        ludwig_model = LudwigModel.load(os.path.join(output_directory,
                                                     "model"))

        if patch_args is not None:
            with mock.patch(*patch_args):
                preds_df, _ = ludwig_model.predict(dataset=data_csv_path)
        else:
            preds_df, _ = ludwig_model.predict(dataset=data_csv_path)

    return preds_df, ludwig_model
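
For reference, a minimal sketch of how a test might call this helper, assuming the same utilities (generate_data, number_feature, binary_feature) used by the other examples; the test name and feature choices here are illustrative, not from the original suite:

def test_predict_roundtrip_local(tmpdir):
    input_features = [number_feature()]
    output_features = [binary_feature()]
    config = {
        "input_features": input_features,
        "output_features": output_features,
    }
    data_csv_path = generate_data(input_features, output_features,
                                  os.path.join(tmpdir, "data.csv"))
    # "local" exercises the default single-process backend.
    preds_df, _ = predict_with_backend(tmpdir, config, data_csv_path, "local")
    assert len(preds_df) > 0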
Example #2
def test_empty_split_error(backend, tmpdir):
    """Tests that an error is raised if one or more of the splits is empty after preprocessing."""
    data_csv_path = os.path.join(tmpdir, "data.csv")

    out_feat = binary_feature()
    input_features = [number_feature()]
    output_features = [out_feat]
    config = {
        "input_features": input_features,
        "output_features": output_features
    }

    training_data_csv_path = generate_data(input_features, output_features,
                                           data_csv_path)
    df = pd.read_csv(training_data_csv_path)

    # Convert all the output feature rows to null. Because the default missing
    # value strategy for output features is to drop empty rows, the dataset
    # will be empty after preprocessing.
    df[out_feat[COLUMN]] = None

    with init_backend(backend):
        ludwig_model = LudwigModel(config, backend=backend)
        with pytest.raises(ValueError,
                           match="Dataset is empty following preprocessing"):
            ludwig_model.preprocess(dataset=df)
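
As a quick sanity check (a hypothetical addition, not in the original test), one can confirm that every label really is null before preprocessing, which is what triggers the drop-empty-rows behavior described in the comment above:

assert df[out_feat[COLUMN]].isna().all()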
Example #3
def test_sample_ratio(backend, tmpdir):
    num_examples = 100
    sample_ratio = 0.25

    input_features = [sequence_feature(reduce_output="sum")]
    output_features = [category_feature(vocab_size=5, reduce_input="sum")]
    data_csv = generate_data(input_features,
                             output_features,
                             os.path.join(tmpdir, "dataset.csv"),
                             num_examples=num_examples)
    config = {
        "input_features": input_features,
        "output_features": output_features,
        "trainer": {
            "epochs": 2,
        },
        "preprocessing": {
            "sample_ratio": sample_ratio
        },
    }

    with init_backend(backend):
        model = LudwigModel(config, backend=backend)
        train_set, val_set, test_set, _ = model.preprocess(
            data_csv,
            skip_save_processed_input=True,
        )

        # 100 examples * 0.25 sample ratio -> 25 rows across the three splits.
        sample_size = num_examples * sample_ratio
        count = len(train_set) + len(val_set) + len(test_set)
        assert sample_size == count
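
Note that sample_size is a float here (100 * 0.25 == 25.0); the strict equality passes because the product is integral and Python treats 25.0 == 25 as true. A sample_ratio whose product with num_examples is not a whole number would need rounding to match the sampler's behavior.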
Example #4
def test_dask_known_divisions(feature_fn, csv_filename, tmpdir):
    import dask.dataframe as dd

    input_features = [feature_fn(os.path.join(tmpdir, "generated_output"))]
    output_features = [category_feature(vocab_size=5, reduce_input="sum")]

    # Use num_examples=100 and npartitions=2 so the post-split datasets are
    # non-empty, which keeps the test from being flaky.
    data_csv = generate_data(input_features,
                             output_features,
                             os.path.join(tmpdir, csv_filename),
                             num_examples=100)
    data_df = dd.from_pandas(pd.read_csv(data_csv), npartitions=2)
    assert data_df.known_divisions

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "trainer": {
            "epochs": 2,
        },
    }

    backend = "ray"
    with init_backend(backend):
        model = LudwigModel(config, backend=backend)
        # Should run end-to-end without raising; the splits themselves are
        # not asserted here.
        train_set, val_set, test_set, _ = model.preprocess(
            data_df,
            skip_save_processed_input=False,
        )
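
For contrast, a Dask frame created with dd.read_csv typically has unknown divisions, which is exactly the property the dd.from_pandas assertion above guards against; a minimal sketch (hypothetical, not part of the original test):

ddf = dd.read_csv(data_csv)
assert not ddf.known_divisions  # read_csv output has no known divisions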
Example #5
def test_missing_values_fill_with_mean(backend, csv_filename, tmpdir):
    data_csv_path = os.path.join(tmpdir, csv_filename)

    kwargs = {PREPROCESSING: {"missing_value_strategy": FILL_WITH_MEAN}}
    input_features = [
        number_feature(**kwargs),
        binary_feature(),
        category_feature(vocab_size=3),
    ]
    output_features = [binary_feature()]
    training_data_csv_path = generate_data(input_features, output_features,
                                           data_csv_path)

    config = {"input_features": input_features, "output_features": output_features, TRAINER: {"epochs": 2}}
    with init_backend(backend):
        # run preprocessing
        ludwig_model = LudwigModel(config, backend=backend)
        ludwig_model.preprocess(dataset=training_data_csv_path)
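
Roughly, fill_with_mean replaces missing values in a number column with the column mean; a plain-pandas sketch of the idea, reusing the pd import from the examples above (an assumed equivalent, not Ludwig's actual implementation):

df = pd.DataFrame({"num": [1.0, None, 3.0]})
df["num"] = df["num"].fillna(df["num"].mean())  # the missing value becomes 2.0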
Example #6
def test_with_split(backend, csv_filename, tmpdir):
    num_examples = NUM_EXAMPLES
    train_set_size = int(num_examples * 0.8)
    val_set_size = int(num_examples * 0.1)
    test_set_size = int(num_examples * 0.1)

    input_features = [sequence_feature(reduce_output="sum")]
    output_features = [category_feature(vocab_size=5, reduce_input="sum")]
    data_csv = generate_data(input_features,
                             output_features,
                             os.path.join(tmpdir, csv_filename),
                             num_examples=num_examples)
    data_df = pd.read_csv(data_csv)
    data_df["split"] = [0] * train_set_size + [1] * val_set_size + [
        2
    ] * test_set_size
    data_df.to_csv(data_csv, index=False)
    config = {
        "input_features": input_features,
        "output_features": output_features,
        "trainer": {
            "epochs": 2,
        },
        "preprocessing": {
            "split": {
                "type": "fixed"
            }
        },
    }

    with init_backend(backend):
        model = LudwigModel(config, backend=backend)
        train_set, val_set, test_set, _ = model.preprocess(
            data_csv,
            skip_save_processed_input=False,
        )
        assert len(train_set) == train_set_size
        assert len(val_set) == val_set_size
        assert len(test_set) == test_set_size
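
The split column follows Ludwig's fixed-split convention, where 0 marks training rows, 1 validation rows, and 2 test rows. A quick check of the 0.8/0.1/0.1 layout written above (illustrative, not in the original test):

counts = data_df["split"].value_counts()
assert counts[0] == train_set_size  # 0 -> train
assert counts[1] == val_set_size    # 1 -> validation
assert counts[2] == test_set_size   # 2 -> test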