Esempio n. 1
0
def test_imputer_numeric_data(test_dir):
    """
    Fit a numeric-only Imputer on synthetic data and check its metrics.

    A single input column ``x`` is drawn uniformly between -pi and pi; the
    target columns ``'*2'``, ``'**2'`` and ``'cos'`` are computed from it.
    For each target a new Imputer is fitted on the training split, and the
    metric it reports for that target on the held-out split must be below 10.
    """
    n_rows = 1000
    samples = np.random.uniform(-np.pi, np.pi, (n_rows, ))
    frame = pd.DataFrame({
        'x': samples,
        'cos': np.cos(samples),
        '*2': samples * 2,
        '**2': samples**2,
    })

    df_train, df_test = random_split(frame, [.6, .4])
    model_dir = os.path.join(test_dir, "tmp", "real_data_experiment_numeric")

    # The input side is identical for every target, so build it once.
    input_encoders = [NumericalEncoder(['x'])]
    input_featurizers = [NumericalFeaturizer('x', numeric_latent_dim=100)]

    # Training hyper-parameters shared by all three fits.
    fit_settings = dict(
        learning_rate=1e-1,
        num_epochs=100,
        patience=5,
        test_split=.3,
        weight_decay=.0,
        batch_size=128,
    )

    for target in ['*2', '**2', 'cos']:
        # Labels are encoded without normalization.
        target_encoders = [NumericalEncoder([target], normalize=False)]

        imputer = Imputer(data_featurizers=input_featurizers,
                          label_encoders=target_encoders,
                          data_encoders=input_encoders,
                          output_path=model_dir)
        imputer.fit(train_df=df_train, **fit_settings)

        pred, metrics = imputer.transform_and_compute_metrics(df_test)
        # Keep the predictions next to the test data for inspection.
        df_test['predictions_' + target] = pred[target].flatten()

        target_metric = metrics[target]
        print("Numerical metrics: {}".format(target_metric))
        assert target_metric < 10
Esempio n. 2
0
def test_imputer_load_read_exec_only_dir(tmpdir, data_frame):
    """
    Check that a fitted Imputer can be loaded from a read/exec-only directory.

    The model is fitted and written to ``tmpdir``, the directory's write
    permission is removed for owner, group and others, and ``Imputer.load``
    is then expected to succeed. An ``AssertionError`` from ``load`` is
    reported as a test failure.

    The directory's original permission bits are restored afterwards so the
    temporary directory stays writable/removable regardless of the outcome.
    """
    import stat

    # NOTE(review): the original comment says tests on the shared build
    # fleet fail in connection with converting tmpdir to a string; the
    # explicit str() conversion is kept as-is.
    tmpdir = str(tmpdir)
    feature = 'feature'
    label = 'label'

    df = data_frame(feature, label, n_samples=100)

    # fit and output model + metrics to tmpdir
    imputer = Imputer(data_featurizers=[BowFeaturizer(feature)],
                      label_encoders=[CategoricalEncoder(label)],
                      data_encoders=[BowEncoder(feature)],
                      output_path=tmpdir)
    imputer.fit(train_df=df, num_epochs=1)

    # Remember the current permission bits so they can be put back later.
    original_mode = stat.S_IMODE(os.stat(tmpdir).st_mode)

    # make tmpdir read/exec-only by owner/group/others
    os.chmod(
        tmpdir, stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH | stat.S_IREAD
        | stat.S_IRGRP | stat.S_IROTH)

    try:
        Imputer.load(tmpdir)
    except AssertionError as e:
        print(e)
        pytest.fail(
            'Loading imputer from read-only directory should not fail.')
    finally:
        # Runs on success and on failure alike, undoing the chmod above.
        os.chmod(tmpdir, original_mode)
Esempio n. 3
0
def test_imputer_without_train_df(test_dir):
    """
    Check that ``imputer.fit`` raises ``ValueError`` for unusable training data.

    Two invalid inputs are tried in turn: a plain list (wrong type) and
    ``None`` (no data at all).
    """
    data_encoder_cols = [BowEncoder('item_name')]
    label_encoder_cols = [CategoricalEncoder('brand')]

    data_cols = [BowFeaturizer('item_name')]

    output_path = os.path.join(test_dir, "tmp", "real_data_experiment")

    imputer = Imputer(
        data_featurizers=data_cols,
        label_encoders=label_encoder_cols,
        data_encoders=data_encoder_cols,
        output_path=output_path,
    )

    # The ``message=`` keyword of pytest.raises was deprecated and later
    # removed from pytest; it only customised the "did not raise" failure
    # text and never matched the exception message. It is dropped here so the
    # test works across pytest versions. The exception text is not asserted
    # because the exact wording raised by Imputer.fit is not visible here.
    for invalid_train_df in (['ffffffooooo'], None):
        with pytest.raises(ValueError):
            imputer.fit(train_df=invalid_train_df)
Esempio n. 4
0
def test_imputer_without_test_set_random_split():
    """
    Check that ``imputer.fit`` works when only a training set is given.

    No ``test_df`` is passed to ``fit``. A ``TypeError`` from ``fit`` is
    reported as a test failure. The output directory is removed afterwards
    whether or not the fit succeeded.
    """

    feature_col = "string_feature"
    label_col = "label"

    n_samples = 5000
    num_labels = 3
    seq_len = 20
    vocab_size = int(2 ** 10)

    # generate some random data
    df_train = generate_string_data_frame(feature_col=feature_col,
                                          label_col=label_col,
                                          vocab_size=vocab_size,
                                          num_labels=num_labels,
                                          num_words=seq_len,
                                          n_samples=n_samples)

    num_epochs = 1
    batch_size = 64
    learning_rate = 1e-3

    data_encoder_cols = [
        BowEncoder(feature_col, max_tokens=vocab_size)
    ]
    label_encoder_cols = [CategoricalEncoder(label_col, max_tokens=num_labels)]

    data_cols = [
        BowFeaturizer(feature_col, vocab_size=vocab_size)
    ]

    # dir_path is a module-level name defined outside this function.
    output_path = os.path.join(dir_path, "resources", "tmp", "real_data_experiment")

    imputer = Imputer(
        data_featurizers=data_cols,
        label_encoders=label_encoder_cols,
        data_encoders=data_encoder_cols,
        output_path=output_path
    )

    try:
        imputer.fit(
            train_df=df_train,
            learning_rate=learning_rate,
            num_epochs=num_epochs,
            batch_size=batch_size
        )
    except TypeError:
        pytest.fail("Didn't expect a TypeError exception with missing test data")
    finally:
        # Clean up even when fit raised or the test was failed above.
        # ignore_errors avoids masking the real failure if the directory
        # was never created.
        shutil.rmtree(output_path, ignore_errors=True)
Esempio n. 5
0
def test_imputer_load_with_invalid_context(tmpdir, data_frame):
    """
    Save an Imputer whose ``ctx`` was set to ``None``, reload it and predict.

    The test has no explicit assertion; it passes when fitting, saving,
    loading and predicting all complete without raising.
    """
    feature_col, label_col = 'feature', 'label'

    # NOTE(review): the original comment says tests on the shared build
    # fleet fail in connection with converting tmpdir to a string; the
    # explicit str() conversion is kept.
    model_dir = str(tmpdir)

    frame = data_frame(feature_col, label_col, n_samples=100)

    # Fit a small model; model and metrics are written to model_dir.
    fitted = Imputer(data_featurizers=[BowFeaturizer(feature_col)],
                     label_encoders=[CategoricalEncoder(label_col)],
                     data_encoders=[BowEncoder(feature_col)],
                     output_path=model_dir)
    fitted.fit(train_df=frame, num_epochs=1)

    # Persist the imputer with its context cleared.
    fitted.ctx = None
    fitted.save()

    # Loading and predicting must still work.
    reloaded = Imputer.load(model_dir)
    reloaded.predict(frame)
Esempio n. 6
0
def test_imputer_fit_fail_non_writable_output_dir(tmpdir, data_frame):
    """
    Check that ``imputer.fit`` raises when the output directory is not writable.

    The Imputer is configured to write to ``tmpdir``, whose write permission
    is then removed for owner, group and others. ``fit`` is expected to raise
    an ``AssertionError``.

    The directory's original permission bits are restored afterwards so the
    temporary directory stays writable/removable regardless of the outcome.
    """
    import stat

    # NOTE(review): the original comment says tests on the shared build
    # fleet fail in connection with converting tmpdir to a string; the
    # explicit str() conversion is kept as-is.
    tmpdir = str(tmpdir)
    feature = 'feature'
    label = 'label'
    df = data_frame(feature, label, n_samples=100)

    # the imputer would output model + metrics to tmpdir
    imputer = Imputer(data_featurizers=[BowFeaturizer(feature)],
                      label_encoders=[CategoricalEncoder(label)],
                      data_encoders=[BowEncoder(feature)],
                      output_path=tmpdir)

    # Remember the current permission bits so they can be put back later.
    original_mode = stat.S_IMODE(os.stat(tmpdir).st_mode)

    # make tmpdir read/exec-only by owner/group/others
    os.chmod(
        tmpdir, stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH | stat.S_IREAD
        | stat.S_IRGRP | stat.S_IROTH)

    try:
        # fail if imputer.fit does not raise an AssertionError
        with pytest.raises(AssertionError):
            imputer.fit(df, num_epochs=1)
    finally:
        # Runs on success and on failure alike, undoing the chmod above.
        os.chmod(tmpdir, original_mode)
Esempio n. 7
0
def test_fit_resumes(test_dir, data_frame):
    """
    Check that calling ``fit`` twice leaves the Imputer with an equal module.

    Before any fit ``imputer.module`` must be ``None``. After two consecutive
    fits on the same data, the module captured after the first fit must
    compare equal to the one captured after the second.
    """
    feature_col = "feature"
    label_col = "label"

    frame = data_frame(feature_col=feature_col, label_col=label_col)

    imputer = Imputer(
        data_encoders=[TfIdfEncoder([feature_col])],
        data_featurizers=[
            datawig.mxnet_input_symbols.BowFeaturizer(feature_col)
        ],
        label_encoders=[CategoricalEncoder(label_col)],
        output_path=test_dir)

    # Nothing has been trained yet.
    assert imputer.module is None

    # Fit twice with identical settings, recording the module after each run.
    modules_after_fit = []
    for _ in range(2):
        imputer.fit(frame, num_epochs=20)
        modules_after_fit.append(imputer.module)

    module_after_first, module_after_second = modules_after_fit
    assert module_after_first == module_after_second
Esempio n. 8
0
def test_imputer_duplicate_encoder_output_columns(test_dir, data_frame):
    """
    Check that an Imputer with clashing data-encoder columns raises ValueError.

    The bag-of-words encoder and the sequential encoder are both given
    ``feature_col`` as their first and second positional argument
    (presumably input and output column -- confirm against the encoder
    signatures). Constructing the Imputer and calling ``fit`` with this
    configuration is expected to raise ``ValueError``.
    """

    feature_col = "string_feature"
    categorical_col = "categorical_feature"
    label_col = "label"

    n_samples = 1000
    num_labels = 10
    seq_len = 100
    vocab_size = int(2**10)

    latent_dim = 30
    embed_dim = 30

    # generate some random data
    random_data = data_frame(feature_col=feature_col,
                             label_col=label_col,
                             vocab_size=vocab_size,
                             num_labels=num_labels,
                             num_words=seq_len,
                             n_samples=n_samples)

    # we use the label prefixes as a dummy categorical input variable
    random_data[categorical_col] = random_data[label_col].apply(
        lambda x: x[:2])

    # The middle (test) split is not needed by this test.
    df_train, _, df_val = random_split(random_data, [.8, .1, .1])

    data_encoder_cols = [
        BowEncoder(feature_col, feature_col, max_tokens=vocab_size),
        SequentialEncoder(feature_col,
                          feature_col,
                          max_tokens=vocab_size,
                          seq_len=seq_len),
        CategoricalEncoder(categorical_col, max_tokens=num_labels)
    ]
    label_encoder_cols = [CategoricalEncoder(label_col, max_tokens=num_labels)]

    # NOTE(review): the LSTM featurizer gets vocab_size=num_labels while its
    # encoder uses max_tokens=vocab_size -- left unchanged, confirm intent.
    data_cols = [
        BowFeaturizer(feature_col, vocab_size=vocab_size),
        LSTMFeaturizer(field_name=feature_col,
                       seq_len=seq_len,
                       latent_dim=latent_dim,
                       num_hidden=30,
                       embed_dim=embed_dim,
                       num_layers=2,
                       vocab_size=num_labels),
        EmbeddingFeaturizer(field_name=categorical_col,
                            embed_dim=embed_dim,
                            vocab_size=num_labels)
    ]

    output_path = os.path.join(test_dir, "tmp",
                               "imputer_experiment_synthetic_data")

    num_epochs = 20
    batch_size = 16
    learning_rate = 1e-3

    # Both construction and fit stay inside the context manager: which of
    # the two raises is not visible from this test.
    with pytest.raises(ValueError):
        imputer = Imputer(data_featurizers=data_cols,
                          label_encoders=label_encoder_cols,
                          data_encoders=data_encoder_cols,
                          output_path=output_path)
        imputer.fit(train_df=df_train,
                    test_df=df_val,
                    learning_rate=learning_rate,
                    num_epochs=num_epochs,
                    batch_size=batch_size)