import warnings

import numpy as np
import pandas as pd
import pytest
from sklearn.metrics import f1_score, mean_squared_error

from datawig.column_encoders import BowEncoder
from datawig.mxnet_input_symbols import BowFeaturizer
from datawig.simple_imputer import SimpleImputer
from datawig.utils import logger, rand_string, random_split, generate_df_numeric, generate_df_string
from datawig import column_encoders

# Install a blanket "ignore" filter so warnings raised from here on are
# suppressed instead of being printed.
warnings.filterwarnings("ignore")

# Set the logger imported from datawig.utils to the INFO level.
logger.setLevel("INFO")


def test_simple_imputer_no_string_column_name():
    """
    Checks that SimpleImputer raises ValueError for non-string column names

    Two argument pairs are tried: one whose first argument is a list holding
    an int, and one whose second argument is an int.
    """
    # NOTE(review): the two positional arguments are presumably the input
    # column names and the output column name -- confirm against the
    # SimpleImputer signature.
    invalid_argument_pairs = (
        ([0], '1'),
        (['0'], 1),
    )
    for first_arg, second_arg in invalid_argument_pairs:
        with pytest.raises(ValueError):
            SimpleImputer(first_arg, second_arg)


def test_simple_imputer_real_data_default_args(test_dir, data_frame):
    """
    Tests SimpleImputer with default options

    """
    feature_col = "string_feature"
# Example no. 2
# 0
# permissions and limitations under the License.
"""

DataWig tests for explaining predictions

"""

import os
import datawig
from datawig.utils import random_split
from datawig.utils import logger
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score
from datawig import Imputer
# Set the logger imported from datawig.utils to the DEBUG level.
logger.setLevel("DEBUG")


def test_explain_method_synthetic(test_dir):

    # Generate simulated data for testing explain method
    # Predict output column with entries in ['foo', 'bar'] from two columns, one
    # categorical in ['foo', 'dummy'], one text in ['text_foo_text', 'text_dummy_text'].
    # the output column is deterministically 'foo', if 'foo' occurs anywhere in any input column.
    N = 100
    cat_in_col = ['foo' if r > (1 / 2) else 'dummy' for r in np.random.rand(N)]
    text_in_col = ['fff' if r > (1 / 2) else 'ddd' for r in np.random.rand(N)]
    hash_in_col = ['h' for r in range(N)]
    cat_out_col = [
        'foo' if 'f' in input[0] + input[1] else 'bar'
        for input in zip(cat_in_col, text_in_col)