Example #1
0
def test_schema_equality():
    """Check that ColumnSchema comparison reflects constructor arguments and logical type parameters."""
    base_col = ColumnSchema(logical_type=Categorical)

    # Each of these passes one extra argument on top of the base column.
    with_description = ColumnSchema(logical_type=Categorical, description='description')
    with_metadata = ColumnSchema(logical_type=Categorical, metadata={'interesting_values': ['a', 'b']})
    with_standard_tags = ColumnSchema(logical_type=Categorical, use_standard_tags=True)
    with_extra_tag = ColumnSchema(logical_type=Categorical, semantic_tags={'new_tag'})

    for other_col in (with_description, with_metadata, with_standard_tags, with_extra_tag):
        assert base_col != other_col

    # Ordinal columns built from the same values in a different order
    abc_col = ColumnSchema(logical_type=Ordinal(order=['a', 'b', 'c']))
    bac_col = ColumnSchema(logical_type=Ordinal(order=['b', 'a', 'c']))

    assert base_col != abc_col
    assert abc_col != bac_col
    assert abc_col == abc_col

    # Datetime columns: explicit format, explicit None format, default instance, bare class
    formatted_col = ColumnSchema(logical_type=Datetime(datetime_format='%Y-%m%d'))
    none_format_col = ColumnSchema(logical_type=Datetime(datetime_format=None))
    default_instance_col = ColumnSchema(logical_type=Datetime())
    class_col = ColumnSchema(logical_type=Datetime)

    assert class_col != default_instance_col
    assert default_instance_col != formatted_col
    assert default_instance_col == none_format_col
def test_logical_eq():
    """Check equality between logical type classes and between logical type instances."""
    # Class compared with class
    assert Boolean == Boolean
    assert Categorical != Boolean

    # Two separately created default instances
    first_boolean = Boolean()
    second_boolean = Boolean()
    assert first_boolean == second_boolean

    # Class compared with an instance of that class
    default_datetime = Datetime()
    assert Datetime != default_datetime

    # Default instance compared with explicitly parameterized instances
    none_format = Datetime(datetime_format=None)
    assert Datetime() == none_format
    ymd_format = Datetime(datetime_format='%Y-%m-%d')
    assert Datetime() != ymd_format
Example #3
0
def test_datetime_transform(datetimes):
    """Check that Datetime.transform converts each object series and leaves a format set on the type."""
    ltype = Datetime()
    for raw_series in datetimes:
        # Input is expected to arrive with object dtype
        assert str(raw_series.dtype) == "object"

        converted = ltype.transform(raw_series)

        assert str(converted.dtype) == "datetime64[ns]"
        # The instance started without a format; one must be set after transforming
        assert ltype.datetime_format is not None
Example #4
0
def test_datetime_coerce_user_format():
    """Check that values not matching a user-specified format become NaT and trigger a warning."""
    user_format = "%m/%d/%Y"
    ltype = Datetime(datetime_format=user_format)

    # The last entry has 13 in the month position of the month/day/year format
    raw_dates = pd.Series(["01/01/2017", "01/01/2017", "13/12/2017"], name="dates")

    expected_warning = (
        "Some rows in series 'dates' are incompatible with datetime format "
        "'%m/%d/%Y' and have been replaced with null values. You may be able "
        "to fix this by using an instantiated Datetime logical type with a different "
        "format string specified for this column during Woodwork initialization."
    )
    with pytest.warns(TypeConversionWarning, match=expected_warning):
        result = ltype.transform(raw_dates)

    assert str(result.dtype) == "datetime64[ns]"
    assert result[2] is pd.NaT
    # The user-supplied format must be left untouched on the logical type
    assert ltype.datetime_format == user_format
Example #5
0
def test_is_col_datetime():
    """Check _is_col_datetime against Datetime and non-Datetime column schemas."""
    # Bare class, instance with a format, and default instance
    datetime_ltypes = [Datetime, Datetime(datetime_format='%Y-%m%d'), Datetime()]
    for ltype in datetime_ltypes:
        assert _is_col_datetime(ColumnSchema(logical_type=ltype))

    # Other logical types must not be reported as datetime
    for ltype in (NaturalLanguage, Double):
        assert not _is_col_datetime(ColumnSchema(logical_type=ltype))
Example #6
0
def test_get_ltype_params():
    """Check the parameters reported by _get_specified_ltype_params for classes and instances."""
    # Categorical is expected to report no parameters, as a class or as an instance
    for categorical in (Categorical, Categorical()):
        assert _get_specified_ltype_params(categorical) == {}

    # The bare Datetime class reports nothing
    assert _get_specified_ltype_params(Datetime) == {}

    # A default instance reports its format as None
    default_params = _get_specified_ltype_params(Datetime())
    assert default_params == {'datetime_format': None}

    # An explicit format is reported back unchanged
    ymd = '%Y-%m-%d'
    formatted_params = _get_specified_ltype_params(Datetime(datetime_format=ymd))
    assert formatted_params == {'datetime_format': ymd}
def test_select_semantic_tags_no_match(sample_df):
    """Check that DataTable.select ignores selectors that match no column."""
    table = DataTable(sample_df,
                      time_index='signup_date',
                      index='id',
                      name='dt_name')

    ltype_overrides = {
        'full_name': FullName,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'signup_date': Datetime(datetime_format='%Y-%m-%d'),
    }
    table = table.set_types(logical_types=ltype_overrides)

    tag_overrides = {
        'full_name': ['new_tag', 'tag2'],
        'age': 'numeric',
        'signup_date': 'date_of_birth',
        'email': 'tag2'
    }
    table = table.set_types(semantic_tags=tag_overrides)

    # A single selector that matches nothing
    nothing_selected = table.select(['doesnt_exist'])
    assert len(nothing_selected.columns) == 0

    # Several selectors, some of which match nothing
    mixed_selection = table.select(
        ['doesnt_exist', 'boolean', 'category', PhoneNumber])
    assert len(mixed_selection.columns) == 2

    # Tag and logical type selectors mixed with unmatched ones
    ltype_selection = table.select(
        ['date_of_birth', 'doesnt_exist', ZIPCode, Integer])
    assert len(ltype_selection.columns) == 3
Example #8
0
def test_filter_schema_errors(sample_column_names,
                              sample_inferred_logical_types):
    """Check the TypeErrors raised by TableSchema._filter_cols for invalid selectors.

    Covers a non-string/non-LogicalType selector (with and without
    ``col_names=True``), an instantiated LogicalType selector, and selectors
    for a LogicalType that has been removed from the type system.
    """
    schema = TableSchema(
        sample_column_names,
        sample_inferred_logical_types,
        time_index="signup_date",
        index="id",
        name="df_name",
    )

    # The same message is expected whether or not column names are allowed as selectors
    err_msg = "Invalid selector used in include: {} must be a string, uninstantiated and registered LogicalType, or valid column name"
    with pytest.raises(TypeError, match=err_msg):
        schema._filter_cols(include=["boolean", "index", Double, {}])

    with pytest.raises(TypeError, match=err_msg):
        schema._filter_cols(include=["boolean", "index", Double, {}],
                            col_names=True)

    err_msg = "Invalid selector used in include: Datetime cannot be instantiated"
    with pytest.raises(TypeError, match=err_msg):
        schema._filter_cols(Datetime())

    # Removing a type mutates the type system, so always restore the defaults,
    # even when one of the assertions below fails; otherwise later tests would
    # run with EmailAddress still unregistered.
    type_system.remove_type(EmailAddress)
    try:
        err_msg = "Specified LogicalType selector EmailAddress is not registered in Woodwork's type system."
        with pytest.raises(TypeError, match=err_msg):
            schema._filter_cols(EmailAddress)

        err_msg = "Invalid selector used in include: EmailAddress must be a string, uninstantiated and registered LogicalType, or valid column name"
        with pytest.raises(TypeError, match=err_msg):
            schema._filter_cols(EmailAddress())
    finally:
        type_system.reset_defaults()
Example #9
0
def test_mutual_info(df_mi):
    """Check mutual_information output for default, nrows and num_bins settings."""
    df_mi.ww.init(logical_types={'dates': Datetime(datetime_format='%Y-%m-%d')})
    df_before = df_mi.copy()

    # Default settings
    default_mi = df_mi.ww.mutual_information()
    assert default_mi.shape[0] == 10

    expected_default = [
        ('ints', 'bools', 1.0),
        ('ints', 'strs', 0.0),
        ('strs', 'bools', 0),
        ('dates', 'ints', 0.274),
        ('dates', 'bools', 0.274),
    ]
    for col_a, col_b, expected in expected_default:
        np.testing.assert_almost_equal(mi_between_cols(col_a, col_b, default_mi), expected, 3)

    # A large nrows value must give the same result as the default call
    large_nrows_mi = df_mi.ww.mutual_information(nrows=100000)
    pd.testing.assert_frame_equal(default_mi, large_nrows_mi)

    # With a single row every pair is expected to score 1.0
    single_row_mi = df_mi.ww.mutual_information(nrows=1)
    assert single_row_mi.shape[0] == 10
    assert (single_row_mi['mutual_info'] == 1.0).all()

    # Two bins
    two_bin_mi = df_mi.ww.mutual_information(num_bins=2)
    assert two_bin_mi.shape[0] == 10

    expected_two_bins = [
        ('bools', 'ints', 0.0),
        ('strs', 'ints', 1.0),
        ('bools', 'strs', 0),
        ('dates', 'strs', 1.0),
        ('dates', 'ints', 1.0),
    ]
    for col_a, col_b, expected in expected_two_bins:
        np.testing.assert_almost_equal(mi_between_cols(col_a, col_b, two_bin_mi), expected, 3)

    # Confirm that none of this changed the underlying df
    pd.testing.assert_frame_equal(to_pandas(df_mi), to_pandas(df_before))
Example #10
0
def test_datacolumn_equality(sample_series, sample_datetime_series):
    """Check DataColumn equality across constructor arguments, logical type parameters and data."""
    # Same column specified by logical type name and by logical type class
    base_col = DataColumn(sample_series, logical_type='Categorical')
    same_col = DataColumn(sample_series, logical_type=Categorical)

    # Columns built with a different argument or a different series
    tagged_col = DataColumn(sample_series, logical_type=Categorical, semantic_tags={'test'})
    other_series_col = DataColumn(sample_datetime_series, logical_type=Categorical)
    other_ltype_col = DataColumn(sample_series, logical_type=NaturalLanguage)
    described_col = DataColumn(sample_series, logical_type='Categorical', description='description')
    metadata_col = DataColumn(sample_series, logical_type='Categorical', metadata={'interesting_values': ['a', 'b']})

    assert base_col == same_col
    for different_col in (tagged_col, other_series_col, other_ltype_col, described_col, metadata_col):
        assert base_col != different_col

    # Ordinal columns built from the same values in a different order
    abc_col = DataColumn(sample_series, logical_type=Ordinal(order=['a', 'b', 'c']))
    bac_col = DataColumn(sample_series, logical_type=Ordinal(order=['b', 'a', 'c']))

    assert base_col != abc_col
    assert abc_col != bac_col
    assert abc_col == abc_col

    # Datetime columns: explicit format, explicit None format, default instance, bare class
    formatted_col = DataColumn(sample_datetime_series, logical_type=Datetime(datetime_format='%Y-%m%d'))
    none_format_col = DataColumn(sample_datetime_series, logical_type=Datetime(datetime_format=None))
    default_instance_col = DataColumn(sample_datetime_series, logical_type=Datetime())
    class_col = DataColumn(sample_datetime_series, logical_type=Datetime)

    assert class_col != default_instance_col
    assert default_instance_col != formatted_col
    assert default_instance_col == none_format_col

    # Same typing information on top of a series with replaced values
    original_data_col = DataColumn(sample_series, logical_type='NaturalLanguage')
    replaced_series = sample_series.copy().replace(to_replace='a', value='test')
    replaced_data_col = DataColumn(replaced_series, logical_type='NaturalLanguage')

    # We only check underlying data for equality with pandas dataframes
    backed_by_pandas = isinstance(original_data_col.to_series(), pd.Series)
    if backed_by_pandas:
        assert original_data_col != replaced_data_col
    else:
        assert original_data_col == replaced_data_col
Example #11
0
def test_get_ltype_params():
    """Check the parameters reported by _get_specified_ltype_params for classes and instances."""
    # Categorical is expected to report no parameters, as a class or as an instance
    for categorical in (Categorical, Categorical()):
        assert _get_specified_ltype_params(categorical) == {}

    # The bare Datetime class reports nothing
    assert _get_specified_ltype_params(Datetime) == {}

    # A default instance reports both of its parameters as None
    default_params = _get_specified_ltype_params(Datetime())
    assert default_params == {"datetime_format": None, "timezone": None}

    # Explicit values are reported back unchanged
    ymd = "%Y-%m-%d"
    configured = Datetime(datetime_format=ymd, timezone="UTC")
    configured_params = _get_specified_ltype_params(configured)
    assert configured_params == {"datetime_format": ymd, "timezone": "UTC"}
Example #12
0
def test_str_to_logical_type():
    """Check string-to-LogicalType lookup on the default type system and on a fresh one."""
    registered = ww.type_system.registered_types
    lookup = ww.type_system.str_to_logical_type

    # Unknown names raise by default and return None when errors are suppressed
    with pytest.raises(ValueError,
                       match='String test is not a valid logical type'):
        lookup('test')
    assert lookup('test', raise_error=False) is None

    # Every registered type is reachable by class name and by type string
    for ltype in registered:
        assert lookup(ltype.__name__) == ltype
        assert lookup(ltype.type_string) == ltype

    # Lookup with mixed-case input
    assert lookup('bOoLeAn') == Boolean
    assert lookup('person_full_NAME') == PersonFullName
    assert lookup('PersonFullnamE') == PersonFullName

    # Parameters produce an instance carrying those parameters
    ymd = '%Y-%m-%d'
    formatted = lookup('datetime', params={'datetime_format': ymd})
    assert formatted.__class__ == Datetime
    assert formatted.datetime_format == ymd
    assert formatted == Datetime(datetime_format=ymd)

    unformatted = lookup('datetime', params={'datetime_format': None})
    assert unformatted.__class__ == Datetime
    assert unformatted.datetime_format is None
    assert unformatted == Datetime()

    # When parameters are supplied in a non-empty dictionary, the logical type gets instantiated
    assert lookup('person_full_NAME', params={}) == PersonFullName
    assert unformatted != Datetime

    # Input a different type system
    empty_type_sys = TypeSystem()
    with pytest.raises(ValueError,
                       match='String Integer is not a valid logical type'):
        empty_type_sys.str_to_logical_type('Integer')
    empty_type_sys.add_type(Boolean)
    assert Boolean == empty_type_sys.str_to_logical_type('Boolean')
Example #13
0
def test_str_to_logical_type():
    """Check string-to-LogicalType lookup on the default type system and on a fresh one."""
    registered = ww.type_system.registered_types
    lookup = ww.type_system.str_to_logical_type

    # Unknown names raise by default and return None when errors are suppressed
    with pytest.raises(ValueError,
                       match="String test is not a valid logical type"):
        lookup("test")
    assert lookup("test", raise_error=False) is None

    # Every registered type is reachable by class name and by type string
    for ltype in registered:
        assert lookup(ltype.__name__) == ltype
        assert lookup(ltype.type_string) == ltype

    # Lookup with mixed-case input
    assert lookup("bOoLeAn") == Boolean
    assert lookup("person_full_NAME") == PersonFullName
    assert lookup("PersonFullnamE") == PersonFullName

    # Parameters produce an instance carrying those parameters
    ymd = "%Y-%m-%d"
    formatted = lookup("datetime", params={"datetime_format": ymd})
    assert formatted.__class__ == Datetime
    assert formatted.datetime_format == ymd
    assert formatted == Datetime(datetime_format=ymd)

    unformatted = lookup("datetime", params={"datetime_format": None})
    assert unformatted.__class__ == Datetime
    assert unformatted.datetime_format is None
    assert unformatted == Datetime()

    # When parameters are supplied in a non-empty dictionary, the logical type gets instantiated
    assert lookup("person_full_NAME", params={}) == PersonFullName
    assert unformatted != Datetime

    # Input a different type system
    empty_type_sys = TypeSystem()
    with pytest.raises(ValueError,
                       match="String Integer is not a valid logical type"):
        empty_type_sys.str_to_logical_type("Integer")
    empty_type_sys.add_type(Boolean)
    assert Boolean == empty_type_sys.str_to_logical_type("Boolean")
Example #14
0
def test_is_datetime():
    """Check ColumnSchema.is_datetime for Datetime, non-Datetime and empty schemas."""
    # Bare class, instance with a format, and default instance
    datetime_schemas = [
        ColumnSchema(logical_type=Datetime),
        ColumnSchema(logical_type=Datetime(datetime_format='%Y-%m%d')),
        ColumnSchema(logical_type=Datetime()),
    ]
    for schema in datetime_schemas:
        assert schema.is_datetime

    # Other logical types, and a schema created without arguments
    non_datetime_schemas = [
        ColumnSchema(logical_type=NaturalLanguage),
        ColumnSchema(logical_type=Double),
        ColumnSchema(),
    ]
    for schema in non_datetime_schemas:
        assert not schema.is_datetime
Example #15
0
def test_schema_equality():
    """Check that ColumnSchema comparison reflects constructor arguments and logical type parameters."""
    base_col = ColumnSchema(logical_type=Categorical)

    # Each of these passes one extra argument on top of the base column.
    with_description = ColumnSchema(logical_type=Categorical, description="description")
    with_origin = ColumnSchema(logical_type=Categorical, origin="base")
    with_metadata = ColumnSchema(logical_type=Categorical,
                                 metadata={"interesting_values": ["a", "b"]})
    with_standard_tags = ColumnSchema(logical_type=Categorical, use_standard_tags=True)
    with_extra_tag = ColumnSchema(logical_type=Categorical, semantic_tags={"new_tag"})

    for other_col in (
        with_description,
        with_origin,
        with_metadata,
        with_standard_tags,
        with_extra_tag,
    ):
        assert base_col != other_col

    # Ordinal columns built from the same values in a different order
    abc_col = ColumnSchema(logical_type=Ordinal(order=["a", "b", "c"]))
    bac_col = ColumnSchema(logical_type=Ordinal(order=["b", "a", "c"]))

    assert base_col != abc_col
    assert abc_col != bac_col
    assert abc_col == abc_col

    # Datetime columns: explicit format, explicit None format, default instance
    formatted_col = ColumnSchema(logical_type=Datetime(datetime_format="%Y-%m%d"))
    none_format_col = ColumnSchema(logical_type=Datetime(datetime_format=None))
    default_instance_col = ColumnSchema(logical_type=Datetime())

    assert default_instance_col != formatted_col
    assert default_instance_col == none_format_col
Example #16
0
def test_schema_types(sample_column_names, sample_inferred_logical_types):
    """Check the layout and contents of the DataFrame returned by TableSchema.types."""

    def as_series(mapping):
        # Build the expected column from the mapping's values, indexed by its keys
        return pd.Series(list(mapping.values()), index=list(mapping.keys()))

    # Add one extra column that uses a Datetime with a custom format
    sample_column_names.append("formatted_date")
    custom_datetime = Datetime(datetime_format="%Y~%m~%d")

    all_logical_types = dict(sample_inferred_logical_types)
    all_logical_types["formatted_date"] = custom_datetime
    schema = TableSchema(
        sample_column_names,
        logical_types=all_logical_types,
        use_standard_tags=True,
    )

    types_df = schema.types
    assert isinstance(types_df, pd.DataFrame)
    assert "Logical Type" in types_df.columns
    assert "Semantic Tag(s)" in types_df.columns
    assert types_df.shape[1] == 2
    assert len(types_df.index) == len(sample_column_names)

    # Inferred types are compared as instances; the custom Datetime is compared as given
    expected_ltypes = {
        col_name: ltype_class()
        for col_name, ltype_class in sample_inferred_logical_types.items()
    }
    expected_ltypes["formatted_date"] = custom_datetime
    assert as_series(expected_ltypes).equals(types_df["Logical Type"])

    # Semantic tags are compared in their string form
    expected_tags = {
        "id": "['numeric']",
        "full_name": "[]",
        "email": "[]",
        "phone_number": "[]",
        "age": "['numeric']",
        "signup_date": "[]",
        "is_registered": "[]",
        "double": "['numeric']",
        "double_with_nan": "['numeric']",
        "integer": "['numeric']",
        "nullable_integer": "['numeric']",
        "boolean": "[]",
        "categorical": "['category']",
        "datetime_with_NaT": "[]",
        "url": "[]",
        "ip_address": "[]",
        "formatted_date": "[]",
    }
    assert as_series(expected_tags).equals(types_df["Semantic Tag(s)"])
Example #17
0
def test_schema_types(sample_column_names, sample_inferred_logical_types):
    """Check the layout and contents of the DataFrame returned by TableSchema.types."""

    def as_series(mapping):
        # Build the expected column from the mapping's values, indexed by its keys
        return pd.Series(list(mapping.values()), index=list(mapping.keys()))

    # Add one extra column that uses a Datetime with a custom format
    sample_column_names.append('formatted_date')
    custom_datetime = Datetime(datetime_format='%Y~%m~%d')

    all_logical_types = dict(sample_inferred_logical_types)
    all_logical_types['formatted_date'] = custom_datetime
    schema = TableSchema(sample_column_names,
                         logical_types=all_logical_types,
                         use_standard_tags=True)

    types_df = schema.types
    assert isinstance(types_df, pd.DataFrame)
    assert 'Logical Type' in types_df.columns
    assert 'Semantic Tag(s)' in types_df.columns
    assert types_df.shape[1] == 2
    assert len(types_df.index) == len(sample_column_names)

    # Expected logical types are given as classes, apart from the custom Datetime instance
    expected_ltypes = {
        'id': Integer,
        'full_name': NaturalLanguage,
        'email': NaturalLanguage,
        'phone_number': NaturalLanguage,
        'age': Integer,
        'signup_date': Datetime,
        'is_registered': Boolean,
        'formatted_date': custom_datetime
    }
    assert as_series(expected_ltypes).equals(types_df['Logical Type'])

    # Semantic tags are compared in their string form
    expected_tags = {
        'id': "['numeric']",
        'full_name': "[]",
        'email': "[]",
        'phone_number': "[]",
        'age': "['numeric']",
        'signup_date': "[]",
        'is_registered': "[]",
        'formatted_date': "[]",
    }
    assert as_series(expected_tags).equals(types_df['Semantic Tag(s)'])
Example #18
0
def make_logical_types(with_integer_time_index=False):
    """Build the per-table logical type mappings.

    Args:
        with_integer_time_index (bool): If True, the date columns of the
            'customers' table and the 'datetime' column of the 'log' table
            are typed as Integer instead of Datetime.

    Returns:
        dict: Table name mapped to a dict of column name -> logical type.
    """

    def ymd_datetime():
        # A new Datetime instance per column, all sharing the same format
        return Datetime(datetime_format='%Y-%m-%d')

    regions = {'id': Categorical, 'language': Categorical}

    stores = {'id': Integer, u'région_id': Categorical}

    products = {
        'id': Categorical,
        'rating': Double,
        'department': Categorical,
        'url': URL,
    }

    customers = {
        'id': Integer,
        'age': Integer,
        u'région_id': Categorical,
        'loves_ice_cream': Boolean,
        'favorite_quote': NaturalLanguage,
        'signup_date': ymd_datetime(),
        'upgrade_date': ymd_datetime(),
        'cancel_date': ymd_datetime(),
        'cancel_reason': Categorical,
        'engagement_level': Ordinal(order=[1, 2, 3]),
        'full_name': PersonFullName,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'date_of_birth': ymd_datetime(),
        'cohort_name': Categorical,
    }

    sessions = {
        'id': Integer,
        'customer_id': Integer,
        'device_type': Categorical,
        'device_name': Categorical,
        'ip': IPAddress,
        'filepath': Filepath,
    }

    log = {
        'id': Integer,
        'session_id': Integer,
        'product_id': Categorical,
        'datetime': ymd_datetime(),
        'value': Double,
        'value_2': Double,
        'latlong': LatLong,
        'latlong2': LatLong,
        'zipcode': PostalCode,
        'countrycode': CountryCode,
        'subregioncode': SubRegionCode,
        'value_many_nans': Double,
        'priority_level': Ordinal(order=[0, 1, 2]),
        'purchased': Boolean,
        'url': URL,
        'email_address': EmailAddress,
        'comments': NaturalLanguage
    }

    if with_integer_time_index:
        # Overwrite the existing Datetime entries in place
        log['datetime'] = Integer
        for date_col in ('signup_date', 'upgrade_date', 'cancel_date', 'date_of_birth'):
            customers[date_col] = Integer

    return {
        'customers': customers,
        'sessions': sessions,
        'log': log,
        'products': products,
        'stores': stores,
        u'régions': regions
    }
Example #19
0
def make_logical_types(with_integer_time_index=False):
    """Build the per-table logical type mappings.

    Args:
        with_integer_time_index (bool): If True, the date columns of the
            "customers" table and the "datetime" column of the "log" table
            are typed as Integer instead of Datetime.

    Returns:
        dict: Table name mapped to a dict of column name -> logical type.
    """

    def ymd_datetime():
        # A new Datetime instance per column, all sharing the same format
        return Datetime(datetime_format="%Y-%m-%d")

    regions = {"id": Categorical, "language": Categorical}

    stores = {"id": Integer, "région_id": Categorical}

    products = {
        "id": Categorical,
        "rating": Double,
        "department": Categorical,
        "url": URL,
    }

    customers = {
        "id": Integer,
        "age": Integer,
        "région_id": Categorical,
        "loves_ice_cream": Boolean,
        "favorite_quote": NaturalLanguage,
        "signup_date": ymd_datetime(),
        "upgrade_date": ymd_datetime(),
        "cancel_date": ymd_datetime(),
        "cancel_reason": Categorical,
        "engagement_level": Ordinal(order=[1, 2, 3]),
        "full_name": PersonFullName,
        "email": EmailAddress,
        "phone_number": PhoneNumber,
        "birthday": ymd_datetime(),
        "cohort_name": Categorical,
    }

    sessions = {
        "id": Integer,
        "customer_id": Integer,
        "device_type": Categorical,
        "device_name": Categorical,
        "ip": IPAddress,
        "filepath": Filepath,
    }

    log = {
        "id": Integer,
        "session_id": Integer,
        "product_id": Categorical,
        "datetime": ymd_datetime(),
        "value": Double,
        "value_2": Double,
        "latlong": LatLong,
        "latlong2": LatLong,
        "zipcode": PostalCode,
        "countrycode": CountryCode,
        "subregioncode": SubRegionCode,
        "value_many_nans": Double,
        "priority_level": Ordinal(order=[0, 1, 2]),
        "purchased": Boolean,
        "url": URL,
        "email_address": EmailAddress,
        "comments": NaturalLanguage,
    }

    if with_integer_time_index:
        # Overwrite the existing Datetime entries in place
        log["datetime"] = Integer
        for date_col in ("signup_date", "upgrade_date", "cancel_date", "birthday"):
            customers[date_col] = Integer

    return {
        "customers": customers,
        "sessions": sessions,
        "log": log,
        "products": products,
        "stores": stores,
        "régions": regions,
    }
Example #20
0
def test_parse_logical_type():
    """Check that _parse_logical_type accepts a type name, a class and an instance."""
    # The type name and the class itself both resolve to the Datetime class
    for selector in ('Datetime', Datetime):
        assert _parse_logical_type(selector, 'col_name') == Datetime

    # An instance with parameters comes back equal to itself
    ymd_ltype = Datetime(datetime_format='%Y-%m-%d')
    parsed = _parse_logical_type(ymd_ltype, 'col_name')
    assert parsed == ymd_ltype
Example #21
0
def test_parse_logical_type():
    """Check that _parse_logical_type accepts a type name, a class and an instance."""
    # The type name and the class itself both produce a Datetime instance
    for selector in ("Datetime", Datetime):
        parsed = _parse_logical_type(selector, "col_name")
        assert isinstance(parsed, Datetime)

    # An instance with parameters comes back equal to itself
    ymd_ltype = Datetime(datetime_format="%Y-%m-%d")
    parsed = _parse_logical_type(ymd_ltype, "col_name")
    assert parsed == ymd_ltype
Example #22
0
def _assert_describe_column(data, column, ltype, physical_type, semantic_tags,
                            stats, expected_index):
    """Initialize Woodwork on ``data`` and verify ``describe`` for one column.

    Args:
        data: DataFrame holding ``column``; its ``ww`` accessor is initialized
            here with ``ltype`` and the semantic tag ``'custom_tag'``.
        column (str): Name of the single column being described.
        ltype: Logical type to assign to ``column``.
        physical_type: Expected ``physical_type`` entry of the description.
        semantic_tags (set): Expected ``semantic_tags`` entry.
        stats (dict): Remaining expected statistics, in the order they are
            expected to appear after the three leading entries.
        expected_index (list): Expected row labels of the describe output.

    Raises:
        AssertionError: If the describe output does not match expectations.
    """
    # Key order matters: Series.equals compares the index as well as values.
    expected = {
        'physical_type': physical_type,
        'logical_type': ltype,
        'semantic_tags': semantic_tags,
    }
    expected.update(stats)
    expected_vals = pd.Series(expected, name=column)

    data.ww.init(logical_types={column: ltype},
                 semantic_tags={column: 'custom_tag'})
    stats_df = data.ww.describe()
    assert isinstance(stats_df, pd.DataFrame)
    assert set(stats_df.columns) == {column}
    assert stats_df.index.tolist() == expected_index
    # Statistics that do not apply to the column are dropped before comparing.
    assert expected_vals.equals(stats_df[column].dropna())


def test_describe_accessor_method(describe_df):
    """Check ``ww.describe`` output for each group of logical types.

    For every logical type in each group, the relevant column of
    ``describe_df`` is initialized with that type plus a ``'custom_tag'``
    semantic tag, and the resulting description is compared against the
    expected statistics for that column.
    """
    categorical_ltypes = [
        Categorical, CountryCode,
        Ordinal(order=('yellow', 'red', 'blue')), PostalCode, SubRegionCode
    ]
    boolean_ltypes = [BooleanNullable]
    non_nullable_boolean_ltypes = [Boolean]
    datetime_ltypes = [Datetime]
    formatted_datetime_ltypes = [Datetime(datetime_format='%Y~%m~%d')]
    timedelta_ltypes = [Timedelta]
    nullable_numeric_ltypes = [Double, IntegerNullable]
    non_nullable_numeric_ltypes = [Integer]
    natural_language_ltypes = [
        EmailAddress, Filepath, PersonFullName, IPAddress, PhoneNumber, URL
    ]
    latlong_ltypes = [LatLong]

    expected_index = [
        'physical_type', 'logical_type', 'semantic_tags', 'count', 'nunique',
        'nan_count', 'mean', 'mode', 'std', 'min', 'first_quartile',
        'second_quartile', 'third_quartile', 'max', 'num_true', 'num_false'
    ]

    # Test categorical columns
    category_data = describe_df[['category_col']]
    if ks and isinstance(category_data, ks.DataFrame):
        expected_dtype = 'string'
    else:
        expected_dtype = 'category'
    for ltype in categorical_ltypes:
        _assert_describe_column(
            category_data, 'category_col', ltype, expected_dtype,
            {'category', 'custom_tag'},
            {
                'count': 7,
                'nunique': 3,
                'nan_count': 1,
                'mode': 'red',
            },
            expected_index)

    # Test nullable boolean columns
    boolean_data = describe_df[['boolean_col']]
    for ltype in boolean_ltypes:
        _assert_describe_column(
            boolean_data, 'boolean_col', ltype, ltype.primary_dtype,
            {'custom_tag'},
            {
                'count': 7,
                'nan_count': 1,
                'mode': True,
                'num_true': 4,
                'num_false': 3,
            },
            expected_index)

    # Test non-nullable boolean columns
    boolean_data = describe_df[['boolean_col']].fillna(True)
    for ltype in non_nullable_boolean_ltypes:
        _assert_describe_column(
            boolean_data, 'boolean_col', ltype, ltype.primary_dtype,
            {'custom_tag'},
            {
                'count': 8,
                'nan_count': 0,
                'mode': True,
                'num_true': 5,
                'num_false': 3,
            },
            expected_index)

    # Test datetime columns
    datetime_data = describe_df[['datetime_col']]
    for ltype in datetime_ltypes:
        _assert_describe_column(
            datetime_data, 'datetime_col', ltype, ltype.primary_dtype,
            {'custom_tag'},
            {
                'count': 7,
                'nunique': 6,
                'nan_count': 1,
                'mean': pd.Timestamp('2020-01-19 09:25:42.857142784'),
                'mode': pd.Timestamp('2020-02-01 00:00:00'),
                'min': pd.Timestamp('2020-01-01 00:00:00'),
                'max': pd.Timestamp('2020-02-02 18:00:00'),
            },
            expected_index)

    # Test formatted datetime columns
    formatted_datetime_data = describe_df[['formatted_datetime_col']]
    # Reference values do not depend on the logical type, so build them once.
    converted_to_datetime = pd.to_datetime([
        '2020-01-01', '2020-02-01', '2020-03-01', '2020-02-02',
        '2020-03-02', pd.NaT, '2020-02-01', '2020-01-02'
    ])
    for ltype in formatted_datetime_ltypes:
        _assert_describe_column(
            formatted_datetime_data, 'formatted_datetime_col', ltype,
            ltype.primary_dtype,
            {'custom_tag'},
            {
                'count': 7,
                'nunique': 6,
                'nan_count': 1,
                'mean': converted_to_datetime.mean(),
                'mode': pd.to_datetime('2020-02-01'),
                'min': converted_to_datetime.min(),
                'max': converted_to_datetime.max(),
            },
            expected_index)

    # Test timedelta columns - Skip for Koalas
    if not (ks and isinstance(describe_df, ks.DataFrame)):
        timedelta_data = describe_df['timedelta_col']
        for ltype in timedelta_ltypes:
            # A fresh single-column frame named 'col' is built per type.
            _assert_describe_column(
                pd.DataFrame({'col': timedelta_data}), 'col', ltype,
                ltype.primary_dtype,
                {'custom_tag'},
                {
                    'count': 7,
                    'nan_count': 1,
                    'mode': pd.Timedelta('31days'),
                },
                expected_index)

    # Test numeric columns with nullable ltypes
    numeric_data = describe_df[['numeric_col']]
    for ltype in nullable_numeric_ltypes:
        _assert_describe_column(
            numeric_data, 'numeric_col', ltype, ltype.primary_dtype,
            {'numeric', 'custom_tag'},
            {
                'count': 7,
                'nunique': 6,
                'nan_count': 1,
                'mean': 20.857142857142858,
                'mode': 10,
                'std': 18.27957486220227,
                'min': 1,
                'first_quartile': 10,
                'second_quartile': 17,
                'third_quartile': 26,
                'max': 56,
            },
            expected_index)

    # Test numeric with non-nullable ltypes
    numeric_data = describe_df[['numeric_col']].fillna(0)
    for ltype in non_nullable_numeric_ltypes:
        _assert_describe_column(
            numeric_data, 'numeric_col', ltype, ltype.primary_dtype,
            {'numeric', 'custom_tag'},
            {
                'count': 8,
                'nunique': 7,
                'nan_count': 0,
                'mean': 18.25,
                'mode': 10,
                'std': 18.460382289804137,
                'min': 0,
                'first_quartile': 7.75,
                'second_quartile': 13.5,
                'third_quartile': 23,
                'max': 56,
            },
            expected_index)

    # Test natural language columns
    natural_language_data = describe_df[['natural_language_col']]
    expected_dtype = 'string'
    for ltype in natural_language_ltypes:
        _assert_describe_column(
            natural_language_data, 'natural_language_col', ltype,
            expected_dtype,
            {'custom_tag'},
            {
                'count': 7,
                'nan_count': 1,
                'mode': 'Duplicate sentence.',
            },
            expected_index)

    # Test latlong columns
    latlong_data = describe_df[['latlong_col']]
    expected_dtype = 'object'
    # The expected mode is a list for Koalas input and a tuple otherwise.
    if ks and isinstance(describe_df, ks.DataFrame):
        mode = [0, 0]
    else:
        mode = (0, 0)
    for ltype in latlong_ltypes:
        _assert_describe_column(
            latlong_data, 'latlong_col', ltype, expected_dtype,
            {'custom_tag'},
            {
                'count': 6,
                'nan_count': 2,
                'mode': mode,
            },
            expected_index)