def test_schema_equality():
    """Check ColumnSchema (in)equality across attribute and logical type parameter differences."""
    base_col = ColumnSchema(logical_type=Categorical)

    # Each variant differs from the base schema by a single keyword argument.
    single_difference_cols = [
        ColumnSchema(logical_type=Categorical, description='description'),
        ColumnSchema(logical_type=Categorical,
                     metadata={'interesting_values': ['a', 'b']}),
        ColumnSchema(logical_type=Categorical, use_standard_tags=True),
        ColumnSchema(logical_type=Categorical, semantic_tags={'new_tag'}),
    ]
    for other_col in single_difference_cols:
        assert base_col != other_col

    # Same logical type class, different instantiation parameters
    first_ordinal = ColumnSchema(logical_type=Ordinal(order=['a', 'b', 'c']))
    second_ordinal = ColumnSchema(logical_type=Ordinal(order=['b', 'a', 'c']))
    assert base_col != first_ordinal
    assert first_ordinal != second_ordinal
    assert first_ordinal == first_ordinal

    with_format = ColumnSchema(logical_type=Datetime(datetime_format='%Y-%m%d'))
    explicit_none_format = ColumnSchema(logical_type=Datetime(datetime_format=None))
    default_instance = ColumnSchema(logical_type=Datetime())
    uninstantiated = ColumnSchema(logical_type=Datetime)
    assert uninstantiated != default_instance
    assert default_instance != with_format
    assert default_instance == explicit_none_format
def test_logical_eq():
    """Check equality between logical type classes and between their instances."""
    expected_equal = [
        (Boolean, Boolean),
        (Boolean(), Boolean()),
        (Datetime(), Datetime(datetime_format=None)),
    ]
    expected_unequal = [
        (Categorical, Boolean),
        (Datetime, Datetime()),
        (Datetime(), Datetime(datetime_format='%Y-%m-%d')),
    ]

    for left, right in expected_equal:
        assert left == right
    for left, right in expected_unequal:
        assert left != right
def test_datetime_transform(datetimes):
    """Transform each object-dtype series and check the resulting dtype and stored format."""
    ltype = Datetime()
    for raw_series in datetimes:
        # Input series are expected to start out as plain object dtype.
        assert str(raw_series.dtype) == "object"
        converted = ltype.transform(raw_series)
        assert str(converted.dtype) == "datetime64[ns]"
        assert ltype.datetime_format is not None
def test_datetime_coerce_user_format():
    """Check that a value not matching the user format triggers a warning and becomes NaT."""
    user_format = "%m/%d/%Y"
    ltype = Datetime(datetime_format=user_format)
    # The third entry has month "13", which does not fit the month-first format.
    dates = pd.Series(["01/01/2017", "01/01/2017", "13/12/2017"], name="dates")
    expected_warning = (
        "Some rows in series 'dates' are incompatible with datetime format "
        f"'{user_format}' and have been replaced with null values. You may be able "
        "to fix this by using an instantiated Datetime logical type with a different "
        "format string specified for this column during Woodwork initialization."
    )

    with pytest.warns(TypeConversionWarning, match=expected_warning):
        transformed = ltype.transform(dates)

    assert str(transformed.dtype) == "datetime64[ns]"
    assert transformed[2] is pd.NaT
    assert ltype.datetime_format == "%m/%d/%Y"
def test_is_col_datetime():
    """Check _is_col_datetime against Datetime and non-Datetime column schemas."""
    datetime_logical_types = (
        Datetime,
        Datetime(datetime_format='%Y-%m%d'),
        Datetime(),
    )
    for ltype in datetime_logical_types:
        assert _is_col_datetime(ColumnSchema(logical_type=ltype))

    for ltype in (NaturalLanguage, Double):
        assert not _is_col_datetime(ColumnSchema(logical_type=ltype))
def test_get_ltype_params():
    """Check the parameter dict returned by _get_specified_ltype_params for classes and instances."""
    ymd = '%Y-%m-%d'
    # (logical type, expected parameter dictionary)
    cases = [
        (Categorical, {}),
        (Categorical(), {}),
        (Datetime, {}),
        (Datetime(), {'datetime_format': None}),
        (Datetime(datetime_format=ymd), {'datetime_format': ymd}),
    ]
    for ltype, expected_params in cases:
        assert _get_specified_ltype_params(ltype) == expected_params
def test_select_semantic_tags_no_match(sample_df):
    """Check column counts from select() when some selectors match nothing."""
    logical_types = {
        'full_name': FullName,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'signup_date': Datetime(datetime_format='%Y-%m-%d'),
    }
    semantic_tags = {
        'full_name': ['new_tag', 'tag2'],
        'age': 'numeric',
        'signup_date': 'date_of_birth',
        'email': 'tag2',
    }

    dt = DataTable(sample_df,
                   time_index='signup_date',
                   index='id',
                   name='dt_name')
    dt = dt.set_types(logical_types=logical_types)
    dt = dt.set_types(semantic_tags=semantic_tags)

    # A lone selector that matches nothing selects no columns
    no_match = dt.select(['doesnt_exist'])
    assert len(no_match.columns) == 0

    # Unmatched selectors mixed with matching ones
    several_unused = dt.select(['doesnt_exist', 'boolean', 'category', PhoneNumber])
    assert len(several_unused.columns) == 2

    unused_ltype = dt.select(['date_of_birth', 'doesnt_exist', ZIPCode, Integer])
    assert len(unused_ltype.columns) == 3
def test_filter_schema_errors(sample_column_names, sample_inferred_logical_types):
    """Check the TypeErrors raised by TableSchema._filter_cols for invalid selectors.

    Covers a dict selector (with and without ``col_names=True``), an
    instantiated LogicalType selector, and an unregistered LogicalType passed
    both as a class and as an instance.
    """
    schema = TableSchema(
        sample_column_names,
        sample_inferred_logical_types,
        time_index="signup_date",
        index="id",
        name="df_name",
    )

    # The same message is expected whether or not column names are allowed.
    invalid_dict_msg = "Invalid selector used in include: {} must be a string, uninstantiated and registered LogicalType, or valid column name"
    with pytest.raises(TypeError, match=invalid_dict_msg):
        schema._filter_cols(include=["boolean", "index", Double, {}])

    with pytest.raises(TypeError, match=invalid_dict_msg):
        schema._filter_cols(include=["boolean", "index", Double, {}], col_names=True)

    err_msg = "Invalid selector used in include: Datetime cannot be instantiated"
    with pytest.raises(TypeError, match=err_msg):
        schema._filter_cols(Datetime())

    # Removing a type mutates the shared type system, so always restore the
    # defaults, even when one of the assertions below fails.
    type_system.remove_type(EmailAddress)
    try:
        err_msg = "Specified LogicalType selector EmailAddress is not registered in Woodwork's type system."
        with pytest.raises(TypeError, match=err_msg):
            schema._filter_cols(EmailAddress)

        err_msg = "Invalid selector used in include: EmailAddress must be a string, uninstantiated and registered LogicalType, or valid column name"
        with pytest.raises(TypeError, match=err_msg):
            schema._filter_cols(EmailAddress())
    finally:
        type_system.reset_defaults()
def test_mutual_info(df_mi):
    """Check mutual information values for default, large-nrows, single-row and two-bin runs."""

    def assert_pairs(result, expectations):
        # Compare each (column, column, expected) triple to 3 decimal places.
        for col_a, col_b, expected in expectations:
            np.testing.assert_almost_equal(
                mi_between_cols(col_a, col_b, result), expected, 3)

    df_mi.ww.init(
        logical_types={'dates': Datetime(datetime_format='%Y-%m-%d')})
    original_df = df_mi.copy()

    mi = df_mi.ww.mutual_information()
    assert mi.shape[0] == 10
    assert_pairs(mi, [
        ('ints', 'bools', 1.0),
        ('ints', 'strs', 0.0),
        ('strs', 'bools', 0),
        ('dates', 'ints', 0.274),
        ('dates', 'bools', 0.274),
    ])

    # An nrows value of 100000 must give the same result as the default call
    mi_many_rows = df_mi.ww.mutual_information(nrows=100000)
    pd.testing.assert_frame_equal(mi, mi_many_rows)

    mi = df_mi.ww.mutual_information(nrows=1)
    assert mi.shape[0] == 10
    assert (mi['mutual_info'] == 1.0).all()

    mi = df_mi.ww.mutual_information(num_bins=2)
    assert mi.shape[0] == 10
    assert_pairs(mi, [
        ('bools', 'ints', 0.0),
        ('strs', 'ints', 1.0),
        ('bools', 'strs', 0),
        ('dates', 'strs', 1.0),
        ('dates', 'ints', 1.0),
    ])

    # Confirm that none of this changed the underlying df
    pd.testing.assert_frame_equal(to_pandas(df_mi), to_pandas(original_df))
def test_datacolumn_equality(sample_series, sample_datetime_series):
    """Check DataColumn (in)equality across constructor arguments, type parameters and data."""
    # Vary one DataColumn argument at a time
    base_col = DataColumn(sample_series, logical_type='Categorical')
    same_as_base = DataColumn(sample_series, logical_type=Categorical)
    differing_cols = [
        DataColumn(sample_series,
                   logical_type=Categorical,
                   semantic_tags={'test'}),
        DataColumn(sample_datetime_series, logical_type=Categorical),
        DataColumn(sample_series, logical_type=NaturalLanguage),
        DataColumn(sample_series,
                   logical_type='Categorical',
                   description='description'),
        DataColumn(sample_series,
                   logical_type='Categorical',
                   metadata={'interesting_values': ['a', 'b']}),
    ]

    assert base_col == same_as_base
    for other_col in differing_cols:
        assert base_col != other_col

    # Same logical type class, different instantiation parameters
    first_ordinal = DataColumn(sample_series,
                               logical_type=Ordinal(order=['a', 'b', 'c']))
    second_ordinal = DataColumn(sample_series,
                                logical_type=Ordinal(order=['b', 'a', 'c']))
    assert base_col != first_ordinal
    assert first_ordinal != second_ordinal
    assert first_ordinal == first_ordinal

    with_format = DataColumn(sample_datetime_series,
                             logical_type=Datetime(datetime_format='%Y-%m%d'))
    explicit_none_format = DataColumn(sample_datetime_series,
                                      logical_type=Datetime(datetime_format=None))
    default_instance = DataColumn(sample_datetime_series,
                                  logical_type=Datetime())
    uninstantiated = DataColumn(sample_datetime_series, logical_type=Datetime)
    assert uninstantiated != default_instance
    assert default_instance != with_format
    assert default_instance == explicit_none_format

    # Same column settings, different underlying values
    original_col = DataColumn(sample_series, logical_type='NaturalLanguage')
    replaced_series = sample_series.copy().replace(to_replace='a', value='test')
    replaced_col = DataColumn(replaced_series, logical_type='NaturalLanguage')

    # Only pandas-backed columns are expected to differ when the data differs
    backed_by_pandas = isinstance(original_col.to_series(), pd.Series)
    if not backed_by_pandas:
        assert original_col == replaced_col
    else:
        assert original_col != replaced_col
def test_get_ltype_params():
    """Check the parameter dict returned by _get_specified_ltype_params, including timezone."""
    ymd = "%Y-%m-%d"
    # (logical type, expected parameter dictionary)
    cases = [
        (Categorical, {}),
        (Categorical(), {}),
        (Datetime, {}),
        (Datetime(), {"datetime_format": None, "timezone": None}),
        (
            Datetime(datetime_format=ymd, timezone="UTC"),
            {"datetime_format": ymd, "timezone": "UTC"},
        ),
    ]
    for ltype, expected_params in cases:
        assert _get_specified_ltype_params(ltype) == expected_params
def test_str_to_logical_type():
    """Check string lookups of logical types, with and without parameters and type systems."""
    lookup = ww.type_system.str_to_logical_type
    registered = ww.type_system.registered_types

    with pytest.raises(ValueError,
                       match='String test is not a valid logical type'):
        lookup('test')
    assert lookup('test', raise_error=False) is None

    # Both the class name and the type string resolve to the class
    for ltype in registered:
        assert lookup(ltype.__name__) == ltype
        assert lookup(ltype.type_string) == ltype

    # Mixed-case spellings
    assert lookup('bOoLeAn') == Boolean
    assert lookup('person_full_NAME') == PersonFullName
    assert lookup('PersonFullnamE') == PersonFullName

    ymd = '%Y-%m-%d'
    formatted = lookup('datetime', params={'datetime_format': ymd})
    assert formatted.__class__ == Datetime
    assert formatted.datetime_format == ymd
    assert formatted == Datetime(datetime_format=ymd)

    unformatted = lookup('datetime', params={'datetime_format': None})
    assert unformatted.__class__ == Datetime
    assert unformatted.datetime_format is None
    assert unformatted == Datetime()

    # An empty params dict yields the class itself, whereas the non-empty
    # dict used above produced an instance
    assert lookup('person_full_NAME', params={}) == PersonFullName
    assert unformatted != Datetime

    # A separate type system only knows the types added to it
    other_type_system = TypeSystem()
    with pytest.raises(ValueError,
                       match='String Integer is not a valid logical type'):
        other_type_system.str_to_logical_type('Integer')
    other_type_system.add_type(Boolean)
    assert Boolean == other_type_system.str_to_logical_type('Boolean')
def test_str_to_logical_type():
    """Check string lookups of logical types, with and without parameters and type systems."""
    lookup = ww.type_system.str_to_logical_type
    registered = ww.type_system.registered_types

    with pytest.raises(ValueError,
                       match="String test is not a valid logical type"):
        lookup("test")
    assert lookup("test", raise_error=False) is None

    # Both the class name and the type string resolve to the class
    for ltype in registered:
        assert lookup(ltype.__name__) == ltype
        assert (lookup(ltype.type_string) == ltype)

    # Mixed-case spellings
    assert lookup("bOoLeAn") == Boolean
    assert lookup("person_full_NAME") == PersonFullName
    assert lookup("PersonFullnamE") == PersonFullName

    ymd = "%Y-%m-%d"
    formatted = lookup("datetime", params={"datetime_format": ymd})
    assert formatted.__class__ == Datetime
    assert formatted.datetime_format == ymd
    assert formatted == Datetime(datetime_format=ymd)

    unformatted = lookup("datetime", params={"datetime_format": None})
    assert unformatted.__class__ == Datetime
    assert unformatted.datetime_format is None
    assert unformatted == Datetime()

    # An empty params dict yields the class itself, whereas the non-empty
    # dict used above produced an instance
    assert (lookup("person_full_NAME", params={}) == PersonFullName)
    assert unformatted != Datetime

    # A separate type system only knows the types added to it
    other_type_system = TypeSystem()
    with pytest.raises(ValueError,
                       match="String Integer is not a valid logical type"):
        other_type_system.str_to_logical_type("Integer")
    other_type_system.add_type(Boolean)
    assert Boolean == other_type_system.str_to_logical_type("Boolean")
def test_is_datetime():
    """Check ColumnSchema.is_datetime for Datetime, non-Datetime and empty schemas."""
    datetime_logical_types = (
        Datetime,
        Datetime(datetime_format='%Y-%m%d'),
        Datetime(),
    )
    for ltype in datetime_logical_types:
        schema = ColumnSchema(logical_type=ltype)
        assert schema.is_datetime

    for ltype in (NaturalLanguage, Double):
        schema = ColumnSchema(logical_type=ltype)
        assert not schema.is_datetime

    # A schema created with no arguments is not a datetime column either
    assert not ColumnSchema().is_datetime
def test_schema_equality():
    """Check ColumnSchema (in)equality across attribute and logical type parameter differences."""
    base_col = ColumnSchema(logical_type=Categorical)

    # Each variant differs from the base schema by a single keyword argument.
    single_difference_cols = [
        ColumnSchema(logical_type=Categorical, description="description"),
        ColumnSchema(logical_type=Categorical, origin="base"),
        ColumnSchema(logical_type=Categorical,
                     metadata={"interesting_values": ["a", "b"]}),
        ColumnSchema(logical_type=Categorical, use_standard_tags=True),
        ColumnSchema(logical_type=Categorical, semantic_tags={"new_tag"}),
    ]
    for other_col in single_difference_cols:
        assert base_col != other_col

    # Same logical type class, different instantiation parameters
    first_ordinal = ColumnSchema(logical_type=Ordinal(order=["a", "b", "c"]))
    second_ordinal = ColumnSchema(logical_type=Ordinal(order=["b", "a", "c"]))
    assert base_col != first_ordinal
    assert first_ordinal != second_ordinal
    assert first_ordinal == first_ordinal

    with_format = ColumnSchema(logical_type=Datetime(datetime_format="%Y-%m%d"))
    explicit_none_format = ColumnSchema(
        logical_type=Datetime(datetime_format=None))
    default_instance = ColumnSchema(logical_type=Datetime())
    assert default_instance != with_format
    assert default_instance == explicit_none_format
def test_schema_types(sample_column_names, sample_inferred_logical_types):
    """Check the shape and contents of the DataFrame returned by TableSchema.types."""
    sample_column_names.append("formatted_date")
    ymd_format = Datetime(datetime_format="%Y~%m~%d")

    schema_ltypes = dict(sample_inferred_logical_types)
    schema_ltypes["formatted_date"] = ymd_format
    schema = TableSchema(
        sample_column_names,
        logical_types=schema_ltypes,
        use_standard_tags=True,
    )

    types_df = schema.types
    assert isinstance(types_df, pd.DataFrame)
    for column_label in ("Logical Type", "Semantic Tag(s)"):
        assert column_label in types_df.columns
    assert types_df.shape[1] == 2
    assert len(types_df.index) == len(sample_column_names)

    # Expected logical types are instances of the inferred classes
    expected_ltypes = {}
    for name, ltype in sample_inferred_logical_types.items():
        expected_ltypes[name] = ltype()
    expected_ltypes["formatted_date"] = ymd_format
    expected_ltype_series = pd.Series(list(expected_ltypes.values()),
                                      index=list(expected_ltypes.keys()))
    assert expected_ltype_series.equals(types_df["Logical Type"])

    # (column name, string form of its semantic tags)
    expected_tag_pairs = [
        ("id", "['numeric']"),
        ("full_name", "[]"),
        ("email", "[]"),
        ("phone_number", "[]"),
        ("age", "['numeric']"),
        ("signup_date", "[]"),
        ("is_registered", "[]"),
        ("double", "['numeric']"),
        ("double_with_nan", "['numeric']"),
        ("integer", "['numeric']"),
        ("nullable_integer", "['numeric']"),
        ("boolean", "[]"),
        ("categorical", "['category']"),
        ("datetime_with_NaT", "[]"),
        ("url", "[]"),
        ("ip_address", "[]"),
        ("formatted_date", "[]"),
    ]
    expected_tag_series = pd.Series(
        [tags for _, tags in expected_tag_pairs],
        index=[name for name, _ in expected_tag_pairs])
    assert expected_tag_series.equals(types_df["Semantic Tag(s)"])
def test_schema_types(sample_column_names, sample_inferred_logical_types):
    """Check the shape and contents of the DataFrame returned by TableSchema.types."""
    sample_column_names.append('formatted_date')
    ymd_format = Datetime(datetime_format='%Y~%m~%d')

    schema_ltypes = dict(sample_inferred_logical_types)
    schema_ltypes['formatted_date'] = ymd_format
    schema = TableSchema(sample_column_names,
                         logical_types=schema_ltypes,
                         use_standard_tags=True)

    types_df = schema.types
    assert isinstance(types_df, pd.DataFrame)
    for column_label in ('Logical Type', 'Semantic Tag(s)'):
        assert column_label in types_df.columns
    assert types_df.shape[1] == 2
    assert len(types_df.index) == len(sample_column_names)

    # (column name, expected logical type)
    expected_ltype_pairs = [
        ('id', Integer),
        ('full_name', NaturalLanguage),
        ('email', NaturalLanguage),
        ('phone_number', NaturalLanguage),
        ('age', Integer),
        ('signup_date', Datetime),
        ('is_registered', Boolean),
        ('formatted_date', ymd_format),
    ]
    expected_ltype_series = pd.Series(
        [ltype for _, ltype in expected_ltype_pairs],
        index=[name for name, _ in expected_ltype_pairs])
    assert expected_ltype_series.equals(types_df['Logical Type'])

    # (column name, string form of its semantic tags)
    expected_tag_pairs = [
        ('id', "['numeric']"),
        ('full_name', "[]"),
        ('email', "[]"),
        ('phone_number', "[]"),
        ('age', "['numeric']"),
        ('signup_date', "[]"),
        ('is_registered', "[]"),
        ('formatted_date', "[]"),
    ]
    expected_tag_series = pd.Series(
        [tags for _, tags in expected_tag_pairs],
        index=[name for name, _ in expected_tag_pairs])
    assert expected_tag_series.equals(types_df['Semantic Tag(s)'])
def make_logical_types(with_integer_time_index=False):
    """Build the per-table mapping of column name to logical type.

    Args:
        with_integer_time_index (bool): When True, the log ``datetime`` column
            and the customer date columns are mapped to ``Integer`` instead of
            a formatted ``Datetime``.

    Returns:
        dict: Table name mapped to a ``{column name: logical type}`` dict.
    """
    customers = {
        'id': Integer,
        'age': Integer,
        u'région_id': Categorical,
        'loves_ice_cream': Boolean,
        'favorite_quote': NaturalLanguage,
        'signup_date': Datetime(datetime_format='%Y-%m-%d'),
        'upgrade_date': Datetime(datetime_format='%Y-%m-%d'),
        'cancel_date': Datetime(datetime_format='%Y-%m-%d'),
        'cancel_reason': Categorical,
        'engagement_level': Ordinal(order=[1, 2, 3]),
        'full_name': PersonFullName,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'date_of_birth': Datetime(datetime_format='%Y-%m-%d'),
        'cohort_name': Categorical,
    }
    sessions = {
        'id': Integer,
        'customer_id': Integer,
        'device_type': Categorical,
        'device_name': Categorical,
        'ip': IPAddress,
        'filepath': Filepath,
    }
    log = {
        'id': Integer,
        'session_id': Integer,
        'product_id': Categorical,
        'datetime': Datetime(datetime_format='%Y-%m-%d'),
        'value': Double,
        'value_2': Double,
        'latlong': LatLong,
        'latlong2': LatLong,
        'zipcode': PostalCode,
        'countrycode': CountryCode,
        'subregioncode': SubRegionCode,
        'value_many_nans': Double,
        'priority_level': Ordinal(order=[0, 1, 2]),
        'purchased': Boolean,
        'url': URL,
        'email_address': EmailAddress,
        'comments': NaturalLanguage,
    }
    products = {
        'id': Categorical,
        'rating': Double,
        'department': Categorical,
        'url': URL,
    }
    stores = {'id': Integer, u'région_id': Categorical}
    regions = {'id': Categorical, 'language': Categorical}

    if with_integer_time_index:
        # Replace every date-like column with a plain Integer type
        log['datetime'] = Integer
        for date_column in ('signup_date', 'upgrade_date', 'cancel_date',
                            'date_of_birth'):
            customers[date_column] = Integer

    logical_types_by_table = {
        'customers': customers,
        'sessions': sessions,
        'log': log,
        'products': products,
        'stores': stores,
        u'régions': regions,
    }
    return logical_types_by_table
def make_logical_types(with_integer_time_index=False):
    """Build the per-table mapping of column name to logical type.

    Args:
        with_integer_time_index (bool): When True, the log ``datetime`` column
            and the customer date columns are mapped to ``Integer`` instead of
            a formatted ``Datetime``.

    Returns:
        dict: Table name mapped to a ``{column name: logical type}`` dict.
    """
    customers = {
        "id": Integer,
        "age": Integer,
        "région_id": Categorical,
        "loves_ice_cream": Boolean,
        "favorite_quote": NaturalLanguage,
        "signup_date": Datetime(datetime_format="%Y-%m-%d"),
        "upgrade_date": Datetime(datetime_format="%Y-%m-%d"),
        "cancel_date": Datetime(datetime_format="%Y-%m-%d"),
        "cancel_reason": Categorical,
        "engagement_level": Ordinal(order=[1, 2, 3]),
        "full_name": PersonFullName,
        "email": EmailAddress,
        "phone_number": PhoneNumber,
        "birthday": Datetime(datetime_format="%Y-%m-%d"),
        "cohort_name": Categorical,
    }
    sessions = {
        "id": Integer,
        "customer_id": Integer,
        "device_type": Categorical,
        "device_name": Categorical,
        "ip": IPAddress,
        "filepath": Filepath,
    }
    log = {
        "id": Integer,
        "session_id": Integer,
        "product_id": Categorical,
        "datetime": Datetime(datetime_format="%Y-%m-%d"),
        "value": Double,
        "value_2": Double,
        "latlong": LatLong,
        "latlong2": LatLong,
        "zipcode": PostalCode,
        "countrycode": CountryCode,
        "subregioncode": SubRegionCode,
        "value_many_nans": Double,
        "priority_level": Ordinal(order=[0, 1, 2]),
        "purchased": Boolean,
        "url": URL,
        "email_address": EmailAddress,
        "comments": NaturalLanguage,
    }
    products = {
        "id": Categorical,
        "rating": Double,
        "department": Categorical,
        "url": URL,
    }
    stores = {"id": Integer, "région_id": Categorical}
    regions = {"id": Categorical, "language": Categorical}

    if with_integer_time_index:
        # Replace every date-like column with a plain Integer type
        log["datetime"] = Integer
        for date_column in ("signup_date", "upgrade_date", "cancel_date",
                            "birthday"):
            customers[date_column] = Integer

    logical_types_by_table = {
        "customers": customers,
        "sessions": sessions,
        "log": log,
        "products": products,
        "stores": stores,
        "régions": regions,
    }
    return logical_types_by_table
def test_parse_logical_type():
    """Check _parse_logical_type for a string, a class and an instantiated logical type."""
    # The string name and the class both parse to the Datetime class
    for selector in ('Datetime', Datetime):
        assert _parse_logical_type(selector, 'col_name') == Datetime

    # An instance with parameters comes back equal to itself
    formatted_ltype = Datetime(datetime_format='%Y-%m-%d')
    parsed = _parse_logical_type(formatted_ltype, 'col_name')
    assert parsed == formatted_ltype
def test_parse_logical_type():
    """Check _parse_logical_type for a string, a class and an instantiated logical type."""
    # The string name and the class both parse to a Datetime instance
    for selector in ("Datetime", Datetime):
        assert isinstance(_parse_logical_type(selector, "col_name"), Datetime)

    # An instance with parameters comes back equal to itself
    formatted_ltype = Datetime(datetime_format="%Y-%m-%d")
    parsed = _parse_logical_type(formatted_ltype, "col_name")
    assert parsed == formatted_ltype
def test_describe_accessor_method(describe_df):
    """Check the statistics returned by ``ww.describe()`` for each group of logical types.

    Every group initializes Woodwork on a single-column frame with the logical
    type under test plus a ``custom_tag`` semantic tag, then compares the
    non-null describe output for that column with the expected values.
    """
    expected_index = [
        'physical_type', 'logical_type', 'semantic_tags', 'count', 'nunique',
        'nan_count', 'mean', 'mode', 'std', 'min', 'first_quartile',
        'second_quartile', 'third_quartile', 'max', 'num_true', 'num_false'
    ]

    def check_describe(frame, column, ltype, expected):
        # Build the expected column, initialize Woodwork, then compare.
        expected_vals = pd.Series(expected, name=column)
        frame.ww.init(logical_types={column: ltype},
                      semantic_tags={column: 'custom_tag'})
        stats_df = frame.ww.describe()
        assert isinstance(stats_df, pd.DataFrame)
        assert set(stats_df.columns) == {column}
        assert stats_df.index.tolist() == expected_index
        assert expected_vals.equals(stats_df[column].dropna())

    # Test categorical columns
    category_data = describe_df[['category_col']]
    if ks and isinstance(category_data, ks.DataFrame):
        expected_dtype = 'string'
    else:
        expected_dtype = 'category'
    categorical_ltypes = [
        Categorical, CountryCode,
        Ordinal(order=('yellow', 'red', 'blue')), PostalCode, SubRegionCode
    ]
    for ltype in categorical_ltypes:
        check_describe(category_data, 'category_col', ltype, {
            'physical_type': expected_dtype,
            'logical_type': ltype,
            'semantic_tags': {'category', 'custom_tag'},
            'count': 7,
            'nunique': 3,
            'nan_count': 1,
            'mode': 'red'
        })

    # Test nullable boolean columns
    boolean_data = describe_df[['boolean_col']]
    for ltype in [BooleanNullable]:
        expected_dtype = ltype.primary_dtype
        check_describe(boolean_data, 'boolean_col', ltype, {
            'physical_type': expected_dtype,
            'logical_type': ltype,
            'semantic_tags': {'custom_tag'},
            'count': 7,
            'nan_count': 1,
            'mode': True,
            'num_true': 4,
            'num_false': 3
        })

    # Test non-nullable boolean columns
    boolean_data = describe_df[['boolean_col']].fillna(True)
    for ltype in [Boolean]:
        expected_dtype = ltype.primary_dtype
        check_describe(boolean_data, 'boolean_col', ltype, {
            'physical_type': expected_dtype,
            'logical_type': ltype,
            'semantic_tags': {'custom_tag'},
            'count': 8,
            'nan_count': 0,
            'mode': True,
            'num_true': 5,
            'num_false': 3
        })

    # Test datetime columns
    datetime_data = describe_df[['datetime_col']]
    for ltype in [Datetime]:
        check_describe(datetime_data, 'datetime_col', ltype, {
            'physical_type': ltype.primary_dtype,
            'logical_type': ltype,
            'semantic_tags': {'custom_tag'},
            'count': 7,
            'nunique': 6,
            'nan_count': 1,
            'mean': pd.Timestamp('2020-01-19 09:25:42.857142784'),
            'mode': pd.Timestamp('2020-02-01 00:00:00'),
            'min': pd.Timestamp('2020-01-01 00:00:00'),
            'max': pd.Timestamp('2020-02-02 18:00:00')
        })

    # Test formatted datetime columns
    formatted_datetime_data = describe_df[['formatted_datetime_col']]
    for ltype in [Datetime(datetime_format='%Y~%m~%d')]:
        converted_to_datetime = pd.to_datetime([
            '2020-01-01', '2020-02-01', '2020-03-01', '2020-02-02',
            '2020-03-02', pd.NaT, '2020-02-01', '2020-01-02'
        ])
        check_describe(formatted_datetime_data, 'formatted_datetime_col', ltype, {
            'physical_type': ltype.primary_dtype,
            'logical_type': ltype,
            'semantic_tags': {'custom_tag'},
            'count': 7,
            'nunique': 6,
            'nan_count': 1,
            'mean': converted_to_datetime.mean(),
            'mode': pd.to_datetime('2020-02-01'),
            'min': converted_to_datetime.min(),
            'max': converted_to_datetime.max()
        })

    # Test timedelta columns - Skip for Koalas
    if not (ks and isinstance(describe_df, ks.DataFrame)):
        timedelta_data = describe_df['timedelta_col']
        for ltype in [Timedelta]:
            expected = {
                'physical_type': ltype.primary_dtype,
                'logical_type': ltype,
                'semantic_tags': {'custom_tag'},
                'count': 7,
                'nan_count': 1,
                'mode': pd.Timedelta('31days')
            }
            # The timedelta values are placed in a fresh frame under the name 'col'
            df = pd.DataFrame({'col': timedelta_data})
            check_describe(df, 'col', ltype, expected)

    # Test numeric columns with nullable ltypes
    numeric_data = describe_df[['numeric_col']]
    for ltype in [Double, IntegerNullable]:
        check_describe(numeric_data, 'numeric_col', ltype, {
            'physical_type': ltype.primary_dtype,
            'logical_type': ltype,
            'semantic_tags': {'numeric', 'custom_tag'},
            'count': 7,
            'nunique': 6,
            'nan_count': 1,
            'mean': 20.857142857142858,
            'mode': 10,
            'std': 18.27957486220227,
            'min': 1,
            'first_quartile': 10,
            'second_quartile': 17,
            'third_quartile': 26,
            'max': 56
        })

    # Test numeric with non-nullable ltypes
    numeric_data = describe_df[['numeric_col']].fillna(0)
    for ltype in [Integer]:
        check_describe(numeric_data, 'numeric_col', ltype, {
            'physical_type': ltype.primary_dtype,
            'logical_type': ltype,
            'semantic_tags': {'numeric', 'custom_tag'},
            'count': 8,
            'nunique': 7,
            'nan_count': 0,
            'mean': 18.25,
            'mode': 10,
            'std': 18.460382289804137,
            'min': 0,
            'first_quartile': 7.75,
            'second_quartile': 13.5,
            'third_quartile': 23,
            'max': 56
        })

    # Test natural language columns
    natural_language_data = describe_df[['natural_language_col']]
    expected_dtype = 'string'
    natural_language_ltypes = [
        EmailAddress, Filepath, PersonFullName, IPAddress, PhoneNumber, URL
    ]
    for ltype in natural_language_ltypes:
        check_describe(natural_language_data, 'natural_language_col', ltype, {
            'physical_type': expected_dtype,
            'logical_type': ltype,
            'semantic_tags': {'custom_tag'},
            'count': 7,
            'nan_count': 1,
            'mode': 'Duplicate sentence.'
        })

    # Test latlong columns
    latlong_data = describe_df[['latlong_col']]
    expected_dtype = 'object'
    for ltype in [LatLong]:
        # The expected mode is a list for Koalas frames and a tuple otherwise
        if ks and isinstance(describe_df, ks.DataFrame):
            mode = [0, 0]
        else:
            mode = (0, 0)
        check_describe(latlong_data, 'latlong_col', ltype, {
            'physical_type': expected_dtype,
            'logical_type': ltype,
            'semantic_tags': {'custom_tag'},
            'count': 6,
            'nan_count': 2,
            'mode': mode
        })