def test_select_ltypes_table(sample_df):
    """Check table-level metadata and column details after ``select``.

    Verifies that the index and time index are dropped when the selection
    excludes them, are kept when the selection includes them, and that a
    selected column has the same logical type, data, dtype and semantic tags
    as the column in the table it was selected from.
    """
    dt = DataTable(sample_df, time_index='signup_date', index='id')
    dt = dt.set_types(logical_types={
        'full_name': FullName,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'age': Double,
        'signup_date': Datetime,
    })
    # set_types returns a new table rather than updating in place, so the
    # result has to be rebound or the tags below are never applied.
    dt = dt.set_types(semantic_tags={
        'full_name': ['new_tag', 'tag2'],
        'age': 'numeric',
    })

    # Selection without the index columns drops the index settings.
    dt_no_indices = dt.select('phone_number')
    assert dt_no_indices.index is None
    assert dt_no_indices.time_index is None

    # Selection that includes both index columns keeps the index settings.
    dt_with_indices = dt.select(['Datetime', 'Integer'])
    assert dt_with_indices.index == 'id'
    assert dt_with_indices.time_index == 'signup_date'

    dt_values = dt.select(['FullName'])
    assert dt_values.name == dt.name

    source_col = dt.columns['full_name']
    selected_col = dt_values.columns['full_name']
    assert source_col.logical_type == selected_col.logical_type
    assert to_pandas(source_col.to_series()).equals(to_pandas(selected_col.to_series()))
    assert source_col.dtype == selected_col.dtype
    assert source_col.semantic_tags == selected_col.semantic_tags
def test_invalid_dtype_casting():
    """Check that incompatible logical type conversions raise ``TypeError``.

    Three cases are exercised: a column containing ``pd.NA`` created as
    Double, a Datetime column updated to Double, and non-numeric strings
    created as Integer. Every expected message is passed through
    ``re.escape`` so that characters such as ``.`` and ``[`` are matched
    literally instead of being interpreted as regex syntax.
    """
    column_name = 'test_series'

    # Cannot cast a column with pd.NA to Double
    series = pd.Series([1.1, pd.NA, 3], name=column_name)
    ltypes = {
        column_name: Double,
    }
    err_msg = 'Error converting datatype for column test_series from type object to type ' \
        'float64. Please confirm the underlying data is consistent with logical type Double.'
    with pytest.raises(TypeError, match=re.escape(err_msg)):
        DataTable(pd.DataFrame(series), logical_types=ltypes)

    # Cannot cast Datetime to Double
    series = pd.Series(['2020-01-01', '2020-01-02', '2020-01-03'], name=column_name)
    ltypes = {
        column_name: Datetime,
    }
    dt = DataTable(pd.DataFrame(series), logical_types=ltypes)
    err_msg = 'Error converting datatype for column test_series from type datetime64[ns] to type ' \
        'float64. Please confirm the underlying data is consistent with logical type Double.'
    with pytest.raises(TypeError, match=re.escape(err_msg)):
        dt.set_types(logical_types={column_name: Double})

    # Cannot cast invalid strings to integers
    series = pd.Series(['1', 'two', '3'], name=column_name)
    ltypes = {
        column_name: Integer,
    }
    err_msg = 'Error converting datatype for column test_series from type object to type ' \
        'Int64. Please confirm the underlying data is consistent with logical type Integer.'
    with pytest.raises(TypeError, match=re.escape(err_msg)):
        DataTable(pd.DataFrame(series), logical_types=ltypes)
def test_select_semantic_tags_no_match(sample_df):
    """Check ``select`` when some or all requested tags/types match nothing.

    Selectors that match no column are expected to be ignored; only the
    number of columns in each resulting table is asserted.
    """
    table = DataTable(sample_df, time_index='signup_date', index='id', name='dt_name')

    ltype_updates = {
        'full_name': FullName,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'signup_date': Datetime(datetime_format='%Y-%m-%d'),
    }
    table = table.set_types(logical_types=ltype_updates)

    tag_updates = {
        'full_name': ['new_tag', 'tag2'],
        'age': 'numeric',
        'signup_date': 'date_of_birth',
        'email': 'tag2'
    }
    table = table.set_types(semantic_tags=tag_updates)

    # A selector that matches nothing gives an empty table.
    nothing_selected = table.select(['doesnt_exist'])
    assert len(nothing_selected.columns) == 0

    # Mix of unmatched and matched selectors.
    partly_matched = table.select(['doesnt_exist', 'boolean', 'category', PhoneNumber])
    assert len(partly_matched.columns) == 2

    # Includes a logical type (ZIPCode) that no column was assigned above.
    with_unused_ltype = table.select(['date_of_birth', 'doesnt_exist', ZIPCode, Integer])
    assert len(with_unused_ltype.columns) == 3
def test_set_logical_types_invalid_data(sample_df):
    """Check the errors ``set_types`` raises for bad ``logical_types`` input.

    An unknown column name is expected to raise ``LookupError``; a value that
    is not a logical type (here the builtin ``int``) is expected to raise
    ``TypeError``.
    """
    table = DataTable(sample_df)

    # Column name that does not exist in the dataframe.
    missing_column_msg = re.escape(
        "logical_types contains columns that are not present in dataframe: ['birthday']")
    with pytest.raises(LookupError, match=missing_column_msg):
        table.set_types(logical_types={'birthday': Double})

    # Value that is not a logical type.
    invalid_ltype_msg = "Invalid logical type specified for 'age'"
    with pytest.raises(TypeError, match=invalid_ltype_msg):
        table.set_types(logical_types={'age': int})
def test_set_semantic_tags_with_index(sample_df):
    """Check how ``set_types`` treats the ``index`` tag on the index column.

    By default the ``index`` tag is expected to remain alongside the new
    tag; with ``retain_index_tags=False`` only the new tag is expected.
    """
    table = DataTable(sample_df, index='id', use_standard_tags=False)
    assert table.columns['id'].semantic_tags == {'index'}

    tag_updates = {
        'id': 'new_tag',
    }

    # Default call: index tag kept.
    table = table.set_types(semantic_tags=tag_updates)
    assert table.columns['id'].semantic_tags == {'index', 'new_tag'}

    # Explicit opt-out: index tag removed.
    table = table.set_types(semantic_tags=tag_updates, retain_index_tags=False)
    assert table.columns['id'].semantic_tags == {'new_tag'}
def test_set_semantic_tags_with_time_index(sample_df):
    """Check how ``set_types`` treats the ``time_index`` tag.

    By default the ``time_index`` tag is expected to remain alongside the
    new tag; with ``retain_index_tags=False`` only the new tag is expected.
    """
    table = DataTable(sample_df, time_index='signup_date', use_standard_tags=False)
    assert table.columns['signup_date'].semantic_tags == {'time_index'}

    tag_updates = {
        'signup_date': 'new_tag',
    }

    # Default call: time index tag kept.
    table = table.set_types(semantic_tags=tag_updates)
    assert table.columns['signup_date'].semantic_tags == {'time_index', 'new_tag'}

    # Explicit opt-out: time index tag removed.
    table = table.set_types(semantic_tags=tag_updates, retain_index_tags=False)
    assert table.columns['signup_date'].semantic_tags == {'new_tag'}
def test_sets_object_dtype_on_update(latlong_df):
    """Check dtypes after changing each column from NaturalLanguage to LatLong.

    Each column of ``latlong_df`` is wrapped in its own single-column table,
    updated to ``LatLong``, and checked against ``LatLong.pandas_dtype`` on
    both the column object and the underlying dataframe.
    """
    for name in latlong_df.columns:
        single_column_df = latlong_df.loc[:, [name]]
        table = DataTable(single_column_df, logical_types={name: NaturalLanguage})
        table = table.set_types(logical_types={name: LatLong})

        assert table.columns[name].logical_type == LatLong
        assert table.columns[name].dtype == LatLong.pandas_dtype
        assert table.to_dataframe()[name].dtype == LatLong.pandas_dtype
def test_sets_float64_dtype_on_update():
    """Check dtypes after changing an Integer column to Double.

    The column starts as object-dtype data typed as ``Integer``; after the
    update both the column object and the underlying dataframe are expected
    to report ``Double.pandas_dtype``.
    """
    column_name = 'test_series'
    values = pd.Series([0, 1, 0], name=column_name).astype('object')

    table = DataTable(pd.DataFrame(values), logical_types={column_name: Integer})
    table = table.set_types(logical_types={column_name: Double})

    assert table.columns[column_name].logical_type == Double
    assert table.columns[column_name].dtype == Double.pandas_dtype
    assert table.to_dataframe()[column_name].dtype == Double.pandas_dtype
def test_select_ltypes_mixed(sample_df):
    """Check ``select`` with a mix of string and class selectors.

    The selection combines two strings (``'FullName'``, ``'email_address'``)
    with the ``Double`` class and is expected to give three columns,
    excluding ``phone_number``.
    """
    ltype_updates = {
        'full_name': FullName,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'age': Double,
        'signup_date': Datetime,
    }
    table = DataTable(sample_df).set_types(logical_types=ltype_updates)

    selected = table.select(['FullName', 'email_address', Double])
    assert len(selected.columns) == 3
    assert 'phone_number' not in selected.columns
def test_sets_datetime_dtype_on_update():
    """Check dtypes after changing a NaturalLanguage column to Datetime.

    The column holds date strings stored as object dtype; after the update
    both the column object and the underlying dataframe are expected to
    report ``Datetime.pandas_dtype``.
    """
    column_name = 'test_series'
    date_strings = ['2020-01-01', '2020-01-02', '2020-01-03']
    values = pd.Series(date_strings, name=column_name).astype('object')

    table = DataTable(pd.DataFrame(values), logical_types={column_name: NaturalLanguage})
    table = table.set_types(logical_types={column_name: Datetime})

    assert table.columns[column_name].logical_type == Datetime
    assert table.columns[column_name].dtype == Datetime.pandas_dtype
    assert table.to_dataframe()[column_name].dtype == Datetime.pandas_dtype
def test_select_ltypes_no_match_and_all(sample_df):
    """Check ``select`` for an unused logical type and for all registered types.

    Selecting a type no column was assigned is expected to give an empty
    table; selecting every registered type is expected to give as many
    columns as the source table.
    """
    ltype_updates = {
        'full_name': FullName,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'age': Double,
        'signup_date': Datetime,
    }
    table = DataTable(sample_df).set_types(logical_types=ltype_updates)

    # ZIPCode is not assigned to any column above.
    assert len(table.select(ZIPCode).columns) == 0
    assert len(table.select(['ZIPCode', PhoneNumber]).columns) == 1

    # Selecting every registered type keeps the full set of columns.
    everything = table.select(ww.type_system.registered_types)
    assert len(everything.columns) == len(table.columns)
    assert len(everything.to_dataframe().columns) == len(table.to_dataframe().columns)
def test_sets_int64_dtype_on_update():
    """Check dtypes after changing a Double column to Integer.

    The column holds whole-number floats stored as object dtype; after the
    update both the column object and the underlying dataframe are expected
    to report ``Integer.pandas_dtype``.
    """
    column_name = 'test_series'
    values = pd.Series([1.0, 2.0, 1.0], name=column_name).astype('object')
    target_type = Integer

    table = DataTable(pd.DataFrame(values), logical_types={column_name: Double})
    table = table.set_types(logical_types={column_name: target_type})

    assert table.columns[column_name].logical_type == target_type
    assert table.columns[column_name].dtype == target_type.pandas_dtype
    assert table.to_dataframe()[column_name].dtype == target_type.pandas_dtype
def test_select_ltypes_objects(sample_df):
    """Check ``select`` when logical types are passed as classes.

    A list of five classes is expected to give five columns, excluding
    ``phone_number`` and ``id``; a single class passed without a list is
    expected to give one column.
    """
    ltype_updates = {
        'full_name': FullName,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'age': Double,
        'signup_date': Datetime,
    }
    table = DataTable(sample_df).set_types(logical_types=ltype_updates)

    # Several logical type classes in a list.
    requested_types = [FullName, EmailAddress, Double, Boolean, Datetime]
    several = table.select(requested_types)
    assert len(several.columns) == 5
    for excluded in ('phone_number', 'id'):
        assert excluded not in several.columns

    # One logical type class passed directly.
    single = table.select(FullName)
    assert len(single.columns) == 1
def test_underlying_index_on_update(sample_df):
    """Check the pandas index type of the underlying data after table operations.

    The test is marked xfail for Dask and Koalas input. Index classes are
    compared with ``type(...) == ...`` (exact class, not ``isinstance``), on
    both ``_dataframe`` and the result of ``to_dataframe()``.
    """
    if dd and isinstance(sample_df, dd.DataFrame):
        pytest.xfail('Setting underlying index is not supported with Dask input')
    if ks and isinstance(sample_df, ks.DataFrame):
        pytest.xfail('Setting underlying index is not supported with Koalas input')

    def check_index_type(table, expected_type):
        # Same pair of checks used after every operation below.
        assert type(table._dataframe.index) == expected_type
        assert type(table.to_dataframe().index) == expected_type

    dt = DataTable(sample_df.copy(), index='id')

    # Replace the data with the last two rows of the sample.
    dt.update_dataframe(sample_df.tail(2))
    assert (dt._dataframe.index == [2, 3]).all()
    assert dt._dataframe.index.name is None
    check_index_type(dt, pd.Int64Index)

    # Positional row selection.
    check_index_type(dt.iloc[[0, 1]], pd.Index)

    # Selections that produce a new table from the indexed one.
    check_index_type(dt.select(dt.index), pd.Int64Index)
    check_index_type(dt[['age']], pd.Int64Index)

    # Operations that remove the index column or its index tag.
    check_index_type(dt.drop(dt.index), pd.RangeIndex)
    check_index_type(dt.reset_semantic_tags(retain_index_tags=False), pd.RangeIndex)
    check_index_type(
        dt.set_types(retain_index_tags=False, semantic_tags={'id': 'numeric'}),
        pd.RangeIndex,
    )

    # pop changes dt itself, so it stays last.
    dt.pop(dt.index)
    check_index_type(dt, pd.RangeIndex)
def test_set_semantic_tags(sample_df):
    """Check that ``set_types`` applies semantic tags on a new table.

    The original table must keep the tags it was created with, the returned
    table must be a different object, and its tags must equal the newly
    supplied ones.
    """
    initial_tags = {'full_name': 'tag1', 'age': ['numeric', 'age']}
    expected_initial = {'full_name': {'tag1'}, 'age': {'numeric', 'age'}}

    table = DataTable(sample_df, semantic_tags=initial_tags)
    for name in ('full_name', 'age'):
        assert table.columns[name].semantic_tags == expected_initial[name]

    tag_updates = {
        'full_name': ['new_tag'],
        'age': 'numeric',
    }
    updated = table.set_types(semantic_tags=tag_updates)

    # Verify original tags were not changed
    assert table.columns['full_name'].semantic_tags == {'tag1'}
    assert table.columns['age'].semantic_tags == {'numeric', 'age'}

    # The update is returned as a separate table carrying the new tags.
    assert updated is not table
    assert updated.columns['full_name'].semantic_tags == {'new_tag'}
    assert updated.columns['age'].semantic_tags == {'numeric'}
def test_set_logical_types(sample_df):
    """Check that ``set_types`` applies logical types on a new table.

    Asserts that the original table keeps its logical types, that the
    returned table is a different object with new column objects, that the
    updated columns carry the expected semantic tags, and that a column not
    included in the update keeps its type and tags.
    """
    initial_tags = {
        'full_name': 'tag1',
        'email': ['tag2'],
        'phone_number': ['tag3', 'tag2'],
        'signup_date': {'secondary_time_index'},
    }
    table = DataTable(sample_df, semantic_tags=initial_tags, use_standard_tags=True)

    requested_ltypes = {
        'full_name': Categorical,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'age': Double,
    }
    updated = table.set_types(logical_types=requested_ltypes)

    # Verify original types were not changed
    untouched_ltypes = [
        ('full_name', NaturalLanguage),
        ('email', NaturalLanguage),
        ('phone_number', NaturalLanguage),
        ('age', Integer),
        ('signup_date', Datetime),
    ]
    for name, ltype in untouched_ltypes:
        assert table.columns[name].logical_type == ltype

    assert updated is not table
    for name, ltype in requested_ltypes.items():
        assert updated.columns[name].logical_type == ltype

    # Verify new column object was created
    assert updated.columns['full_name'] is not table.columns['full_name']

    # Verify semantic tags were reset to standard tags
    standard_tags = [
        ('full_name', {'category'}),
        ('email', set()),
        ('phone_number', set()),
        ('age', {'numeric'}),
    ]
    for name, tags in standard_tags:
        assert updated.columns[name].semantic_tags == tags

    # Verify signup date column was unchanged
    signup_column = updated.columns['signup_date']
    assert signup_column.logical_type == Datetime
    assert signup_column.semantic_tags == {'secondary_time_index'}
def test_select_semantic_tags(sample_df):
    """Check ``select`` by semantic tag for single, multiple and overlapping tags.

    For each selection the number of resulting columns and the presence of
    every expected column name are asserted.
    """
    table = DataTable(sample_df, time_index='signup_date', name='dt_name')
    table = table.set_types(
        semantic_tags={
            'full_name': 'tag1',
            'email': ['tag2'],
            'age': ['numeric', 'tag2'],
            'phone_number': ['tag3', 'tag2'],
            'is_registered': 'category',
        })

    def check_selection(selector, expected_names):
        # Column count first, then membership of each expected name.
        selected = table.select(selector)
        assert len(selected.columns) == len(expected_names)
        for name in expected_names:
            assert name in selected.columns

    # One tag given as a plain string.
    check_selection('numeric', ('age', 'id'))
    check_selection('tag2', ('age', 'phone_number', 'email'))

    # Several tags given as a list.
    check_selection(['numeric', 'time_index'], ('id', 'age', 'signup_date'))

    # 'age' carries both tags but is expected only once in the result.
    check_selection(['numeric', 'tag2'], ('id', 'age', 'phone_number', 'email'))

    check_selection(['category', 'numeric'], ('id', 'is_registered', 'age'))
def test_sets_category_dtype_on_update():
    """Check dtypes after changing a NaturalLanguage column to each listed type.

    For every target type a fresh table is built from object-dtype string
    data, updated, and checked against the target's ``pandas_dtype`` on both
    the column object and the underlying dataframe.
    """
    column_name = 'test_series'
    values = pd.Series(['a', 'b', 'c'], name=column_name).astype('object')

    target_types = [
        Categorical,
        CountryCode,
        Ordinal(order=['a', 'b', 'c']),
        SubRegionCode,
        ZIPCode,
    ]
    for target_type in target_types:
        # Build a new table each time so every update starts from NaturalLanguage.
        table = DataTable(pd.DataFrame(values), logical_types={column_name: NaturalLanguage})
        table = table.set_types(logical_types={column_name: target_type})

        assert table.columns[column_name].logical_type == target_type
        assert table.columns[column_name].dtype == target_type.pandas_dtype
        assert table.to_dataframe()[column_name].dtype == target_type.pandas_dtype
def test_sets_string_dtype_on_update():
    """Check dtypes after changing a Categorical column to each listed type.

    For every target type a fresh table is built from object-dtype string
    data, updated, and checked against the target's ``pandas_dtype`` on both
    the column object and the underlying dataframe.
    """
    column_name = 'test_series'
    values = pd.Series(['a', 'b', 'c'], name=column_name).astype('object')

    target_types = [
        Filepath,
        FullName,
        IPAddress,
        NaturalLanguage,
        PhoneNumber,
        URL,
    ]
    for target_type in target_types:
        # Build a new table each time so every update starts from Categorical.
        table = DataTable(pd.DataFrame(values), logical_types={column_name: Categorical})
        table = table.set_types(logical_types={column_name: target_type})

        assert table.columns[column_name].logical_type == target_type
        assert table.columns[column_name].dtype == target_type.pandas_dtype
        assert table.to_dataframe()[column_name].dtype == target_type.pandas_dtype