def test_select_ltypes_table(sample_df):
    """Verify table-level and column-level details of tables returned by ``select``."""
    dt = DataTable(sample_df, time_index='signup_date', index='id')
    dt = dt.set_types(logical_types={
        'full_name': FullName,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'age': Double,
        'signup_date': Datetime,
    })
    # set_types returns a new DataTable instead of modifying the table in
    # place, so the result must be kept for these tags to take part in the test.
    dt = dt.set_types(semantic_tags={
        'full_name': ['new_tag', 'tag2'],
        'age': 'numeric',
    })

    # A selection without the index columns has no index or time index
    dt_no_indices = dt.select('phone_number')
    assert dt_no_indices.index is None
    assert dt_no_indices.time_index is None

    # A selection that includes the index columns keeps both settings
    dt_with_indices = dt.select(['Datetime', 'Integer'])
    assert dt_with_indices.index == 'id'
    assert dt_with_indices.time_index == 'signup_date'

    # The selected column must match the column of the source table
    dt_values = dt.select(['FullName'])
    assert dt_values.name == dt.name
    selected_col = dt_values.columns['full_name']
    source_col = dt.columns['full_name']
    assert source_col.logical_type == selected_col.logical_type
    assert to_pandas(source_col.to_series()).equals(to_pandas(selected_col.to_series()))
    assert source_col.dtype == selected_col.dtype
    assert source_col.semantic_tags == selected_col.semantic_tags
def test_invalid_dtype_casting():
    """Verify that a TypeError with a descriptive message is raised on failed casts.

    Every expected message is passed through ``re.escape`` because
    ``pytest.raises(match=...)`` treats its argument as a regular expression and
    the messages contain regex metacharacters (``.``, ``[``, ``]``).
    """
    column_name = 'test_series'

    # Cannot cast a column with pd.NA to Double
    series = pd.Series([1.1, pd.NA, 3], name=column_name)
    ltypes = {
        column_name: Double,
    }
    err_msg = 'Error converting datatype for column test_series from type object to type ' \
        'float64. Please confirm the underlying data is consistent with logical type Double.'
    with pytest.raises(TypeError, match=re.escape(err_msg)):
        DataTable(pd.DataFrame(series), logical_types=ltypes)

    # Cannot cast Datetime to Double
    series = pd.Series(['2020-01-01', '2020-01-02', '2020-01-03'],
                       name=column_name)
    ltypes = {
        column_name: Datetime,
    }
    dt = DataTable(pd.DataFrame(series), logical_types=ltypes)
    err_msg = 'Error converting datatype for column test_series from type datetime64[ns] to type ' \
        'float64. Please confirm the underlying data is consistent with logical type Double.'
    with pytest.raises(TypeError, match=re.escape(err_msg)):
        dt.set_types(logical_types={column_name: Double})

    # Cannot cast invalid strings to integers
    series = pd.Series(['1', 'two', '3'], name=column_name)
    ltypes = {
        column_name: Integer,
    }
    err_msg = 'Error converting datatype for column test_series from type object to type ' \
        'Int64. Please confirm the underlying data is consistent with logical type Integer.'
    with pytest.raises(TypeError, match=re.escape(err_msg)):
        DataTable(pd.DataFrame(series), logical_types=ltypes)
def test_select_semantic_tags_no_match(sample_df):
    """Verify ``select`` results when some or all selectors match no column."""
    dt = DataTable(sample_df, time_index='signup_date', index='id', name='dt_name')

    logical_types = {
        'full_name': FullName,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'signup_date': Datetime(datetime_format='%Y-%m-%d'),
    }
    dt = dt.set_types(logical_types=logical_types)

    semantic_tags = {
        'full_name': ['new_tag', 'tag2'],
        'age': 'numeric',
        'signup_date': 'date_of_birth',
        'email': 'tag2'
    }
    dt = dt.set_types(semantic_tags=semantic_tags)

    # A selector that matches nothing yields a table without columns
    dt_no_match = dt.select(['doesnt_exist'])
    assert len(dt_no_match.columns) == 0

    # Mix of unmatched and matched selectors
    selectors = ['doesnt_exist', 'boolean', 'category', PhoneNumber]
    dt_multiple_unused = dt.select(selectors)
    assert len(dt_multiple_unused.columns) == 2

    # Mix of tags and logical types, some of which match nothing
    selectors = ['date_of_birth', 'doesnt_exist', ZIPCode, Integer]
    dt_unused_ltype = dt.select(selectors)
    assert len(dt_unused_ltype.columns) == 3
def test_set_logical_types_invalid_data(sample_df):
    """Verify the errors raised by ``set_types`` for bad logical type input."""
    dt = DataTable(sample_df)

    # Column name that is not part of the dataframe
    missing_column_msg = re.escape(
        "logical_types contains columns that are not present in dataframe: ['birthday']"
    )
    with pytest.raises(LookupError, match=missing_column_msg):
        dt.set_types(logical_types={'birthday': Double})

    # Value that is not a logical type
    with pytest.raises(TypeError, match="Invalid logical type specified for 'age'"):
        dt.set_types(logical_types={'age': int})
def test_set_semantic_tags_with_index(sample_df):
    """Verify how the 'index' tag is treated when tags are set on the index column."""
    dt = DataTable(sample_df, index='id', use_standard_tags=False)
    assert dt.columns['id'].semantic_tags == {'index'}

    extra_tags = {'id': 'new_tag'}

    # By default the 'index' tag is kept alongside the new tag
    dt_retained = dt.set_types(semantic_tags=extra_tags)
    assert dt_retained.columns['id'].semantic_tags == {'index', 'new_tag'}

    # With retain_index_tags=False only the new tag remains
    dt_dropped = dt_retained.set_types(semantic_tags=extra_tags,
                                       retain_index_tags=False)
    assert dt_dropped.columns['id'].semantic_tags == {'new_tag'}
def test_set_semantic_tags_with_time_index(sample_df):
    """Verify how the 'time_index' tag is treated when tags are set on that column."""
    dt = DataTable(sample_df, time_index='signup_date', use_standard_tags=False)
    assert dt.columns['signup_date'].semantic_tags == {'time_index'}

    extra_tags = {'signup_date': 'new_tag'}

    # By default the 'time_index' tag is kept alongside the new tag
    dt_retained = dt.set_types(semantic_tags=extra_tags)
    assert dt_retained.columns['signup_date'].semantic_tags == {'time_index', 'new_tag'}

    # With retain_index_tags=False only the new tag remains
    dt_dropped = dt_retained.set_types(semantic_tags=extra_tags,
                                       retain_index_tags=False)
    assert dt_dropped.columns['signup_date'].semantic_tags == {'new_tag'}
def test_sets_object_dtype_on_update(latlong_df):
    """Verify dtypes after changing each column from NaturalLanguage to LatLong."""
    expected_dtype_owner = LatLong
    for name in latlong_df.columns:
        single_column_df = latlong_df.loc[:, [name]]
        dt = DataTable(single_column_df, logical_types={name: NaturalLanguage})

        updated = dt.set_types(logical_types={name: LatLong})

        assert updated.columns[name].logical_type == LatLong
        assert updated.columns[name].dtype == expected_dtype_owner.pandas_dtype
        underlying = updated.to_dataframe()
        assert underlying[name].dtype == expected_dtype_owner.pandas_dtype
def test_sets_float64_dtype_on_update():
    """Verify dtypes after changing an Integer column to Double."""
    column_name = 'test_series'
    series = pd.Series([0, 1, 0], name=column_name).astype('object')

    dt = DataTable(pd.DataFrame(series), logical_types={column_name: Integer})
    updated = dt.set_types(logical_types={column_name: Double})

    # Both the column metadata and the underlying data must use Double's dtype
    assert updated.columns[column_name].logical_type == Double
    assert updated.columns[column_name].dtype == Double.pandas_dtype
    underlying = updated.to_dataframe()
    assert underlying[column_name].dtype == Double.pandas_dtype
def test_select_ltypes_mixed(sample_df):
    """Verify ``select`` with a mix of string and class logical type selectors."""
    logical_types = {
        'full_name': FullName,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'age': Double,
        'signup_date': Datetime,
    }
    dt = DataTable(sample_df).set_types(logical_types=logical_types)

    selectors = ['FullName', 'email_address', Double]
    selected_columns = dt.select(selectors).columns
    assert len(selected_columns) == 3
    assert 'phone_number' not in selected_columns
def test_sets_datetime_dtype_on_update():
    """Verify dtypes after changing a NaturalLanguage column to Datetime."""
    column_name = 'test_series'
    date_strings = ['2020-01-01', '2020-01-02', '2020-01-03']
    series = pd.Series(date_strings, name=column_name).astype('object')

    dt = DataTable(pd.DataFrame(series),
                   logical_types={column_name: NaturalLanguage})
    updated = dt.set_types(logical_types={column_name: Datetime})

    # Both the column metadata and the underlying data must use Datetime's dtype
    assert updated.columns[column_name].logical_type == Datetime
    assert updated.columns[column_name].dtype == Datetime.pandas_dtype
    underlying = updated.to_dataframe()
    assert underlying[column_name].dtype == Datetime.pandas_dtype
def test_select_ltypes_no_match_and_all(sample_df):
    """Verify ``select`` for logical types matching no columns and for all types."""
    logical_types = {
        'full_name': FullName,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'age': Double,
        'signup_date': Datetime,
    }
    dt = DataTable(sample_df).set_types(logical_types=logical_types)

    # No column uses ZIPCode
    dt_no_match = dt.select(ZIPCode)
    assert len(dt_no_match.columns) == 0

    # Only the PhoneNumber selector contributes a column
    dt_one_match = dt.select(['ZIPCode', PhoneNumber])
    assert len(dt_one_match.columns) == 1

    # Selecting every registered type returns every column
    dt_all_types = dt.select(ww.type_system.registered_types)
    assert len(dt_all_types.columns) == len(dt.columns)
    selected_df = dt_all_types.to_dataframe()
    assert len(selected_df.columns) == len(dt.to_dataframe().columns)
def test_sets_int64_dtype_on_update():
    """Verify dtypes after changing a Double column to each listed logical type."""
    column_name = 'test_series'
    series = pd.Series([1.0, 2.0, 1.0], name=column_name).astype('object')
    target_types = [Integer]

    for target in target_types:
        # Start from a fresh Double-typed table for every target type
        dt = DataTable(pd.DataFrame(series), logical_types={column_name: Double})
        updated = dt.set_types(logical_types={column_name: target})

        assert updated.columns[column_name].logical_type == target
        assert updated.columns[column_name].dtype == target.pandas_dtype
        underlying = updated.to_dataframe()
        assert underlying[column_name].dtype == target.pandas_dtype
def test_select_ltypes_objects(sample_df):
    """Verify ``select`` when logical types are given as classes."""
    logical_types = {
        'full_name': FullName,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'age': Double,
        'signup_date': Datetime,
    }
    dt = DataTable(sample_df).set_types(logical_types=logical_types)

    # Several logical type classes at once
    requested = [FullName, EmailAddress, Double, Boolean, Datetime]
    multiple_columns = dt.select(requested).columns
    assert len(multiple_columns) == 5
    for excluded in ('phone_number', 'id'):
        assert excluded not in multiple_columns

    # A single logical type class, not wrapped in a list
    single_columns = dt.select(FullName).columns
    assert len(single_columns) == 1
# Example #14
# 0
def test_underlying_index_on_update(sample_df):
    """Verify the underlying dataframe index type after table-modifying operations."""
    if dd and isinstance(sample_df, dd.DataFrame):
        pytest.xfail('Setting underlying index is not supported with Dask input')
    if ks and isinstance(sample_df, ks.DataFrame):
        pytest.xfail('Setting underlying index is not supported with Koalas input')

    def assert_index_type(table, expected_type):
        # Exact type comparison: a subclass of expected_type must not pass
        assert type(table._dataframe.index) == expected_type
        assert type(table.to_dataframe().index) == expected_type

    dt = DataTable(sample_df.copy(), index='id')

    # Replacing the data keeps an index built from the 'id' values
    dt.update_dataframe(sample_df.tail(2))
    assert (dt._dataframe.index == [2, 3]).all()
    assert dt._dataframe.index.name is None
    assert_index_type(dt, pd.Int64Index)

    # Operations that return a new table
    assert_index_type(dt.iloc[[0, 1]], pd.Index)
    assert_index_type(dt.select(dt.index), pd.Int64Index)
    assert_index_type(dt[['age']], pd.Int64Index)
    assert_index_type(dt.drop(dt.index), pd.RangeIndex)
    assert_index_type(dt.reset_semantic_tags(retain_index_tags=False),
                      pd.RangeIndex)
    assert_index_type(dt.set_types(retain_index_tags=False,
                                   semantic_tags={'id': 'numeric'}),
                      pd.RangeIndex)

    # pop modifies the table itself, so it is checked last
    dt.pop(dt.index)
    assert_index_type(dt, pd.RangeIndex)
def test_set_semantic_tags(sample_df):
    """Verify ``set_types`` returns a new table with replaced semantic tags."""
    initial_tags = {'full_name': 'tag1', 'age': ['numeric', 'age']}
    expected_initial = {'full_name': {'tag1'}, 'age': {'numeric', 'age'}}
    dt = DataTable(sample_df, semantic_tags=initial_tags)
    for name in ('full_name', 'age'):
        assert dt.columns[name].semantic_tags == expected_initial[name]

    replacement_tags = {
        'full_name': ['new_tag'],
        'age': 'numeric',
    }
    new_dt = dt.set_types(semantic_tags=replacement_tags)

    # Verify original tags were not changed
    assert dt.columns['full_name'].semantic_tags == {'tag1'}
    assert dt.columns['age'].semantic_tags == {'numeric', 'age'}

    # Verify a separate table carries the new tags
    assert new_dt is not dt
    assert new_dt.columns['full_name'].semantic_tags == {'new_tag'}
    assert new_dt.columns['age'].semantic_tags == {'numeric'}
def test_set_logical_types(sample_df):
    """Verify ``set_types`` returns a new table with updated logical types."""
    initial_tags = {
        'full_name': 'tag1',
        'email': ['tag2'],
        'phone_number': ['tag3', 'tag2'],
        'signup_date': {'secondary_time_index'},
    }
    dt = DataTable(sample_df, semantic_tags=initial_tags, use_standard_tags=True)

    updated_types = {
        'full_name': Categorical,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'age': Double,
    }
    new_dt = dt.set_types(logical_types=updated_types)

    # Verify original types were not changed
    unchanged_types = {
        'full_name': NaturalLanguage,
        'email': NaturalLanguage,
        'phone_number': NaturalLanguage,
        'age': Integer,
        'signup_date': Datetime,
    }
    for name, ltype in unchanged_types.items():
        assert dt.columns[name].logical_type == ltype
    original_name_column = dt.columns['full_name']

    # Verify the new table has the requested types
    assert new_dt is not dt
    for name, ltype in updated_types.items():
        assert new_dt.columns[name].logical_type == ltype

    # Verify new column object was created
    assert new_dt.columns['full_name'] is not original_name_column

    # Verify semantic tags were reset to standard tags
    standard_tags = {
        'full_name': {'category'},
        'email': set(),
        'phone_number': set(),
        'age': {'numeric'},
    }
    for name, tags in standard_tags.items():
        assert new_dt.columns[name].semantic_tags == tags

    # Verify signup date column was unchanged
    signup_column = new_dt.columns['signup_date']
    assert signup_column.logical_type == Datetime
    assert signup_column.semantic_tags == {'secondary_time_index'}
def test_select_semantic_tags(sample_df):
    """Verify ``select`` by semantic tag for single, multiple and overlapping tags."""
    dt = DataTable(sample_df, time_index='signup_date', name='dt_name')
    tags = {
        'full_name': 'tag1',
        'email': ['tag2'],
        'age': ['numeric', 'tag2'],
        'phone_number': ['tag3', 'tag2'],
        'is_registered': 'category',
    }
    dt = dt.set_types(semantic_tags=tags)

    def assert_selected(selected, expected_names):
        # The selection must contain exactly as many columns as expected,
        # and each expected column must be present
        assert len(selected.columns) == len(expected_names)
        for name in expected_names:
            assert name in selected.columns

    # Single tag
    dt_one_match = dt.select('numeric')
    assert_selected(dt_one_match, ['age', 'id'])

    # Single tag shared by several columns
    dt_multiple_matches = dt.select('tag2')
    assert_selected(dt_multiple_matches, ['age', 'phone_number', 'email'])

    # Several tags matching different columns
    dt_multiple_tags = dt.select(['numeric', 'time_index'])
    assert_selected(dt_multiple_tags, ['id', 'age', 'signup_date'])

    # Several tags, some matching the same column
    dt_overlapping_tags = dt.select(['numeric', 'tag2'])
    assert_selected(dt_overlapping_tags, ['id', 'age', 'phone_number', 'email'])

    dt_common_tags = dt.select(['category', 'numeric'])
    assert_selected(dt_common_tags, ['id', 'is_registered', 'age'])
def test_sets_category_dtype_on_update():
    """Verify dtypes after changing a NaturalLanguage column to each listed type."""
    column_name = 'test_series'
    series = pd.Series(['a', 'b', 'c'], name=column_name).astype('object')
    target_types = [
        Categorical,
        CountryCode,
        Ordinal(order=['a', 'b', 'c']),
        SubRegionCode,
        ZIPCode,
    ]

    for target in target_types:
        # Start from a fresh NaturalLanguage-typed table for every target type
        dt = DataTable(pd.DataFrame(series),
                       logical_types={column_name: NaturalLanguage})
        updated = dt.set_types(logical_types={column_name: target})

        assert updated.columns[column_name].logical_type == target
        assert updated.columns[column_name].dtype == target.pandas_dtype
        underlying = updated.to_dataframe()
        assert underlying[column_name].dtype == target.pandas_dtype
def test_sets_string_dtype_on_update():
    """Verify dtypes after changing a Categorical column to each listed type."""
    column_name = 'test_series'
    series = pd.Series(['a', 'b', 'c'], name=column_name).astype('object')
    target_types = [
        Filepath,
        FullName,
        IPAddress,
        NaturalLanguage,
        PhoneNumber,
        URL,
    ]

    for target in target_types:
        # Start from a fresh Categorical-typed table for every target type
        dt = DataTable(pd.DataFrame(series),
                       logical_types={column_name: Categorical})
        updated = dt.set_types(logical_types={column_name: target})

        assert updated.columns[column_name].logical_type == target
        assert updated.columns[column_name].dtype == target.pandas_dtype
        underlying = updated.to_dataframe()
        assert underlying[column_name].dtype == target.pandas_dtype