def test_select_semantic_tags_no_match(sample_df):
    """Check ``select`` when some or all of the selectors match no column.

    Logical types and semantic tags are assigned in two separate
    ``set_types`` calls. Selections that mix the unused ``'doesnt_exist'``
    selector with other tags and logical types are then checked by
    counting the columns they return.
    """
    logical_types = {
        'full_name': FullName,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'signup_date': Datetime(datetime_format='%Y-%m-%d'),
    }
    semantic_tags = {
        'full_name': ['new_tag', 'tag2'],
        'age': 'numeric',
        'signup_date': 'date_of_birth',
        'email': 'tag2',
    }

    table = DataTable(
        sample_df,
        time_index='signup_date',
        index='id',
        name='dt_name',
    )
    # Types first, tags second: kept as two calls on purpose.
    table = table.set_types(logical_types=logical_types)
    table = table.set_types(semantic_tags=semantic_tags)

    # A selector that matches nothing produces an empty selection.
    no_match = table.select(['doesnt_exist'])
    assert len(no_match.columns) == 0

    # Unused selectors mixed with tags and a logical type.
    unused_with_tags = table.select(
        ['doesnt_exist', 'boolean', 'category', PhoneNumber])
    assert len(unused_with_tags.columns) == 2

    # Unused tag and unused logical type mixed with selectors that match.
    unused_with_ltypes = table.select(
        ['date_of_birth', 'doesnt_exist', ZIPCode, Integer])
    assert len(unused_with_ltypes.columns) == 3
def test_select_ltypes_table(sample_df):
    """Check table-level properties of the result of ``DataTable.select``.

    Verifies that:
    * ``index`` and ``time_index`` are ``None`` on a selection that does
      not include those columns and are kept on one that does;
    * the table name carries over to the selection;
    * a selected column has the same logical type, dtype, semantic tags
      and values as the corresponding column of the source table.
    """
    dt = DataTable(sample_df, time_index='signup_date', index='id')
    dt = dt.set_types(logical_types={
        'full_name': FullName,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'age': Double,
        'signup_date': Datetime,
    })
    # The result of ``set_types`` has to be rebound, as every other call in
    # this module does; otherwise the custom tags below are discarded and
    # the semantic-tag comparison at the end never sees them.
    dt = dt.set_types(semantic_tags={
        'full_name': ['new_tag', 'tag2'],
        'age': 'numeric',
    })

    dt_no_indices = dt.select('phone_number')
    assert dt_no_indices.index is None
    assert dt_no_indices.time_index is None

    dt_with_indices = dt.select(['Datetime', 'Integer'])
    assert dt_with_indices.index == 'id'
    assert dt_with_indices.time_index == 'signup_date'

    dt_values = dt.select(['FullName'])
    assert dt_values.name == dt.name
    # Compare the column in the selection against the one in the source table.
    selected_col = dt_values.columns['full_name']
    original_col = dt.columns['full_name']
    assert original_col.logical_type == selected_col.logical_type
    assert to_pandas(original_col.to_series()).equals(
        to_pandas(selected_col.to_series()))
    assert original_col.dtype == selected_col.dtype
    assert original_col.semantic_tags == selected_col.semantic_tags
def test_select_ltypes_no_match_and_all(sample_df):
    """Check ``select`` for unmatched logical types and for all of them.

    Selecting a logical type that no column uses yields no columns, an
    unmatched type next to a matched one yields only the match, and
    selecting every registered type yields as many columns as the table.
    """
    logical_types = {
        'full_name': FullName,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'age': Double,
        'signup_date': Datetime,
    }
    table = DataTable(sample_df).set_types(logical_types=logical_types)

    # No column is typed as ZIPCode.
    zip_only = table.select(ZIPCode)
    assert len(zip_only.columns) == 0

    # The unmatched string selector is ignored; PhoneNumber still matches.
    zip_and_phone = table.select(['ZIPCode', PhoneNumber])
    assert len(zip_and_phone.columns) == 1

    # Selecting every registered logical type.
    everything = table.select(ww.type_system.registered_types)
    assert len(everything.columns) == len(table.columns)
    selected_frame = everything.to_dataframe()
    source_frame = table.to_dataframe()
    assert len(selected_frame.columns) == len(source_frame.columns)
def test_select_ltypes_objects(sample_df):
    """Check ``select`` when logical types are passed as class objects.

    Covers both a list of logical type classes and a single class.
    """
    logical_types = {
        'full_name': FullName,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'age': Double,
        'signup_date': Datetime,
    }
    table = DataTable(sample_df).set_types(logical_types=logical_types)

    # Several logical type classes at once.
    requested = [FullName, EmailAddress, Double, Boolean, Datetime]
    several = table.select(requested)
    assert len(several.columns) == 5
    for excluded in ('phone_number', 'id'):
        assert excluded not in several.columns

    # A single logical type class, not wrapped in a list.
    single = table.select(FullName)
    assert len(single.columns) == 1
def test_select_semantic_tags(sample_df):
    """Check ``select`` by semantic tag, for single tags and lists of tags.

    Each case pairs a selector with the column names expected in the
    result. The selection must contain exactly that many columns and
    every listed name.
    """
    table = DataTable(sample_df, time_index='signup_date', name='dt_name')
    table = table.set_types(
        semantic_tags={
            'full_name': 'tag1',
            'email': ['tag2'],
            'age': ['numeric', 'tag2'],
            'phone_number': ['tag3', 'tag2'],
            'is_registered': 'category',
        })

    cases = (
        # one custom/standard tag
        ('numeric', ('age', 'id')),
        # one tag carried by several columns
        ('tag2', ('age', 'phone_number', 'email')),
        # several tags, including the time index tag
        (['numeric', 'time_index'], ('id', 'age', 'signup_date')),
        # tags that overlap on the same column
        (['numeric', 'tag2'], ('id', 'age', 'phone_number', 'email')),
        # several common tags
        (['category', 'numeric'], ('id', 'is_registered', 'age')),
    )

    for selector, expected_names in cases:
        selection = table.select(selector)
        assert len(selection.columns) == len(expected_names)
        for column_name in expected_names:
            assert column_name in selection.columns
def test_select_ltypes_mixed(sample_df):
    """Check ``select`` with a mix of logical type spellings in one list.

    The selector list combines a class-name string (``'FullName'``), a
    snake-case string (``'email_address'``) and a class object (``Double``).
    """
    logical_types = {
        'full_name': FullName,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'age': Double,
        'signup_date': Datetime,
    }
    table = DataTable(sample_df).set_types(logical_types=logical_types)

    mixed_selectors = ['FullName', 'email_address', Double]
    mixed = table.select(mixed_selectors)
    assert len(mixed.columns) == 3
    assert 'phone_number' not in mixed.columns
# Example no. 7
# 0
def test_underlying_index_on_update(sample_df):
    """Check the underlying DataFrame index type after table operations.

    The test is marked as an expected failure for Dask and Koalas input.
    After ``update_dataframe``, it asserts the exact index class of both
    ``_dataframe`` and ``to_dataframe()`` for the table itself and for the
    results of ``iloc``, ``select``, ``__getitem__``, ``drop``,
    ``reset_semantic_tags``, ``set_types`` and ``pop``.
    """
    unsupported_inputs = (
        (dd, 'Setting underlying index is not supported with Dask input'),
        (ks, 'Setting underlying index is not supported with Koalas input'),
    )
    for library, reason in unsupported_inputs:
        if library and isinstance(sample_df, library.DataFrame):
            pytest.xfail(reason)

    def check_index_type(table, expected_type):
        # Exact class match on purpose: a subclass must not satisfy the check.
        assert type(table._dataframe.index) is expected_type
        assert type(table.to_dataframe().index) is expected_type

    dt = DataTable(sample_df.copy(), index='id')

    dt.update_dataframe(sample_df.tail(2))
    assert (dt._dataframe.index == [2, 3]).all()
    assert dt._dataframe.index.name is None
    check_index_type(dt, pd.Int64Index)

    # Operations whose result still contains the index column.
    check_index_type(dt.iloc[[0, 1]], pd.Index)
    check_index_type(dt.select(dt.index), pd.Int64Index)
    check_index_type(dt[['age']], pd.Int64Index)

    # Operations that remove the index column or its index tag.
    check_index_type(dt.drop(dt.index), pd.RangeIndex)
    check_index_type(
        dt.reset_semantic_tags(retain_index_tags=False), pd.RangeIndex)
    check_index_type(
        dt.set_types(retain_index_tags=False,
                     semantic_tags={'id': 'numeric'}),
        pd.RangeIndex)

    # ``pop`` changes ``dt`` itself, so it is checked last.
    dt.pop(dt.index)
    check_index_type(dt, pd.RangeIndex)