def test_set_logical_types(sample_column_names, sample_inferred_logical_types):
    """Verify ``set_types`` applies new logical types and resets tags on changed columns."""
    starting_tags = {
        'full_name': 'tag1',
        'email': ['tag2'],
        'phone_number': ['tag3', 'tag2'],
        'signup_date': {'secondary_time_index'},
    }
    schema = TableSchema(sample_column_names,
                         sample_inferred_logical_types,
                         semantic_tags=starting_tags,
                         use_standard_tags=True)

    updated_types = {
        'full_name': Categorical,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'age': Double,
    }
    # Pass a copy so the expectations below never depend on what
    # set_types does with its argument.
    schema.set_types(logical_types=dict(updated_types))

    for col_name, expected_type in updated_types.items():
        assert schema.logical_types[col_name] == expected_type

    # Columns whose logical type changed should carry only their standard tags
    tags_after_update = {
        'full_name': {'category'},
        'email': set(),
        'phone_number': set(),
        'age': {'numeric'},
    }
    for col_name, expected_tags in tags_after_update.items():
        assert schema.semantic_tags[col_name] == expected_tags

    # The signup date column was not part of the update and must be untouched
    assert schema.logical_types['signup_date'] == Datetime
    assert schema.semantic_tags['signup_date'] == {'secondary_time_index'}
# Example #2
# 0
def test_schema_with_numeric_time_index(sample_column_names,
                                        sample_inferred_logical_types):
    """Verify a numeric ``signup_date`` column can be the time index set at init."""
    # The same expectations hold for an Integer and for a Double time index
    for numeric_type in (Integer, Double):
        overridden_types = {
            **sample_inferred_logical_types,
            'signup_date': numeric_type,
        }
        schema = TableSchema(sample_column_names,
                             logical_types=overridden_types,
                             time_index='signup_date',
                             use_standard_tags=True)
        time_col = schema.columns['signup_date']
        assert schema.time_index == 'signup_date'
        assert time_col.logical_type == numeric_type
        assert time_col.semantic_tags == {'time_index', 'numeric'}
def test_schema_rename(sample_column_names, sample_inferred_logical_types):
    """Verify ``rename`` returns a renamed copy and leaves the source schema intact.

    Checks that the renamed schema has the new column name, keeps the table
    metadata and column descriptions, preserves the renamed column's logical
    type and semantic tags, and that swapping two names twice round-trips to
    a schema equal to the original.
    """
    table_metadata = {'table_info': 'this is text'}
    id_description = 'the id of the row'
    schema = TableSchema(sample_column_names,
                         sample_inferred_logical_types,
                         index='id',
                         time_index='signup_date',
                         table_metadata=table_metadata,
                         column_descriptions={'id': id_description})
    original_schema = schema._get_subset_schema(list(schema.columns.keys()))

    renamed_schema = schema.rename({'age': 'birthday'})

    # Confirm original schema hasn't changed
    assert schema == original_schema

    assert 'age' not in renamed_schema.columns
    assert 'birthday' in renamed_schema.columns

    # Confirm that metadata and descriptions are present on the renamed schema.
    # The description is also checked on the source schema, as the test did
    # originally, so neither side can lose it unnoticed.
    assert renamed_schema.metadata == table_metadata
    assert renamed_schema.columns['id'].description == id_description
    assert schema.columns['id'].description == id_description

    old_col = schema.columns['age']
    new_col = renamed_schema.columns['birthday']
    assert old_col.logical_type == new_col.logical_type
    assert old_col.semantic_tags == new_col.semantic_tags

    # Swapping two names and then swapping them back must round-trip
    swapped_schema = schema.rename({'age': 'full_name', 'full_name': 'age'})
    swapped_back_schema = swapped_schema.rename({
        'age': 'full_name',
        'full_name': 'age'
    })
    assert swapped_back_schema == schema
# Example #4
# 0
def test_validation_methods_called(mock_validate_params, mock_check_index,
                                   mock_check_time_index,
                                   mock_validate_not_setting_index,
                                   sample_column_names,
                                   sample_inferred_logical_types):
    """Verify the injected validation mocks are called only when ``validate=True``.

    Also checks that the validated and non-validated schemas compare equal.
    """
    validators = (
        mock_validate_params,
        mock_check_index,
        mock_check_time_index,
        mock_validate_not_setting_index,
    )
    init_kwargs = {'index': 'id', 'time_index': 'signup_date'}

    # Nothing has been constructed yet, so no mock may have fired
    for validator in validators:
        assert not validator.called

    not_validated_schema = TableSchema(sample_column_names,
                                       sample_inferred_logical_types,
                                       validate=False,
                                       **init_kwargs)
    for validator in validators:
        assert not validator.called

    validated_schema = TableSchema(sample_column_names,
                                   sample_inferred_logical_types,
                                   validate=True,
                                   **init_kwargs)
    for validator in validators:
        assert validator.called

    assert validated_schema == not_validated_schema
def test_set_index_errors(sample_column_names, sample_inferred_logical_types):
    """Verify ``set_index`` raises ``LookupError`` for a column that is not in the schema."""
    schema = TableSchema(sample_column_names, sample_inferred_logical_types)
    expected_pattern = re.escape(
        "Specified index column `testing` not found in TableSchema.")

    with pytest.raises(LookupError, match=expected_pattern):
        schema.set_index('testing')
def test_semantic_tag_errors(sample_column_names,
                             sample_inferred_logical_types):
    """Verify invalid ``semantic_tags`` values for a column raise ``TypeError`` at init."""
    bad_container_msg = "semantic_tags for id must be a string, set or list"
    bad_element_msg = "semantic_tags for id must contain only strings"

    # (tags supplied for "id", message expected in the TypeError)
    failing_cases = [
        (int, bad_container_msg),
        ({"index": {}, "time_index": {}}, bad_container_msg),
        (["index", 1], bad_element_msg),
    ]

    for id_tags, expected_message in failing_cases:
        with pytest.raises(TypeError, match=expected_message):
            TableSchema(
                sample_column_names,
                sample_inferred_logical_types,
                semantic_tags={"id": id_tags},
            )
def test_schema_adds_standard_semantic_tags(sample_column_names,
                                            sample_inferred_logical_types):
    """Verify standard tags appear only when ``use_standard_tags`` is enabled."""
    # (use_standard_tags, expected tags for "id", expected tags for "age")
    scenarios = [
        (True, {"category"}, {"numeric"}),
        (False, set(), set()),
    ]

    for standard_tags_on, id_tags, age_tags in scenarios:
        schema = TableSchema(
            sample_column_names,
            logical_types={
                **sample_inferred_logical_types,
                "id": Categorical,
            },
            use_standard_tags=standard_tags_on,
            name="schema",
        )

        assert schema.semantic_tags["id"] == id_tags
        assert schema.semantic_tags["age"] == age_tags
def test_schema_with_numeric_time_index(sample_column_names,
                                        sample_inferred_logical_types):
    """Verify a numeric ``signup_date`` column is accepted as the time index at init.

    The stored logical type is checked with ``isinstance`` against the type
    that was requested.
    """
    for requested_type in (Integer, Double):
        type_overrides = dict(sample_inferred_logical_types)
        type_overrides["signup_date"] = requested_type

        schema = TableSchema(
            sample_column_names,
            logical_types=type_overrides,
            time_index="signup_date",
            use_standard_tags=True,
        )

        assert schema.time_index == "signup_date"
        signup_col = schema.columns["signup_date"]
        assert isinstance(signup_col.logical_type, requested_type)
        assert signup_col.semantic_tags == {"time_index", "numeric"}
# Example #9
# 0
def test_set_logical_types_invalid_data(sample_column_names,
                                        sample_inferred_logical_types):
    """Verify ``set_types`` rejects unknown columns and invalid type or tag values.

    Every expected message is passed through ``re.escape`` because
    ``pytest.raises(match=...)`` treats its argument as a regular expression;
    without escaping, characters such as ``.`` would match any character and
    the check would be looser than the literal message.
    """
    schema = TableSchema(sample_column_names, sample_inferred_logical_types)

    # A column that does not exist in the schema
    error_message = re.escape(
        "logical_types contains columns that are not present in TableSchema: ['birthday']"
    )
    with pytest.raises(ColumnNotPresentError, match=error_message):
        schema.set_types(logical_types={"birthday": Double})

    # A string that is rejected as a logical type
    error_message = re.escape(
        "Logical Types must be of the LogicalType class "
        "and registered in Woodwork's type system. "
        "Double does not meet that criteria.")
    with pytest.raises(TypeError, match=error_message):
        schema.set_types(logical_types={"id": "Double"})

    # A plain Python type instead of a LogicalType
    error_message = re.escape(
        "Logical Types must be of the LogicalType class "
        "and registered in Woodwork's type system. "
        "<class 'int'> does not meet that criteria.")
    with pytest.raises(TypeError, match=error_message):
        schema.set_types(logical_types={"age": int})

    # Semantic tags must be a string, set or list
    error_message = re.escape(
        "semantic_tags for full_name must be a string, set or list")
    with pytest.raises(TypeError, match=error_message):
        schema.set_types(semantic_tags={"full_name": None})
# Example #10
# 0
def test_schema_adds_standard_semantic_tags(sample_column_names,
                                            sample_inferred_logical_types):
    """Verify ``use_standard_tags`` controls whether standard tags are attached."""
    tagged_schema = TableSchema(sample_column_names,
                                logical_types={
                                    **sample_inferred_logical_types,
                                    'id': Categorical,
                                },
                                use_standard_tags=True,
                                name='schema')
    tagged = tagged_schema.semantic_tags
    assert tagged['id'] == {'category'}
    assert tagged['age'] == {'numeric'}

    # With standard tags disabled the same columns carry no tags at all
    untagged_schema = TableSchema(sample_column_names,
                                  logical_types={
                                      **sample_inferred_logical_types,
                                      'id': Categorical,
                                  },
                                  name='schema',
                                  use_standard_tags=False)
    untagged = untagged_schema.semantic_tags
    assert untagged['id'] == set()
    assert untagged['age'] == set()
def test_schema_repr_empty():
    """Verify the text and HTML representations of a schema with no columns."""
    empty_schema = TableSchema([], {})

    expected_text = 'Empty DataFrame\nColumns: [Logical Type, Semantic Tag(s)]\nIndex: []'
    assert repr(empty_schema) == expected_text

    expected_html = '<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Logical Type</th>\n      <th>Semantic Tag(s)</th>\n    </tr>\n    <tr>\n      <th>Column</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n  </tbody>\n</table>'
    assert empty_schema._repr_html_() == expected_html
def test_reset_semantic_tags_invalid_column(sample_column_names,
                                            sample_inferred_logical_types):
    """Verify ``reset_semantic_tags`` raises ``LookupError`` for an unknown column."""
    schema = TableSchema(sample_column_names, sample_inferred_logical_types)
    expected_pattern = "Input contains columns that are not present in dataframe: 'invalid_column'"

    with pytest.raises(LookupError, match=expected_pattern):
        schema.reset_semantic_tags('invalid_column')
# Example #13
# 0
def test_reset_semantic_tags_invalid_column(sample_column_names,
                                            sample_inferred_logical_types):
    """Verify resetting tags on an unknown column raises ``ColumnNotPresentError``."""
    literal_message = "Column(s) '['invalid_column']' not found in DataFrame"
    schema = TableSchema(sample_column_names, sample_inferred_logical_types)

    # The message contains regex metacharacters, so it is escaped before matching
    with pytest.raises(ColumnNotPresentError, match=re.escape(literal_message)):
        schema.reset_semantic_tags("invalid_column")
def test_column_schema_metadata(sample_column_names,
                                sample_inferred_logical_types):
    """Verify column metadata is empty by default and stored when supplied at init."""
    bare_schema = TableSchema(sample_column_names, sample_inferred_logical_types)
    assert bare_schema.columns['id'].metadata == {}

    id_metadata = {'metadata_field': [1, 2, 3], 'created_by': 'user0'}
    annotated_schema = TableSchema(sample_column_names,
                                   sample_inferred_logical_types,
                                   column_metadata={'id': id_metadata})
    assert annotated_schema.columns['id'].metadata == id_metadata
def test_index_replacing_standard_tags(sample_column_names,
                                       sample_inferred_logical_types):
    """Verify the ``id`` column is tagged ``index`` instead of ``numeric`` once it is the index."""
    without_index = TableSchema(sample_column_names,
                                sample_inferred_logical_types,
                                use_standard_tags=True)
    id_tags = without_index.columns["id"].semantic_tags
    assert id_tags == {"numeric"}

    with_index = TableSchema(sample_column_names,
                             sample_inferred_logical_types,
                             index="id")
    id_tags = with_index.columns["id"].semantic_tags
    assert id_tags == {"index"}
# Example #16
# 0
def test_index_replacing_standard_tags(sample_column_names,
                                       sample_inferred_logical_types):
    """Verify that making ``id`` the index leaves it with only the ``index`` tag."""
    # (extra constructor keywords, tags expected on the "id" column)
    scenarios = [
        ({'use_standard_tags': True}, {'numeric'}),
        ({'index': 'id'}, {'index'}),
    ]

    for extra_kwargs, expected_tags in scenarios:
        schema = TableSchema(sample_column_names,
                             sample_inferred_logical_types,
                             **extra_kwargs)
        assert schema.columns['id'].semantic_tags == expected_tags
# Example #17
# 0
def test_schema_repr(small_df):
    """Verify the text and HTML representations of a single-column schema."""
    column_names = list(small_df.columns)
    schema = TableSchema(column_names,
                         logical_types={"sample_datetime_series": Datetime})

    expected_text = "                       Logical Type Semantic Tag(s)\nColumn                                             \nsample_datetime_series     Datetime              []"
    assert repr(schema) == expected_text

    expected_html = '<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Logical Type</th>\n      <th>Semantic Tag(s)</th>\n    </tr>\n    <tr>\n      <th>Column</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>sample_datetime_series</th>\n      <td>Datetime</td>\n      <td>[]</td>\n    </tr>\n  </tbody>\n</table>'
    assert schema._repr_html_() == expected_html
def test_reset_all_semantic_tags(sample_column_names,
                                 sample_inferred_logical_types):
    """Verify ``reset_semantic_tags()`` with no arguments drops custom tags on all columns."""
    schema = TableSchema(sample_column_names,
                         sample_inferred_logical_types,
                         semantic_tags={'full_name': 'tag1', 'age': 'age'},
                         use_standard_tags=True)

    schema.reset_semantic_tags()

    # Only the standard tag remains on "age"; "full_name" ends up with none
    tags_after_reset = schema.semantic_tags
    assert tags_after_reset['full_name'] == set()
    assert tags_after_reset['age'] == {'numeric'}
# Example #19
# 0
def test_reset_all_semantic_tags(sample_column_names,
                                 sample_inferred_logical_types):
    """Verify resetting every column's tags leaves only the standard tags behind."""
    custom_tags = {"full_name": "tag1", "age": "age"}
    schema = TableSchema(
        sample_column_names,
        sample_inferred_logical_types,
        semantic_tags=custom_tags,
        use_standard_tags=True,
    )
    schema.reset_semantic_tags()

    expected_after_reset = {"full_name": set(), "age": {"numeric"}}
    for col_name, expected_tags in expected_after_reset.items():
        assert schema.semantic_tags[col_name] == expected_tags
# Example #20
# 0
def test_column_schema_metadata(sample_column_names,
                                sample_inferred_logical_types):
    """Verify per-column metadata defaults to ``{}`` and is kept when passed at init."""
    default_schema = TableSchema(sample_column_names,
                                 sample_inferred_logical_types)
    id_column = default_schema.columns["id"]
    assert id_column.metadata == {}

    supplied_metadata = {"metadata_field": [1, 2, 3], "created_by": "user0"}
    schema_with_metadata = TableSchema(
        sample_column_names,
        sample_inferred_logical_types,
        column_metadata={"id": supplied_metadata},
    )
    id_column = schema_with_metadata.columns["id"]
    assert id_column.metadata == supplied_metadata
def test_add_semantic_tags(sample_column_names, sample_inferred_logical_types):
    """Verify ``add_semantic_tags`` merges list, string and set inputs into existing tags."""
    schema = TableSchema(sample_column_names,
                         sample_inferred_logical_types,
                         semantic_tags={
                             'full_name': 'tag1',
                             'age': ['numeric', 'age'],
                         },
                         use_standard_tags=False,
                         index='id')

    # One addition per container kind: list, plain string and set
    schema.add_semantic_tags({
        'full_name': ['list_tag'],
        'age': 'str_tag',
        'id': {'set_tag'},
    })

    merged_tags = schema.semantic_tags
    assert merged_tags['full_name'] == {'tag1', 'list_tag'}
    assert merged_tags['age'] == {'numeric', 'age', 'str_tag'}
    assert merged_tags['id'] == {'set_tag', 'index'}
# Example #22
# 0
def test_filter_schema_overlap_name_and_type(sample_column_names,
                                             sample_inferred_logical_types):
    """Verify ``_filter_cols`` when one string is a column name, a tag and a logical type."""
    plain_schema = TableSchema(sample_column_names,
                               sample_inferred_logical_types)

    # "full_name" selects nothing unless column names are opted in
    assert plain_schema._filter_cols(include="full_name") == []
    assert plain_schema._filter_cols(include="full_name",
                                     col_names=True) == ["full_name"]

    overlapping_types = {
        **sample_inferred_logical_types,
        "full_name": Categorical,
        "age": PersonFullName,
    }
    overlap_schema = TableSchema(
        sample_column_names,
        overlapping_types,
        semantic_tags={"id": "person_full_name"},
    )

    # "person_full_name" is both a tag on "id" and the logical type of "age"
    by_tag_and_type = overlap_schema._filter_cols(include="person_full_name")
    assert set(by_tag_and_type) == {"id", "age"}

    by_all_three = overlap_schema._filter_cols(
        include=["person_full_name", "full_name"], col_names=True)
    assert set(by_all_three) == {"id", "age", "full_name"}
# Example #23
# 0
def test_filter_schema_cols_no_matches(sample_column_names,
                                       sample_inferred_logical_types):
    """Verify ``_filter_cols`` with selectors that match no column.

    Including such a selector yields an empty list; excluding it yields
    every column.
    """
    schema = TableSchema(
        sample_column_names,
        sample_inferred_logical_types,
        time_index="signup_date",
        index="id",
        name="df_name",
    )
    # An unknown string, an empty list and a non-string value
    unmatched_selectors = ("nothing", [], 1)
    every_column = set(sample_column_names)

    for selector in unmatched_selectors:
        assert schema._filter_cols(include=selector) == []

    for selector in unmatched_selectors:
        assert set(schema._filter_cols(exclude=selector)) == every_column
# Example #24
# 0
def test_filter_schema_cols_include(sample_column_names,
                                    sample_inferred_logical_types):
    """Verify ``_filter_cols(include=...)`` by logical type, type name, tag and column name."""
    schema = TableSchema(
        sample_column_names,
        sample_inferred_logical_types,
        time_index="signup_date",
        index="id",
        name="df_name",
        use_standard_tags=True,
    )

    datetime_cols = schema._filter_cols(include=Datetime)
    assert set(datetime_cols) == {"signup_date", "datetime_with_NaT"}

    by_col_name = schema._filter_cols(include="email", col_names=True)
    assert by_col_name == ["email"]

    # A logical type may be given as its string name or as the class itself
    unknown_by_string = schema._filter_cols(include="Unknown")
    unknown_by_type = schema._filter_cols(include=Unknown)
    assert unknown_by_type == unknown_by_string
    assert set(unknown_by_type) == {"full_name"}

    numeric_cols = schema._filter_cols(include="numeric")
    assert set(numeric_cols) == {
        "integer", "double", "double_with_nan", "age", "nullable_integer"
    }

    # Only checks that nothing outside the allowed names is returned
    combined = schema._filter_cols(include=["Unknown", "email"],
                                   col_names=True)
    allowed_names = ["full_name", "phone_number", "email"]
    assert all(col in allowed_names for col in combined)
def test_schema_init_with_col_origins(sample_column_names,
                                      sample_inferred_logical_types):
    """Verify ``column_origins`` accepts a per-column dict or a single string for all columns."""
    per_column_origins = {"age": "base", "signup_date": "engineered"}
    schema = TableSchema(sample_column_names,
                         sample_inferred_logical_types,
                         column_origins=per_column_origins)
    # Columns missing from the dict are expected to have an origin of None
    for col_name, col_schema in schema.columns.items():
        assert col_schema.origin == per_column_origins.get(col_name)

    schema_single_origin = TableSchema(sample_column_names,
                                       sample_inferred_logical_types,
                                       column_origins="base")
    for col_schema in schema_single_origin.columns.values():
        assert col_schema.origin == "base"
def test_filter_schema_cols_exclude(sample_column_names,
                                    sample_inferred_logical_types):
    """Verify ``_filter_cols(exclude=...)`` by logical type, type name, tag and column name."""
    schema = TableSchema(sample_column_names,
                         sample_inferred_logical_types,
                         time_index='signup_date',
                         index='id',
                         name='df_name',
                         use_standard_tags=True)

    without_datetime = schema._filter_cols(exclude=Datetime)
    assert 'signup_date' not in without_datetime

    without_email = schema._filter_cols(exclude='email', col_names=True)
    assert 'email' not in without_email

    # A logical type may be given as its string name or as the class itself
    excluded_by_string = schema._filter_cols(exclude='NaturalLanguage')
    excluded_by_type = schema._filter_cols(exclude=NaturalLanguage)
    assert excluded_by_type == excluded_by_string
    assert set(excluded_by_type) == {'id', 'age', 'signup_date', 'is_registered'}

    without_numeric = schema._filter_cols(exclude='numeric')
    assert 'age' not in without_numeric

    # Only checks that nothing outside the allowed names is returned
    remaining = schema._filter_cols(exclude=['NaturalLanguage', 'email'],
                                    col_names=True)
    allowed_names = ['id', 'age', 'signup_date', 'is_registered']
    assert all(col in allowed_names for col in remaining)
def test_filter_schema_non_string_cols():
    """Verify ``_filter_cols`` works when the column names are integers."""
    types_by_column = {
        0: Integer,
        1: Categorical,
        2: NaturalLanguage,
        3: Double,
    }
    schema = TableSchema(column_names=[0, 1, 2, 3],
                         logical_types=types_by_column,
                         use_standard_tags=True)

    # Mixing a logical type class with a semantic tag string in one selector
    selected_by_type_or_tag = schema._filter_cols(include=[Integer, 'category'])
    assert selected_by_type_or_tag == [0, 1]

    selected_by_name = schema._filter_cols(include=[0, 1], col_names=True)
    assert selected_by_name == [0, 1]
# Example #28
# 0
def test_use_standard_tags_from_dict(sample_column_names,
                                     sample_inferred_logical_types):
    """Verify ``use_standard_tags`` given as a dict, complete or partial.

    A partial dict is expected to behave as if each omitted column were
    mapped to ``False``.
    """
    all_disabled = dict.fromkeys(sample_column_names, False)
    default_schema = TableSchema(sample_column_names,
                                 sample_inferred_logical_types,
                                 use_standard_tags=dict(all_disabled))
    assert default_schema.use_standard_tags == all_disabled

    use_standard_tags = {
        'id': True,
        'full_name': False,
        'email': True,
        'phone_number': True,
        'age': False,
        'signup_date': True,
        'is_registered': False
    }
    full_dict_schema = TableSchema(sample_column_names,
                                   sample_inferred_logical_types,
                                   use_standard_tags=use_standard_tags)
    assert full_dict_schema.use_standard_tags == use_standard_tags

    # The subset of columns mentioned in the partial dicts below
    listed_columns = ('id', 'email', 'phone_number', 'signup_date')

    partial_dict_schema = TableSchema(
        sample_column_names,
        sample_inferred_logical_types,
        use_standard_tags={col: True for col in listed_columns})
    assert full_dict_schema.use_standard_tags == partial_dict_schema.use_standard_tags
    assert full_dict_schema == partial_dict_schema

    partial_dict_default_schema = TableSchema(
        sample_column_names,
        sample_inferred_logical_types,
        use_standard_tags={col: False for col in listed_columns})
    assert default_schema.use_standard_tags == partial_dict_default_schema.use_standard_tags
    assert default_schema == partial_dict_default_schema
# Example #29
# 0
def test_add_semantic_tags(sample_column_names, sample_inferred_logical_types):
    """Verify tags added as a list, a string or a set are unioned with existing tags."""
    initial_tags = {"full_name": "tag1", "age": ["numeric", "age"]}
    schema = TableSchema(
        sample_column_names,
        sample_inferred_logical_types,
        semantic_tags=initial_tags,
        use_standard_tags=False,
        index="id",
    )

    additions = {"full_name": ["list_tag"], "age": "str_tag", "id": {"set_tag"}}
    schema.add_semantic_tags(additions)

    expected_tags = {
        "full_name": {"tag1", "list_tag"},
        "age": {"numeric", "age", "str_tag"},
        # "id" is the index, so it is expected to carry the "index" tag as well
        "id": {"set_tag", "index"},
    }
    for col_name, tags in expected_tags.items():
        assert schema.semantic_tags[col_name] == tags
# Example #30
# 0
def test_schema_logical_types(sample_column_names,
                              sample_inferred_logical_types):
    """Verify ``logical_types`` is a dict keyed by column name that mirrors ``columns``."""
    schema = TableSchema(sample_column_names, sample_inferred_logical_types)

    types_by_column = schema.logical_types
    assert isinstance(types_by_column, dict)
    assert set(types_by_column) == set(sample_column_names)

    for col_name, logical_type in types_by_column.items():
        assert logical_type == schema.columns[col_name].logical_type