Example #1
def test_extra_woodwork_params(es):
    new_es = EntitySet()

    sessions_df = es["sessions"].ww.copy()

    assert sessions_df.ww.index == "id"
    assert sessions_df.ww.time_index is None
    assert isinstance(sessions_df.ww.logical_types["id"], Integer)

    warning_msg = (
        "A Woodwork-initialized DataFrame was provided, so the following parameters were ignored: "
        "index, time_index, logical_types, make_index, semantic_tags, already_sorted"
    )
    with pytest.warns(UserWarning, match=warning_msg):
        new_es.add_dataframe(
            dataframe_name="sessions",
            dataframe=sessions_df,
            index="filepath",
            time_index="customer_id",
            logical_types={"id": Categorical},
            make_index=True,
            already_sorted=True,
            semantic_tags={"id": "new_tag"},
        )
    assert sessions_df.ww.index == "id"
    assert sessions_df.ww.time_index is None
    assert isinstance(sessions_df.ww.logical_types["id"], Integer)
    assert "new_tag" not in sessions_df.ww.semantic_tags
Example #2
def test_init_es_with_relationships(pd_df):
    second_df = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "first_table_id": [1, 2, 2, 1]
    })

    pd_df.ww.init(name="first_table", index="id")
    second_df.ww.init(name="second_table", index="id")

    es = EntitySet(
        "es",
        dataframes={
            "first_table": (pd_df, ),
            "second_table": (second_df, )
        },
        relationships=[("first_table", "id", "second_table", "first_table_id")
                       ],
    )

    assert len(es.relationships) == 1

    forward_dataframes = [
        name for name, _ in es.get_forward_dataframes("second_table")
    ]
    assert forward_dataframes[0] == "first_table"

    relationship = es.relationships[0]
    assert "foreign_key" in relationship.child_column.ww.semantic_tags
    assert "index" in relationship.parent_column.ww.semantic_tags
Example #3
def test_init_es_with_relationships(pd_df):
    second_df = pd.DataFrame({
        'id': [0, 1, 2, 3],
        'first_table_id': [1, 2, 2, 1]
    })

    pd_df.ww.init(name='first_table', index='id')
    second_df.ww.init(name='second_table', index='id')

    es = EntitySet('es',
                   dataframes={
                       'first_table': (pd_df, ),
                       'second_table': (second_df, )
                   },
                   relationships=[('first_table', 'id', 'second_table',
                                   'first_table_id')])

    assert len(es.relationships) == 1

    forward_dataframes = [
        name for name, _ in es.get_forward_dataframes('second_table')
    ]
    assert forward_dataframes[0] == 'first_table'

    relationship = es.relationships[0]
    assert 'foreign_key' in relationship.child_column.ww.semantic_tags
    assert 'index' in relationship.parent_column.ww.semantic_tags
Example #4
def test_extra_woodwork_params(es):
    new_es = EntitySet()

    sessions_df = es['sessions'].ww.copy()

    assert sessions_df.ww.index == 'id'
    assert sessions_df.ww.time_index is None
    assert isinstance(sessions_df.ww.logical_types['id'], Integer)

    warning_msg = (
        'A Woodwork-initialized DataFrame was provided, so the following parameters were ignored: '
        'index, time_index, logical_types, make_index, semantic_tags, already_sorted'
    )
    with pytest.warns(UserWarning, match=warning_msg):
        new_es.add_dataframe(dataframe_name='sessions',
                             dataframe=sessions_df,
                             index='filepath',
                             time_index='customer_id',
                             logical_types={'id': Categorical},
                             make_index=True,
                             already_sorted=True,
                             semantic_tags={'id': 'new_tag'})
    assert sessions_df.ww.index == 'id'
    assert sessions_df.ww.time_index is None
    assert isinstance(sessions_df.ww.logical_types['id'], Integer)
    assert 'new_tag' not in sessions_df.ww.semantic_tags
Example #5
def test_dataframe_without_name(es):
    new_es = EntitySet()

    new_df = es['sessions'].copy()

    assert new_df.ww.schema is None

    error = 'Cannot add dataframe to EntitySet without a name. Please provide a value for the dataframe_name parameter.'
    with pytest.raises(ValueError, match=error):
        new_es.add_dataframe(new_df)
Example #6
def test_woodwork_dataframe_same_name_parameter(es):
    new_es = EntitySet()

    new_df = es['sessions'].ww.copy()
    new_df.ww._schema.name = 'df_name'

    assert new_df.ww.name == 'df_name'

    new_es.add_dataframe(new_df, dataframe_name='df_name')

    assert new_es['df_name'].ww.name == 'df_name'
Example #7
def test_woodwork_dataframe_without_name_errors(es):
    new_es = EntitySet()

    new_df = es['sessions'].ww.copy()
    new_df.ww._schema.name = None

    assert new_df.ww.name is None

    error = 'Cannot add a Woodwork DataFrame to EntitySet without a name'
    with pytest.raises(ValueError, match=error):
        new_es.add_dataframe(new_df)
Example #8
def test_woodwork_dataframe_same_name_parameter(es):
    new_es = EntitySet()

    new_df = es["sessions"].ww.copy()
    new_df.ww._schema.name = "df_name"

    assert new_df.ww.name == "df_name"

    new_es.add_dataframe(new_df, dataframe_name="df_name")

    assert new_es["df_name"].ww.name == "df_name"
Example #9
def test_add_secondary_time_index(dates_df):
    dates_df.ww.init(name="dates_table",
                     index="backwards_order",
                     time_index="dates_backwards")
    es = EntitySet("es")
    es.add_dataframe(
        dates_df,
        secondary_time_index={"repeating_dates": ["random_order", "special"]})

    assert dates_df.ww.metadata["secondary_time_index"] == {
        "repeating_dates": ["random_order", "special", "repeating_dates"]
    }
Example #10
def test_dataframe_with_name_parameter(es):
    new_es = EntitySet()

    new_df = es["sessions"][["id"]]

    assert new_df.ww.schema is None

    new_es.add_dataframe(new_df,
                         dataframe_name="df_name",
                         index="id",
                         logical_types={"id": "Integer"})
    assert new_es["df_name"].ww.name == "df_name"
Example #11
def test_dataframe_with_name_parameter(es):
    new_es = EntitySet()

    new_df = es['sessions'][['id']]

    assert new_df.ww.schema is None

    new_es.add_dataframe(new_df,
                         dataframe_name='df_name',
                         index='id',
                         logical_types={'id': 'Integer'})
    assert new_es['df_name'].ww.name == 'df_name'
Example #12
def test_add_secondary_time_index(dates_df):
    dates_df.ww.init(name='dates_table',
                     index='backwards_order',
                     time_index='dates_backwards')
    es = EntitySet('es')
    es.add_dataframe(
        dates_df,
        secondary_time_index={'repeating_dates': ['random_order', 'special']})

    assert dates_df.ww.metadata['secondary_time_index'] == {
        'repeating_dates': ['random_order', 'special', 'repeating_dates']
    }
Example #13
def test_woodwork_dataframe_ignore_conflicting_name_parameter_warning(es):
    new_es = EntitySet()

    new_df = es['sessions'].ww.copy()
    new_df.ww._schema.name = 'df_name'

    assert new_df.ww.name == 'df_name'

    warning = 'A Woodwork-initialized DataFrame was provided, so the following parameters were ignored: dataframe_name'
    with pytest.warns(UserWarning, match=warning):
        new_es.add_dataframe(new_df, dataframe_name='conflicting_name')

    assert new_es['df_name'].ww.name == 'df_name'
Example #14
def test_time_type_check_order(dates_df):
    dates_df.ww.init(name='dates_table',
                     index='backwards_order',
                     time_index='random_order')
    es = EntitySet('es')

    error = 'dates_table time index is Datetime type which differs from other entityset time indexes'
    with pytest.raises(TypeError, match=error):
        es.add_dataframe(dates_df,
                         secondary_time_index={
                             'repeating_dates': ['random_order', 'special']
                         })

    assert 'secondary_time_index' not in dates_df.ww.metadata
Example #15
def test_init_es_with_multiple_dataframes(pd_df):
    second_df = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "first_table_id": [1, 2, 2, 1]
    })

    pd_df.ww.init(name="first_table", index="id")

    es = EntitySet(
        "es",
        dataframes={
            "first_table": (pd_df, ),
            "second_table": (
                second_df,
                "id",
                None,
                None,
                {
                    "first_table_id": "foreign_key"
                },
            ),
        },
    )

    assert len(es.dataframe_dict) == 2
    assert es["first_table"].ww.schema is not None
    assert es["second_table"].ww.schema is not None
Example #16
def test_time_type_check_order(dates_df):
    dates_df.ww.init(name="dates_table",
                     index="backwards_order",
                     time_index="random_order")
    es = EntitySet("es")

    error = "dates_table time index is Datetime type which differs from other entityset time indexes"
    with pytest.raises(TypeError, match=error):
        es.add_dataframe(
            dates_df,
            secondary_time_index={
                "repeating_dates": ["random_order", "special"]
            },
        )

    assert "secondary_time_index" not in dates_df.ww.metadata
Example #17
def read_entityset(path, load_data=True):
    from featuretools.entityset.entityset import EntitySet
    data_root = os.path.abspath(os.path.expanduser(path))
    with open(os.path.join(data_root, 'metadata.json')) as f:
        metadata = json.load(f)
    if not load_data:
        data_root = None
    return EntitySet.from_metadata(metadata, data_root=data_root)
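A usage sketch for the helper above; the path is a placeholder for a directory containing a metadata.json written by the matching serialization code:

# Rebuild the EntitySet along with its data
es = read_entityset("/path/to/saved_entityset")

# Rebuild only the schema/metadata, skipping the stored data
es_metadata_only = read_entityset("/path/to/saved_entityset", load_data=False)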
Example #18
def test_change_es_dataframe_schema(df):
    df.ww.init(index="id", name="table")
    es = EntitySet("es", dataframes={"table": (df, )})

    assert es["table"].ww.index == "id"

    es["table"].ww.set_index("category")
    assert es["table"].ww.index == "category"
Example #19
def test_change_es_dataframe_schema(df):
    df.ww.init(index='id', name='table')
    es = EntitySet('es', dataframes={'table': (df, )})

    assert es['table'].ww.index == 'id'

    es['table'].ww.set_index('category')
    assert es['table'].ww.index == 'category'
Example #21
def test_init_es_with_dataframe(df):
    es = EntitySet('es', dataframes={'table': (df, 'id')})
    assert es.id == 'es'
    assert len(es.dataframe_dict) == 1
    assert es['table'] is df

    assert es['table'].ww.schema is not None
    assert isinstance(es['table'].ww.logical_types['id'], Integer)
    assert isinstance(es['table'].ww.logical_types['category'], Categorical)
Example #22
def test_init_es_with_dataframe(df):
    es = EntitySet("es", dataframes={"table": (df, "id")})
    assert es.id == "es"
    assert len(es.dataframe_dict) == 1
    assert es["table"] is df

    assert es["table"].ww.schema is not None
    assert isinstance(es["table"].ww.logical_types["id"], Integer)
    assert isinstance(es["table"].ww.logical_types["category"], Categorical)
Example #23
def test_int_double_time_type(dates_df):
    dates_df.ww.init(name='dates_table',
                     index='backwards_order',
                     time_index='random_order',
                     logical_types={
                         'random_order': 'Integer',
                         'special': 'Double'
                     })
    es = EntitySet('es')

    # Both random_order and special are numeric, but they are different logical types
    es.add_dataframe(dates_df,
                     secondary_time_index={'special': ['dates_backwards']})

    assert isinstance(es['dates_table'].ww.logical_types['random_order'],
                      Integer)
    assert isinstance(es['dates_table'].ww.logical_types['special'], Double)

    assert es['dates_table'].ww.time_index == 'random_order'
    assert 'special' in es['dates_table'].ww.metadata['secondary_time_index']
Example #24
def test_normalize_ww_init():
    es = EntitySet()
    df = pd.DataFrame({
        'id': [1, 2, 3, 4],
        'col': ['a', 'b', 'c', 'd'],
        'df2_id': [1, 1, 2, 2],
        'df2_col': [True, False, True, True]
    })

    df.ww.init(index='id', name='test_name')
    es.add_dataframe(dataframe=df)

    assert es['test_name'].ww.name == 'test_name'
    assert es['test_name'].ww.schema.name == 'test_name'

    es.normalize_dataframe('test_name',
                           'new_df',
                           'df2_id',
                           additional_columns=['df2_col'])

    assert es['test_name'].ww.name == 'test_name'
    assert es['test_name'].ww.schema.name == 'test_name'

    assert es['new_df'].ww.name == 'new_df'
    assert es['new_df'].ww.schema.name == 'new_df'
Example #25
def test_replace_dataframe_different_dataframe_types():
    dask_es = EntitySet(id="dask_es")

    sessions = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "user": [1, 2, 1, 3],
        "time": [
            pd.to_datetime('2019-01-10'),
            pd.to_datetime('2019-02-03'),
            pd.to_datetime('2019-01-01'),
            pd.to_datetime('2017-08-25')
        ],
        "strings": ["I am a string", "23", "abcdef ghijk", ""]
    })
    sessions_dask = dd.from_pandas(sessions, npartitions=2)
    sessions_logical_types = {
        "id": Integer,
        "user": Integer,
        "time": Datetime,
        "strings": NaturalLanguage
    }
    sessions_semantic_tags = {'user': '******'}

    dask_es.add_dataframe(dataframe_name="sessions",
                          dataframe=sessions_dask,
                          index="id",
                          time_index="time",
                          logical_types=sessions_logical_types,
                          semantic_tags=sessions_semantic_tags)

    with pytest.raises(TypeError, match='Incorrect DataFrame type used'):
        dask_es.replace_dataframe('sessions', sessions)
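Examples #25 and #26 also depend on optional dataframe backends and a small test helper. A sketch of the imports they appear to assume (to_pandas is a test utility that converts a Dask/Koalas dataframe back to pandas for assertions; it is not part of the public API):

import dask.dataframe as dd  # provides dd.from_pandas for the Dask-backed dataframe
from woodwork.logical_types import Datetime, Integer, NaturalLanguage

try:
    import databricks.koalas as ks  # optional Koalas backend checked in Example #26
except ImportError:
    ks = None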
Example #26
def test_replace_dataframe_data_transformation(latlong_df):
    initial_df = latlong_df.copy()
    initial_df.ww.init(
        name='latlongs',
        index='string_tuple',
        logical_types={col_name: 'LatLong'
                       for col_name in initial_df.columns})
    es = EntitySet()
    es.add_dataframe(dataframe=initial_df)

    df = to_pandas(es['latlongs'])
    expected_val = (1, 2)
    if ks and isinstance(es['latlongs'], ks.DataFrame):
        expected_val = [1, 2]
    for col in latlong_df.columns:
        series = df[col]
        assert series.iloc[0] == expected_val

    es.replace_dataframe('latlongs', latlong_df)
    df = to_pandas(es['latlongs'])
    expected_val = (3, 4)
    if ks and isinstance(es['latlongs'], ks.DataFrame):
        expected_val = [3, 4]
    for col in latlong_df.columns:
        series = df[col]
        assert series.iloc[-1] == expected_val
Example #27
def test_normalize_ww_init():
    es = EntitySet()
    df = pd.DataFrame({
        "id": [1, 2, 3, 4],
        "col": ["a", "b", "c", "d"],
        "df2_id": [1, 1, 2, 2],
        "df2_col": [True, False, True, True],
    })

    df.ww.init(index="id", name="test_name")
    es.add_dataframe(dataframe=df)

    assert es["test_name"].ww.name == "test_name"
    assert es["test_name"].ww.schema.name == "test_name"

    es.normalize_dataframe("test_name",
                           "new_df",
                           "df2_id",
                           additional_columns=["df2_col"])

    assert es["test_name"].ww.name == "test_name"
    assert es["test_name"].ww.schema.name == "test_name"

    assert es["new_df"].ww.name == "new_df"
    assert es["new_df"].ww.schema.name == "new_df"
Example #28
def test_normalize_dataframe():
    df = pd.DataFrame({
        'id': range(4),
        'full_name': ['Mr. John Doe', 'Doe, Mrs. Jane', 'James Brown', 'Ms. Paige Turner'],
        'email': ['*****@*****.**', np.nan, '*****@*****.**', '*****@*****.**'],
        'phone_number': ['5555555555', '555-555-5555', '1-(555)-555-5555', '555-555-5555'],
        'age': pd.Series([33, None, 33, 57], dtype='Int64'),
        'signup_date': [pd.to_datetime('2020-09-01')] * 4,
        'is_registered': pd.Series([True, False, True, None], dtype='boolean'),
    })

    df.ww.init(name='first_table', index='id', time_index='signup_date')
    es = EntitySet('es')
    es.add_dataframe(df)
    es.normalize_dataframe('first_table',
                           'second_table',
                           'age',
                           additional_columns=['phone_number', 'full_name'],
                           make_time_index=True)
    assert len(es.dataframe_dict) == 2
    assert 'foreign_key' in es['first_table'].ww.semantic_tags['age']
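As a follow-up, the result of normalize_dataframe can be inspected directly. The checks below could be appended to the test body above; they are a sketch based on the behavior exercised here, assuming the additional columns move out of first_table and into the new second_table, which is indexed on age:

    assert es['second_table'].ww.index == 'age'
    assert 'phone_number' in es['second_table'].columns
    assert 'full_name' in es['second_table'].columns
    assert 'phone_number' not in es['first_table'].columns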
Example #29
def test_replace_dataframe():
    df = pd.DataFrame({
        'id': range(4),
        'full_name': ['Mr. John Doe', 'Doe, Mrs. Jane', 'James Brown', 'Ms. Paige Turner'],
        'email': ['*****@*****.**', np.nan, '*****@*****.**', '*****@*****.**'],
        'phone_number': ['5555555555', '555-555-5555', '1-(555)-555-5555', '555-555-5555'],
        'age': pd.Series([33, None, 33, 57], dtype='Int64'),
        'signup_date': [pd.to_datetime('2020-09-01')] * 4,
        'is_registered': pd.Series([True, False, True, None], dtype='boolean'),
    })

    df.ww.init(name='table', index='id')
    es = EntitySet('es')
    es.add_dataframe(df)
    original_schema = es['table'].ww.schema

    new_df = df.iloc[2:]
    es.replace_dataframe('table', new_df)

    assert len(es['table']) == 2
    assert es['table'].ww.schema == original_schema
Example #30
def test_int_double_time_type(dates_df):
    dates_df.ww.init(
        name="dates_table",
        index="backwards_order",
        time_index="random_order",
        logical_types={
            "random_order": "Integer",
            "special": "Double"
        },
    )
    es = EntitySet("es")

    # Both random_order and special are numeric, but they are different logical types
    es.add_dataframe(dates_df,
                     secondary_time_index={"special": ["dates_backwards"]})

    assert isinstance(es["dates_table"].ww.logical_types["random_order"],
                      Integer)
    assert isinstance(es["dates_table"].ww.logical_types["special"], Double)

    assert es["dates_table"].ww.time_index == "random_order"
    assert "special" in es["dates_table"].ww.metadata["secondary_time_index"]
Example #31
def test_init_es_with_woodwork_table_same_name(df):
    df.ww.init(index='id', name='table')
    es = EntitySet('es', dataframes={'table': (df, )})

    assert es.id == 'es'
    assert len(es.dataframe_dict) == 1
    assert es['table'] is df

    assert es['table'].ww.schema is not None

    assert es['table'].ww.index == 'id'
    assert es['table'].ww.time_index is None

    assert isinstance(es['table'].ww.logical_types['id'], Integer)
    assert isinstance(es['table'].ww.logical_types['category'], Categorical)