def test_normalize_ww_init():
    """Woodwork names match the EntitySet names before and after normalizing."""
    source_df = pd.DataFrame(
        {
            "id": [1, 2, 3, 4],
            "col": ["a", "b", "c", "d"],
            "df2_id": [1, 1, 2, 2],
            "df2_col": [True, False, True, True],
        }
    )
    source_df.ww.init(index="id", name="test_name")

    es = EntitySet()
    es.add_dataframe(dataframe=source_df)

    # The name given at Woodwork init is what the EntitySet keys the frame by.
    assert es["test_name"].ww.name == "test_name"
    assert es["test_name"].ww.schema.name == "test_name"

    es.normalize_dataframe(
        "test_name",
        "new_df",
        "df2_id",
        additional_columns=["df2_col"],
    )

    # Both the base frame and the newly created frame must report their own name.
    for frame_name in ("test_name", "new_df"):
        assert es[frame_name].ww.name == frame_name
        assert es[frame_name].ww.schema.name == frame_name
def test_normalize_ww_init():
    """Check ``ww.name`` and ``ww.schema.name`` around ``normalize_dataframe``."""
    raw_columns = {
        'id': [1, 2, 3, 4],
        'col': ['a', 'b', 'c', 'd'],
        'df2_id': [1, 1, 2, 2],
        'df2_col': [True, False, True, True],
    }
    base_df = pd.DataFrame(raw_columns)
    base_df.ww.init(index='id', name='test_name')

    es = EntitySet()
    es.add_dataframe(dataframe=base_df)

    base_name = 'test_name'
    assert es[base_name].ww.name == base_name
    assert es[base_name].ww.schema.name == base_name

    # Split df2_id (plus df2_col) out into a second dataframe.
    es.normalize_dataframe(base_name, 'new_df', 'df2_id', additional_columns=['df2_col'])

    assert es[base_name].ww.name == base_name
    assert es[base_name].ww.schema.name == base_name

    derived_name = 'new_df'
    assert es[derived_name].ww.name == derived_name
    assert es[derived_name].ww.schema.name == derived_name
def test_extra_woodwork_params(es):
    """Typing parameters passed with a Woodwork-initialized DataFrame only warn.

    The copied ``sessions`` schema is checked before and after the call to
    confirm that none of the extra parameters were applied.
    """
    sessions_df = es['sessions'].ww.copy()

    def assert_schema_unchanged():
        assert sessions_df.ww.index == 'id'
        assert sessions_df.ww.time_index is None
        assert isinstance(sessions_df.ww.logical_types['id'], Integer)

    assert_schema_unchanged()

    warning_prefix = 'A Woodwork-initialized DataFrame was provided, so the following parameters were ignored: '
    ignored_names = 'index, time_index, logical_types, make_index, semantic_tags, already_sorted'
    warning_msg = warning_prefix + ignored_names

    # Every value below conflicts with the existing schema on purpose.
    conflicting_params = dict(
        index='filepath',
        time_index='customer_id',
        logical_types={'id': Categorical},
        make_index=True,
        already_sorted=True,
        semantic_tags={'id': 'new_tag'},
    )

    new_es = EntitySet()
    with pytest.warns(UserWarning, match=warning_msg):
        new_es.add_dataframe(
            dataframe_name='sessions',
            dataframe=sessions_df,
            **conflicting_params,
        )

    assert_schema_unchanged()
    assert 'new_tag' not in sessions_df.ww.semantic_tags
def test_replace_dataframe():
    """Replacing a dataframe with a two-row slice keeps the Woodwork schema."""
    column_data = {
        "id": range(4),
        "full_name": [
            "Mr. John Doe",
            "Doe, Mrs. Jane",
            "James Brown",
            "Ms. Paige Turner",
        ],
        "email": ["*****@*****.**", np.nan, "*****@*****.**", "*****@*****.**"],
        "phone_number": [
            "5555555555",
            "555-555-5555",
            "1-(555)-555-5555",
            "555-555-5555",
        ],
        "age": pd.Series([33, None, 33, 57], dtype="Int64"),
        "signup_date": [pd.to_datetime("2020-09-01")] * 4,
        "is_registered": pd.Series([True, False, True, None], dtype="boolean"),
    }
    df = pd.DataFrame(column_data)
    df.ww.init(name="table", index="id")

    es = EntitySet("es")
    es.add_dataframe(df)
    schema_before = es["table"].ww.schema

    # Keep only the last two rows and swap them in for the original data.
    trimmed_df = df.iloc[2:]
    es.replace_dataframe("table", trimmed_df)

    assert len(es["table"]) == 2
    assert es["table"].ww.schema == schema_before
def test_extra_woodwork_params(es):
    """``add_dataframe`` warns about, and does not apply, extra typing parameters.

    The dataframe passed in is already Woodwork-initialized; its index, time
    index, logical types and semantic tags are asserted to be unchanged.
    """
    target_es = EntitySet()
    sessions_df = es["sessions"].ww.copy()

    # Baseline schema before the call.
    assert sessions_df.ww.index == "id"
    assert sessions_df.ww.time_index is None
    assert isinstance(sessions_df.ww.logical_types["id"], Integer)

    ignored = ", ".join(
        [
            "index",
            "time_index",
            "logical_types",
            "make_index",
            "semantic_tags",
            "already_sorted",
        ]
    )
    warning_msg = (
        "A Woodwork-initialized DataFrame was provided, so the following parameters were ignored: "
        + ignored
    )

    with pytest.warns(UserWarning, match=warning_msg):
        target_es.add_dataframe(
            dataframe_name="sessions",
            dataframe=sessions_df,
            index="filepath",
            time_index="customer_id",
            logical_types={"id": Categorical},
            make_index=True,
            already_sorted=True,
            semantic_tags={"id": "new_tag"},
        )

    # Same schema after the call, and the extra tag was not added.
    assert sessions_df.ww.index == "id"
    assert sessions_df.ww.time_index is None
    assert isinstance(sessions_df.ww.logical_types["id"], Integer)
    assert "new_tag" not in sessions_df.ww.semantic_tags
def test_replace_dataframe_data_transformation(latlong_df):
    """LatLong values compare equal to pairs both after add and after replace.

    The first row is expected to be (1, 2) after the initial add and the last
    row (3, 4) after ``replace_dataframe``; when the stored frame is a
    ``ks.DataFrame`` the expected values are lists instead of tuples.
    """
    seed_df = latlong_df.copy()
    seed_df.ww.init(
        name="latlongs",
        index="string_tuple",
        logical_types={column: "LatLong" for column in seed_df.columns},
    )

    es = EntitySet()
    es.add_dataframe(dataframe=seed_df)

    def expected_pair(first, second):
        # ks may be falsy when the optional library is unavailable.
        if ks and isinstance(es["latlongs"], ks.DataFrame):
            return [first, second]
        return (first, second)

    as_pandas = to_pandas(es["latlongs"])
    want = expected_pair(1, 2)
    for column in latlong_df.columns:
        assert as_pandas[column].iloc[0] == want

    # Swap in the un-initialized fixture and re-check the last row.
    es.replace_dataframe("latlongs", latlong_df)

    as_pandas = to_pandas(es["latlongs"])
    want = expected_pair(3, 4)
    for column in latlong_df.columns:
        assert as_pandas[column].iloc[-1] == want
def test_replace_dataframe_different_dataframe_types():
    """Replacing a ``dd.from_pandas`` dataframe with a pandas one raises TypeError."""
    session_times = [
        pd.to_datetime(day)
        for day in ('2019-01-10', '2019-02-03', '2019-01-01', '2017-08-25')
    ]
    sessions = pd.DataFrame(
        {
            'id': [0, 1, 2, 3],
            'user': [1, 2, 1, 3],
            'time': session_times,
            'strings': ['I am a string', '23', 'abcdef ghijk', ''],
        }
    )
    sessions_dask = dd.from_pandas(sessions, npartitions=2)

    dask_es = EntitySet(id='dask_es')
    dask_es.add_dataframe(
        dataframe_name='sessions',
        dataframe=sessions_dask,
        index='id',
        time_index='time',
        logical_types={
            'id': Integer,
            'user': Integer,
            'time': Datetime,
            'strings': NaturalLanguage,
        },
        semantic_tags={'user': '******'},
    )

    # The replacement is the plain pandas frame, not the dd one that was added.
    with pytest.raises(TypeError, match='Incorrect DataFrame type used'):
        dask_es.replace_dataframe('sessions', sessions)
def test_normalize_dataframe():
    """Normalizing on ``age`` adds a second dataframe and tags ``age`` as a foreign key."""
    people = {
        "id": range(4),
        "full_name": [
            "Mr. John Doe",
            "Doe, Mrs. Jane",
            "James Brown",
            "Ms. Paige Turner",
        ],
        "email": ["*****@*****.**", np.nan, "*****@*****.**", "*****@*****.**"],
        "phone_number": [
            "5555555555",
            "555-555-5555",
            "1-(555)-555-5555",
            "555-555-5555",
        ],
        "age": pd.Series([33, None, 33, 57], dtype="Int64"),
        "signup_date": [pd.to_datetime("2020-09-01")] * 4,
        "is_registered": pd.Series([True, False, True, None], dtype="boolean"),
    }
    df = pd.DataFrame(people)
    df.ww.init(name="first_table", index="id", time_index="signup_date")

    es = EntitySet("es")
    es.add_dataframe(df)

    moved_columns = ["phone_number", "full_name"]
    es.normalize_dataframe(
        "first_table",
        "second_table",
        "age",
        additional_columns=moved_columns,
        make_time_index=True,
    )

    assert len(es.dataframe_dict) == 2
    age_tags = es["first_table"].ww.semantic_tags["age"]
    assert "foreign_key" in age_tags
def test_dataframe_without_name(es):
    """A dataframe with no Woodwork schema and no ``dataframe_name`` is rejected."""
    unnamed_df = es["sessions"].copy()
    # The plain copy carries no Woodwork schema, so it has no name to fall back on.
    assert unnamed_df.ww.schema is None

    expected_message = (
        "Cannot add dataframe to EntitySet without a name. "
        "Please provide a value for the dataframe_name parameter."
    )
    target_es = EntitySet()
    with pytest.raises(ValueError, match=expected_message):
        target_es.add_dataframe(unnamed_df)
def test_woodwork_dataframe_without_name_errors(es):
    """A Woodwork dataframe whose schema name is ``None`` cannot be added."""
    nameless_df = es["sessions"].ww.copy()
    # Clear the name directly on the private schema object.
    nameless_df.ww._schema.name = None
    assert nameless_df.ww.name is None

    target_es = EntitySet()
    expected_message = "Cannot add a Woodwork DataFrame to EntitySet without a name"
    with pytest.raises(ValueError, match=expected_message):
        target_es.add_dataframe(nameless_df)
def test_woodwork_dataframe_same_name_parameter(es):
    """Passing a ``dataframe_name`` equal to the Woodwork schema name is accepted."""
    shared_name = 'df_name'

    named_df = es['sessions'].ww.copy()
    named_df.ww._schema.name = shared_name
    assert named_df.ww.name == shared_name

    target_es = EntitySet()
    target_es.add_dataframe(named_df, dataframe_name=shared_name)

    assert target_es[shared_name].ww.name == shared_name
def test_woodwork_dataframe_same_name_parameter(es):
    """The dataframe is stored under its name when the parameter agrees with the schema."""
    target_es = EntitySet()

    sessions_copy = es["sessions"].ww.copy()
    # Rename via the private schema so the Woodwork name is "df_name".
    sessions_copy.ww._schema.name = "df_name"
    assert sessions_copy.ww.name == "df_name"

    target_es.add_dataframe(sessions_copy, dataframe_name="df_name")

    stored_name = target_es["df_name"].ww.name
    assert stored_name == "df_name"
def test_dataframe_with_name_parameter(es):
    """A dataframe without a schema takes its name from ``dataframe_name``."""
    id_only_df = es["sessions"][["id"]]
    # Column selection yields a frame with no Woodwork schema attached.
    assert id_only_df.ww.schema is None

    target_es = EntitySet()
    target_es.add_dataframe(
        id_only_df,
        dataframe_name="df_name",
        index="id",
        logical_types={"id": "Integer"},
    )

    assert target_es["df_name"].ww.name == "df_name"
def test_add_secondary_time_index(dates_df):
    """The secondary time index is recorded in the Woodwork metadata.

    The stored column list is expected to end with the secondary time index
    column itself, in addition to the columns that were passed in.
    """
    dates_df.ww.init(
        name="dates_table",
        index="backwards_order",
        time_index="dates_backwards",
    )

    es = EntitySet("es")
    es.add_dataframe(
        dates_df,
        secondary_time_index={"repeating_dates": ["random_order", "special"]},
    )

    # Built separately from the argument above so the comparison is independent.
    expected_metadata = {
        "repeating_dates": ["random_order", "special", "repeating_dates"],
    }
    assert dates_df.ww.metadata["secondary_time_index"] == expected_metadata
def test_dataframe_with_name_parameter(es):
    """``dataframe_name`` becomes the Woodwork name of a schema-less dataframe."""
    target_es = EntitySet()

    subset_df = es['sessions'][['id']]
    assert subset_df.ww.schema is None

    typing_kwargs = {'index': 'id', 'logical_types': {'id': 'Integer'}}
    target_es.add_dataframe(subset_df, dataframe_name='df_name', **typing_kwargs)

    stored_df = target_es['df_name']
    assert stored_df.ww.name == 'df_name'
def test_add_secondary_time_index(dates_df):
    """``add_dataframe`` stores ``secondary_time_index`` under the metadata key.

    The asserted list contains the two passed columns followed by the
    secondary time index column name.
    """
    init_kwargs = dict(
        name='dates_table',
        index='backwards_order',
        time_index='dates_backwards',
    )
    dates_df.ww.init(**init_kwargs)

    es = EntitySet('es')
    secondary = {'repeating_dates': ['random_order', 'special']}
    es.add_dataframe(dates_df, secondary_time_index=secondary)

    stored = dates_df.ww.metadata['secondary_time_index']
    assert stored == {
        'repeating_dates': ['random_order', 'special', 'repeating_dates']
    }
def test_woodwork_dataframe_ignore_conflicting_name_parameter_warning(es):
    """A conflicting ``dataframe_name`` triggers a warning and the schema name is kept."""
    named_df = es["sessions"].ww.copy()
    named_df.ww._schema.name = "df_name"
    assert named_df.ww.name == "df_name"

    expected_warning = (
        "A Woodwork-initialized DataFrame was provided, "
        "so the following parameters were ignored: dataframe_name"
    )
    target_es = EntitySet()
    with pytest.warns(UserWarning, match=expected_warning):
        target_es.add_dataframe(named_df, dataframe_name="conflicting_name")

    # The frame is retrievable under the Woodwork name, not the conflicting one.
    assert target_es["df_name"].ww.name == "df_name"
def test_time_type_check_order(dates_df):
    """A time-type ``TypeError`` from ``add_dataframe`` leaves the metadata untouched.

    After the expected failure, ``secondary_time_index`` must not have been
    written to the dataframe's Woodwork metadata.
    """
    dates_df.ww.init(
        name="dates_table",
        index="backwards_order",
        time_index="random_order",
    )
    es = EntitySet("es")

    expected_message = (
        "dates_table time index is Datetime type which differs "
        "from other entityset time indexes"
    )
    secondary = {"repeating_dates": ["random_order", "special"]}
    with pytest.raises(TypeError, match=expected_message):
        es.add_dataframe(dates_df, secondary_time_index=secondary)

    assert "secondary_time_index" not in dates_df.ww.metadata
def test_init_with_mismatched_time_types(dates_df):
    """A numeric time index is rejected once the EntitySet time type is Datetime."""
    dates_df.ww.init(
        name="dates_table",
        index="backwards_order",
        time_index="repeating_dates",
    )

    es = EntitySet("es")
    es.add_dataframe(dates_df, secondary_time_index={"special_dates": ["special"]})
    assert es.time_type == Datetime

    # Second dataframe uses an integer column as its time index.
    numeric_df = pd.DataFrame({"id": [1, 2, 3], "times": [9, 8, 7]})
    numeric_df.ww.init(name="numerics_table", index="id", time_index="times")

    expected_message = (
        "numerics_table time index is numeric type which differs "
        "from other entityset time indexes"
    )
    with pytest.raises(TypeError, match=expected_message):
        es.add_dataframe(numeric_df)
def test_init_with_mismatched_time_types(dates_df):
    """Adding a numeric-time-index dataframe to a Datetime EntitySet raises TypeError."""
    es = EntitySet('es')

    dates_df.ww.init(name='dates_table', index='backwards_order', time_index='repeating_dates')
    secondary = {'special_dates': ['special']}
    es.add_dataframe(dates_df, secondary_time_index=secondary)

    # The first dataframe establishes the EntitySet-wide time type.
    assert es.time_type == Datetime

    nums_df = pd.DataFrame({'id': [1, 2, 3], 'times': [9, 8, 7]})
    nums_df.ww.init(name='numerics_table', index='id', time_index='times')

    error = 'numerics_table time index is numeric type which differs from other entityset time indexes'
    with pytest.raises(TypeError, match=error):
        es.add_dataframe(nums_df)
def test_time_type_check_order(dates_df):
    """``secondary_time_index`` is absent from metadata after the expected TypeError."""
    es = EntitySet('es')
    dates_df.ww.init(name='dates_table', index='backwards_order', time_index='random_order')

    error = 'dates_table time index is Datetime type which differs from other entityset time indexes'
    with pytest.raises(TypeError, match=error):
        es.add_dataframe(
            dates_df,
            secondary_time_index={'repeating_dates': ['random_order', 'special']},
        )

    # The failed add must not have recorded the secondary time index.
    metadata_keys = dates_df.ww.metadata
    assert 'secondary_time_index' not in metadata_keys
def test_add_dataframe_to_es(df):
    """Adding with explicit parameters and adding a ``ww.copy()`` give equal accessors.

    ``df`` is first added with a name, index and semantic tags; its Woodwork
    copy is then added to a second EntitySet with no extra parameters.
    """
    first_es = EntitySet("es")
    assert first_es.dataframe_dict == {}
    first_es.add_dataframe(
        df,
        dataframe_name="table",
        index="id",
        semantic_tags={"category": "new_tag"},
    )
    assert len(first_es.dataframe_dict) == 1

    # Taken after the first add, so it is copied through the Woodwork accessor.
    df_copy = df.ww.copy()

    second_es = EntitySet("es")
    assert second_es.dataframe_dict == {}
    second_es.add_dataframe(df_copy)
    assert len(second_es.dataframe_dict) == 1

    assert first_es["table"].ww == second_es["table"].ww
def test_add_dataframe_to_es(df):
    """Two EntitySets, one fed ``df`` with parameters and one its Woodwork copy, match."""
    es_with_params = EntitySet('es')
    assert es_with_params.dataframe_dict == {}

    add_kwargs = {
        'dataframe_name': 'table',
        'index': 'id',
        'semantic_tags': {'category': 'new_tag'},
    }
    es_with_params.add_dataframe(df, **add_kwargs)
    assert len(es_with_params.dataframe_dict) == 1

    woodwork_copy = df.ww.copy()

    es_from_copy = EntitySet('es')
    assert es_from_copy.dataframe_dict == {}
    # No name or typing parameters are passed for the copy.
    es_from_copy.add_dataframe(woodwork_copy)
    assert len(es_from_copy.dataframe_dict) == 1

    assert es_with_params['table'].ww == es_from_copy['table'].ww
def test_int_double_time_type(dates_df):
    """An Integer time index and a Double secondary time index are accepted together."""
    numeric_types = {"random_order": "Integer", "special": "Double"}
    dates_df.ww.init(
        name="dates_table",
        index="backwards_order",
        time_index="random_order",
        logical_types=numeric_types,
    )
    es = EntitySet("es")

    # Both random_order and special are numeric, but they are different logical types
    es.add_dataframe(dates_df, secondary_time_index={"special": ["dates_backwards"]})

    logical_types = es["dates_table"].ww.logical_types
    assert isinstance(logical_types["random_order"], Integer)
    assert isinstance(es["dates_table"].ww.logical_types["special"], Double)
    assert es["dates_table"].ww.time_index == "random_order"
    assert "special" in es["dates_table"].ww.metadata["secondary_time_index"]
def test_normalize_dataframe():
    """``normalize_dataframe`` on ``age`` yields two dataframes and a foreign-key tag."""
    names = ['Mr. John Doe', 'Doe, Mrs. Jane', 'James Brown', 'Ms. Paige Turner']
    emails = ['*****@*****.**', np.nan, '*****@*****.**', '*****@*****.**']
    phones = ['5555555555', '555-555-5555', '1-(555)-555-5555', '555-555-5555']

    df = pd.DataFrame({
        'id': range(4),
        'full_name': names,
        'email': emails,
        'phone_number': phones,
        'age': pd.Series([33, None, 33, 57], dtype='Int64'),
        'signup_date': [pd.to_datetime('2020-09-01')] * 4,
        'is_registered': pd.Series([True, False, True, None], dtype='boolean'),
    })
    df.ww.init(name='first_table', index='id', time_index='signup_date')

    es = EntitySet('es')
    es.add_dataframe(df)

    # Move phone_number and full_name into the new table keyed on age.
    es.normalize_dataframe(
        'first_table',
        'second_table',
        'age',
        additional_columns=['phone_number', 'full_name'],
        make_time_index=True,
    )

    assert len(es.dataframe_dict) == 2
    assert 'foreign_key' in es['first_table'].ww.semantic_tags['age']
def test_int_double_time_type(dates_df):
    """Time index and secondary time index may use different numeric logical types."""
    dates_df.ww.init(
        name='dates_table',
        index='backwards_order',
        time_index='random_order',
        logical_types={'random_order': 'Integer', 'special': 'Double'},
    )

    es = EntitySet('es')
    secondary = {'special': ['dates_backwards']}
    # Both random_order and special are numeric, but they are different logical types
    es.add_dataframe(dates_df, secondary_time_index=secondary)

    expected_types = (('random_order', Integer), ('special', Double))
    for column, logical_type in expected_types:
        assert isinstance(es['dates_table'].ww.logical_types[column], logical_type)

    assert es['dates_table'].ww.time_index == 'random_order'
    assert 'special' in es['dates_table'].ww.metadata['secondary_time_index']
def test_replace_dataframe():
    """The stored schema is unchanged after ``replace_dataframe`` with fewer rows."""
    names = ['Mr. John Doe', 'Doe, Mrs. Jane', 'James Brown', 'Ms. Paige Turner']
    emails = ['*****@*****.**', np.nan, '*****@*****.**', '*****@*****.**']
    phones = ['5555555555', '555-555-5555', '1-(555)-555-5555', '555-555-5555']

    df = pd.DataFrame({
        'id': range(4),
        'full_name': names,
        'email': emails,
        'phone_number': phones,
        'age': pd.Series([33, None, 33, 57], dtype='Int64'),
        'signup_date': [pd.to_datetime('2020-09-01')] * 4,
        'is_registered': pd.Series([True, False, True, None], dtype='boolean'),
    })
    df.ww.init(name='table', index='id')

    es = EntitySet('es')
    es.add_dataframe(df)
    original_schema = es['table'].ww.schema

    # Rows 2 and 3 only.
    es.replace_dataframe('table', df.iloc[2:])

    replaced = es['table']
    assert len(replaced) == 2
    assert es['table'].ww.schema == original_schema
def test_add_time_index_through_woodwork_different_type(dates_df):
    """Switching the time index via Woodwork makes the uniformity check fail.

    The dataframe is added with a time index that gives the EntitySet a
    Datetime time type; the time index is then changed through the Woodwork
    accessor and ``_check_uniform_time_index`` is expected to raise.
    """
    dates_df.ww.init(
        name="dates_table",
        index="backwards_order",
        time_index="dates_backwards",
    )
    es = EntitySet("es")
    es.add_dataframe(
        dates_df,
        secondary_time_index={"repeating_dates": ["random_order", "special"]},
    )

    expected_secondary = {
        "repeating_dates": ["random_order", "special", "repeating_dates"],
    }
    assert dates_df.ww.metadata["secondary_time_index"] == expected_secondary
    assert es.time_type == Datetime
    # Passes silently while the time index is still the original one.
    assert es._check_uniform_time_index(es["dates_table"]) is None

    dates_df.ww.set_time_index("random_order")
    assert dates_df.ww.time_index == "random_order"

    expected_message = (
        "dates_table time index is numeric type which differs "
        "from other entityset time indexes"
    )
    with pytest.raises(TypeError, match=expected_message):
        es._check_uniform_time_index(es["dates_table"])
def test_add_time_index_through_woodwork_different_type(dates_df):
    """``_check_uniform_time_index`` raises after the time index is set to ``random_order``."""
    es = EntitySet('es')
    dates_df.ww.init(name='dates_table', index='backwards_order', time_index='dates_backwards')

    secondary = {'repeating_dates': ['random_order', 'special']}
    es.add_dataframe(dates_df, secondary_time_index=secondary)

    stored_secondary = dates_df.ww.metadata['secondary_time_index']
    assert stored_secondary == {
        'repeating_dates': ['random_order', 'special', 'repeating_dates']
    }
    assert es.time_type == Datetime
    assert es._check_uniform_time_index(es['dates_table']) is None

    # Change the time index directly on the dataframe, bypassing the EntitySet.
    dates_df.ww.set_time_index('random_order')
    assert dates_df.ww.time_index == 'random_order'

    error = 'dates_table time index is numeric type which differs from other entityset time indexes'
    with pytest.raises(TypeError, match=error):
        es._check_uniform_time_index(es['dates_table'])