def test_add_relationship_errors_on_dtype_mismatch(entityset):
    """Adding a relationship between variables of mismatched dtypes raises ValueError.

    Clones the 'log' entity as 'log2', then tries to relate the categorical
    'régions' index to the integer 'session_id' column of the clone.

    Fix: the original repeated the identical ``with pytest.raises(ValueError)``
    block twice; the verbatim duplicate has been removed.
    """
    log_2_df = entityset['log'].df.copy()
    log_variable_types = {
        'id': variable_types.Categorical,
        'session_id': variable_types.Id,
        'product_id': variable_types.Id,
        'datetime': variable_types.Datetime,
        'value': variable_types.Numeric,
        'value_2': variable_types.Numeric,
        'latlong': variable_types.LatLong,
        'latlong2': variable_types.LatLong,
        'value_many_nans': variable_types.Numeric,
        'priority_level': variable_types.Ordinal,
        'purchased': variable_types.Boolean,
        'comments': variable_types.Text
    }
    entityset.entity_from_dataframe(entity_id='log2',
                                    dataframe=log_2_df,
                                    index='id',
                                    variable_types=log_variable_types,
                                    time_index='datetime',
                                    encoding='utf-8')
    with pytest.raises(ValueError):
        mismatch = Relationship(entityset[u'régions']['id'],
                                entityset['log2']['session_id'])
        entityset.add_relationship(mismatch)
def test_add_last_time_indexes():
    """last_time_index results must agree between pandas and Dask entitysets."""
    pd_es = EntitySet(id="pd_es")
    dask_es = EntitySet(id="dask_es")

    session_times = [pd.to_datetime('2019-01-10'),
                     pd.to_datetime('2019-02-03'),
                     pd.to_datetime('2019-01-01'),
                     pd.to_datetime('2017-08-25')]
    sessions = pd.DataFrame({"id": [0, 1, 2, 3],
                             "user": [1, 2, 1, 3],
                             "time": session_times,
                             "strings": ["I am a string", "23", "abcdef ghijk", ""]})
    sessions_dask = dd.from_pandas(sessions, npartitions=2)
    sessions_vtypes = {
        "id": ft.variable_types.Id,
        "user": ft.variable_types.Id,
        "time": ft.variable_types.DatetimeTimeIndex,
        "strings": ft.variable_types.NaturalLanguage
    }

    transaction_times = [pd.to_datetime('2019-01-10 03:53'),
                         pd.to_datetime('2019-01-10 04:12'),
                         pd.to_datetime('2019-02-03 10:34'),
                         pd.to_datetime('2019-01-01 12:35'),
                         pd.to_datetime('2019-01-01 12:49'),
                         pd.to_datetime('2017-08-25 04:53')]
    transactions = pd.DataFrame({"id": [0, 1, 2, 3, 4, 5],
                                 "session_id": [0, 0, 1, 2, 2, 3],
                                 "amount": [1.23, 5.24, 123.52, 67.93, 40.34, 50.13],
                                 "time": transaction_times})
    transactions_dask = dd.from_pandas(transactions, npartitions=2)
    transactions_vtypes = {
        "id": ft.variable_types.Id,
        "session_id": ft.variable_types.Id,
        "amount": ft.variable_types.Numeric,
        "time": ft.variable_types.DatetimeTimeIndex,
    }

    # Dask entities need explicit variable types; pandas can infer them.
    pd_es.entity_from_dataframe(entity_id="sessions", dataframe=sessions,
                                index="id", time_index="time")
    dask_es.entity_from_dataframe(entity_id="sessions", dataframe=sessions_dask,
                                  index="id", time_index="time",
                                  variable_types=sessions_vtypes)
    pd_es.entity_from_dataframe(entity_id="transactions", dataframe=transactions,
                                index="id", time_index="time")
    dask_es.entity_from_dataframe(entity_id="transactions",
                                  dataframe=transactions_dask,
                                  index="id", time_index="time",
                                  variable_types=transactions_vtypes)

    pd_es = pd_es.add_relationship(
        Relationship(pd_es["sessions"]["id"], pd_es["transactions"]["session_id"]))
    dask_es = dask_es.add_relationship(
        Relationship(dask_es["sessions"]["id"], dask_es["transactions"]["session_id"]))

    # No last time index exists until explicitly added.
    assert pd_es['sessions'].last_time_index is None
    assert dask_es['sessions'].last_time_index is None

    pd_es.add_last_time_indexes()
    dask_es.add_last_time_indexes()

    pd.testing.assert_series_equal(pd_es['sessions'].last_time_index.sort_index(),
                                   dask_es['sessions'].last_time_index.compute(),
                                   check_names=False)
def make_ecommerce_entityset(with_integer_time_index=False):
    """Build the test 'ecommerce' entityset.

    Entity graph:

          R        Regions
         / \\
        S   C      Stores, Customers
            |
        S   P      Sessions, Products
         \\ /
          L        Log
    """
    dataframes = make_ecommerce_dataframes(
        with_integer_time_index=with_integer_time_index)
    variable_types = make_variable_types(
        with_integer_time_index=with_integer_time_index)
    time_indexes = make_time_indexes(
        with_integer_time_index=with_integer_time_index)

    es_id = 'ecommerce'
    if with_integer_time_index:
        es_id += "_int_time_index"
    es = EntitySet(id=es_id)

    for entity, df in dataframes.items():
        time_index = time_indexes.get(entity)
        ti_name = None
        secondary = None
        if time_index is not None:
            ti_name = time_index['name']
            secondary = time_index['secondary']
        es.entity_from_dataframe(entity, df, index='id',
                                 variable_types=variable_types[entity],
                                 time_index=ti_name,
                                 secondary_time_index=secondary)

    es.normalize_entity('customers', 'cohorts', 'cohort',
                        additional_variables=['cohort_name'],
                        make_time_index=True,
                        new_entity_time_index='cohort_end')

    es.add_relationships([
        Relationship(es[u'régions']['id'], es['customers'][u'région_id']),
        Relationship(es[u'régions']['id'], es['stores'][u'région_id']),
        Relationship(es['customers']['id'], es['sessions']['customer_id']),
        Relationship(es['sessions']['id'], es['log']['session_id']),
        Relationship(es['products']['id'], es['log']['product_id'])
    ])
    return es
def test_add_relationship_errors_on_dtype_mismatch(es):
    """A relationship across mismatched Pandas dtypes raises a descriptive ValueError."""
    log_variable_types = {
        'id': variable_types.Categorical,
        'session_id': variable_types.Id,
        'product_id': variable_types.Id,
        'datetime': variable_types.Datetime,
        'value': variable_types.Numeric,
        'value_2': variable_types.Numeric,
        'latlong': variable_types.LatLong,
        'latlong2': variable_types.LatLong,
        'value_many_nans': variable_types.Numeric,
        'priority_level': variable_types.Ordinal,
        'purchased': variable_types.Boolean,
        'comments': variable_types.Text
    }
    es.entity_from_dataframe(entity_id='log2',
                             dataframe=es['log'].df.copy(),
                             index='id',
                             variable_types=log_variable_types,
                             time_index='datetime')

    error_text = u'Unable to add relationship because id in customers is Pandas dtype category and session_id in log2 is Pandas dtype int64.'
    with pytest.raises(ValueError, match=error_text):
        mismatch = Relationship(es[u'customers']['id'],
                                es['log2']['session_id'])
        es.add_relationship(mismatch)
def datetime_es():
    """Build a 'fraud_data' entityset of cards and timestamped transactions,
    with last time indexes already added."""
    transaction_times = pd.to_datetime([
        '2011-2-28 04:00',
        '2012-2-28 05:00',
        '2012-2-29 06:00',
        '2012-3-1 08:00',
        '2014-4-1 10:00'
    ])
    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5],
        "card_id": [1, 1, 5, 1, 5],
        "transaction_time": transaction_times,
        "fraud": [True, False, False, False, True]
    })
    cards_df = pd.DataFrame({"id": [1, 2, 3, 4, 5]})

    es = EntitySet(id="fraud_data")
    es = es.entity_from_dataframe(entity_id="transactions",
                                  dataframe=transactions_df,
                                  index="id",
                                  time_index="transaction_time")
    es = es.entity_from_dataframe(entity_id="cards",
                                  dataframe=cards_df,
                                  index="id")
    es = es.add_relationship(Relationship(es["cards"]["id"],
                                          es["transactions"]["card_id"]))
    es.add_last_time_indexes()
    return es
def test_operations_invalidate_metadata(es):
    """Each mutating EntitySet operation must clear the cached data description.

    After every operation ``_data_description`` should be ``None`` and
    accessing ``.metadata`` should lazily regenerate it.

    Fix: the interesting-values guard used ``any(...)`` while its own comment
    states the feature is unsupported for Dask/Koalas — on a mixed entityset
    ``any`` would still take the unsupported path; it now requires ``all``
    entities to be pandas-backed.
    """
    new_es = EntitySet(id="test")
    # test metadata gets created on access
    assert new_es._data_description is None
    assert new_es.metadata is not None  # generated after access
    assert new_es._data_description is not None

    # Dask/Koalas-backed entities require explicit variable types.
    if not isinstance(es['customers'].df, pd.DataFrame):
        customers_vtypes = es["customers"].variable_types
        customers_vtypes['signup_date'] = variable_types.Datetime
    else:
        customers_vtypes = None
    new_es.entity_from_dataframe("customers",
                                 es["customers"].df,
                                 index=es["customers"].index,
                                 variable_types=customers_vtypes)
    if not isinstance(es['sessions'].df, pd.DataFrame):
        sessions_vtypes = es["sessions"].variable_types
    else:
        sessions_vtypes = None
    new_es.entity_from_dataframe("sessions",
                                 es["sessions"].df,
                                 index=es["sessions"].index,
                                 variable_types=sessions_vtypes)
    assert new_es._data_description is None
    assert new_es.metadata is not None
    assert new_es._data_description is not None

    r = Relationship(new_es["customers"]["id"],
                     new_es["sessions"]["customer_id"])
    new_es = new_es.add_relationship(r)
    assert new_es._data_description is None
    assert new_es.metadata is not None
    assert new_es._data_description is not None

    new_es = new_es.normalize_entity("customers", "cohort", "cohort")
    assert new_es._data_description is None
    assert new_es.metadata is not None
    assert new_es._data_description is not None

    new_es.add_last_time_indexes()
    assert new_es._data_description is None
    assert new_es.metadata is not None
    assert new_es._data_description is not None

    # automatically adding interesting values not supported in Dask or Koalas;
    # only run when every entity is pandas-backed (was `any`, which would hit
    # the unsupported path on a mixed entityset)
    if all(isinstance(entity.df, pd.DataFrame) for entity in new_es.entities):
        new_es.add_interesting_values()
        assert new_es._data_description is None
        assert new_es.metadata is not None
        assert new_es._data_description is not None
def test_add_parent_not_index_varible(entityset):
    """Relating a non-index parent variable must raise AttributeError."""
    relationship = Relationship(entityset[u'régions']['language'],
                                entityset['customers'][u'région_id'])
    with pytest.raises(AttributeError):
        entityset.add_relationship(relationship)
def make_ecommerce_entityset(with_integer_time_index=False,
                             base_path=None,
                             save_files=True,
                             file_location='local',
                             split_by_time=False,
                             compressed=False,
                             entityset_type=EntitySet):
    """Build the test 'ecommerce' entityset by loading entity CSV files.

    When reading local saved files, filenames come from make_ecommerce_files;
    otherwise filenames are derived per-entity via entity_filename.

    Fix: ``if entity is 'log'`` compared a string with identity, which is
    implementation-dependent (and a SyntaxWarning on modern CPython); it now
    uses equality.
    """
    if file_location == 'local' and save_files:
        filenames = make_ecommerce_files(with_integer_time_index,
                                         base_path=base_path,
                                         file_location=file_location,
                                         split_by_time=split_by_time,
                                         compressed=compressed)
        entities = filenames.keys()
    else:
        entities = [u'régions', 'stores', 'products', 'customers',
                    'sessions', 'log']
        filenames = {e: entity_filename(e, base_path,
                                        file_location=file_location,
                                        glob=(split_by_time and e == 'log'),
                                        compressed=compressed)
                     for e in entities}

    id = 'ecommerce'
    if with_integer_time_index:
        id += "_int_time_index"
    if split_by_time:
        id += "_glob"

    variable_types = make_variable_types(
        with_integer_time_index=with_integer_time_index)
    time_indexes = make_time_indexes(
        with_integer_time_index=with_integer_time_index)

    es = entityset_type(id=id)
    for entity in entities:
        time_index = time_indexes.get(entity, None)
        ti_name = None
        secondary = None
        if time_index is not None:
            ti_name = time_index['name']
            secondary = time_index['secondary']
        df = pd.read_csv(filenames[entity], encoding='utf-8')
        if entity == "customers":
            df["id"] = pd.Categorical(df['id'])
        if entity == 'sessions':
            # This should be changed back when converted to an EntitySet
            df['customer_id'] = pd.Categorical(df['customer_id'])
        # was `entity is 'log'`: identity comparison of string literals
        if entity == 'log':
            df['latlong'] = df['latlong'].apply(latlong_unstringify)
            df['latlong2'] = df['latlong2'].apply(latlong_unstringify)
        es.entity_from_dataframe(entity, df, index='id',
                                 variable_types=variable_types[entity],
                                 encoding='utf-8',
                                 time_index=ti_name,
                                 secondary_time_index=secondary)

    es.normalize_entity('customers', 'cohorts', 'cohort',
                        additional_variables=['cohort_name'],
                        time_index_reduce='last',
                        make_time_index=True,
                        new_entity_time_index='cohort_end')
    es.add_relationships(
        [Relationship(es[u'régions']['id'], es['customers'][u'région_id']),
         Relationship(es[u'régions']['id'], es['stores'][u'région_id']),
         Relationship(es['customers']['id'], es['sessions']['customer_id']),
         Relationship(es['sessions']['id'], es['log']['session_id']),
         Relationship(es['products']['id'], es['log']['product_id'])])
    return es
def test_add_parent_not_index_varible(es):
    """Using a non-index parent variable in a relationship raises AttributeError."""
    error_text = "Parent variable.*is not the index of entity Entity.*"
    relationship = Relationship(es[u'régions']['language'],
                                es['customers'][u'région_id'])
    with pytest.raises(AttributeError, match=error_text):
        es.add_relationship(relationship)
def make_ecommerce_entityset(with_integer_time_index=False,
                             base_path=None,
                             save_files=True,
                             file_location='local',
                             split_by_time=False,
                             compressed=False,
                             entityset_type=EntitySet):
    """Build the test 'ecommerce' entityset by loading each entity from CSV."""
    if file_location == 'local' and save_files:
        filenames = make_ecommerce_files(with_integer_time_index,
                                         base_path=base_path,
                                         file_location=file_location,
                                         split_by_time=split_by_time,
                                         compressed=compressed)
        entities = filenames.keys()
    else:
        entities = ['regions', 'stores', 'products',
                    'customers', 'sessions', 'log']
        filenames = {e: entity_filename(e, base_path,
                                        file_location=file_location,
                                        glob=(split_by_time and e == 'log'),
                                        compressed=compressed)
                     for e in entities}

    id = 'ecommerce'
    if with_integer_time_index:
        id += "_int_time_index"
    if split_by_time:
        id += "_glob"

    variable_types = make_variable_types(
        with_integer_time_index=with_integer_time_index)
    time_indexes = make_time_indexes(
        with_integer_time_index=with_integer_time_index)

    es = entityset_type(id=id)
    for entity in entities:
        time_index = time_indexes.get(entity)
        ti_name = None
        secondary = None
        if time_index is not None:
            ti_name = time_index['name']
            secondary = time_index['secondary']
        es.entity_from_csv(entity,
                           filenames[entity],
                           index='id',
                           variable_types=variable_types[entity],
                           encoding='utf-8',
                           time_index=ti_name,
                           secondary_time_index=secondary)

    es.normalize_entity('customers', 'cohorts', 'cohort',
                        additional_variables=['cohort_name'],
                        time_index_reduce='last',
                        make_time_index=True,
                        new_entity_time_index='cohort_end')
    es.add_relationships([
        Relationship(es['regions']['id'], es['customers']['region_id']),
        Relationship(es['regions']['id'], es['stores']['region_id']),
        Relationship(es['customers']['id'], es['sessions']['customer_id']),
        Relationship(es['sessions']['id'], es['log']['session_id']),
        Relationship(es['products']['id'], es['log']['product_id'])
    ])
    return es