def test_handles_datetime_format():
    """Check that a ``format`` attached to a Datetime variable drives parsing.

    The same day-first strings are loaded into two columns: one whose
    variable type carries the format string and one that does not.
    """
    datetime_format = "%d-%m-%Y"
    expected = pd.Timestamp('Jan 2, 2011')
    # "02-01-2011" is ambiguous: day-first it is Jan 2, month-first it is Feb 1.
    ambiguous_strs = [expected.strftime(datetime_format) for _ in range(3)]
    frame = pd.DataFrame({
        'id': [0, 1, 2],
        'time_format': ambiguous_strs,
        'time_no_format': ambiguous_strs,
    })
    vtypes = {
        'id': variable_types.Categorical,
        'time_format': (variable_types.Datetime, {"format": datetime_format}),
        'time_no_format': variable_types.Datetime,
    }

    es = EntitySet(id='test')
    es.entity_from_dataframe(entity_id='test_entity',
                             index='id',
                             variable_types=vtypes,
                             dataframe=frame)

    with_format = es['test_entity'].df['time_format']
    without_format = es['test_entity'].df['time_no_format']

    # Without the format string every value is parsed as a different date.
    assert (without_format != expected).all()
    # With the format string every value comes back as Jan 2.
    assert (with_format == expected).all()
def test_converts_variable_type_after_init():
    """Convert the string column ``ints`` through several variable types.

    After each ``convert_variable_type`` call the entity must report the new
    variable class; for Numeric and Boolean the column dtype is checked too.
    """
    frame = pd.DataFrame({
        'id': [0, 1, 2],
        'category': ['a', 'b', 'a'],
        'ints': ['1', '2', '1'],
    })
    frame["category"] = frame["category"].astype("category")

    es = EntitySet(id='test')
    es.entity_from_dataframe(entity_id='test_entity', index='id', dataframe=frame)
    entity = es['test_entity']
    # Fetched once, before any conversion; the dtype assertions below read
    # this same dataframe object.
    entity_df = es['test_entity'].df

    entity.convert_variable_type('ints', variable_types.Numeric)
    assert isinstance(entity['ints'], variable_types.Numeric)
    assert entity_df['ints'].dtype.name in variable_types.PandasTypes._pandas_numerics

    # Categorical and Ordinal: only the variable class is checked.
    for target_type in (variable_types.Categorical, variable_types.Ordinal):
        entity.convert_variable_type('ints', target_type)
        assert isinstance(entity['ints'], target_type)

    entity.convert_variable_type('ints', variable_types.Boolean,
                                 true_val=1, false_val=2)
    assert isinstance(entity['ints'], variable_types.Boolean)
    assert entity_df['ints'].dtype.name == 'bool'
def test_check_variables_and_dataframe():
    """Variable types that match the dataframe columns are stored on the entity."""
    frame = pd.DataFrame({'id': [0, 1, 2], 'category': ['a', 'b', 'a']})
    vtypes = dict.fromkeys(['id', 'category'], variable_types.Categorical)

    es = EntitySet(id='test')
    es.entity_from_dataframe('test_entity', frame, index='id', variable_types=vtypes)

    stored_types = es.entity_dict['test_entity'].variable_types
    assert stored_types['category'] == variable_types.Categorical
def test_none_index():
    """With no ``index`` argument, the entity's index is the 'category' column."""
    frame = pd.DataFrame({'category': [1, 2, 3], 'category2': ['1', '2', '3']})
    vtypes = {
        'category': variable_types.Categorical,
        'category2': variable_types.Categorical,
    }

    es = EntitySet(id='test')
    es.entity_from_dataframe(entity_id='test_entity',
                             dataframe=frame,
                             variable_types=vtypes)

    entity = es['test_entity']
    assert entity.index == 'category'
    # The chosen column is typed as Index regardless of the requested vtype.
    assert isinstance(es['test_entity']['category'], variable_types.Index)
def test_bad_time_index_variable():
    """A ``time_index`` naming a column absent from the dataframe raises LookupError."""
    df = pd.DataFrame({'category': ['a', 'b', 'a']})
    error_text = "Time index not found in dataframe"

    # Build the EntitySet outside the ``raises`` block so that only the call
    # under test is allowed to produce the expected exception.
    entityset = EntitySet(id='test')
    with pytest.raises(LookupError, match=error_text):
        entityset.entity_from_dataframe(entity_id='test_entity',
                                        index="id",
                                        dataframe=df,
                                        time_index='time')
def test_handles_datetime_mismatch():
    """Arbitrary strings in a Datetime time index raise ValueError on load."""
    df = pd.DataFrame({'id': [0, 1, 2], 'time': ['a', 'b', 'tomorrow']})
    vtypes = {'id': variable_types.Categorical,
              'time': variable_types.Datetime}

    # Build the EntitySet outside the ``raises`` block so that only the call
    # under test is allowed to produce the expected exception.
    entityset = EntitySet(id='test')
    with pytest.raises(ValueError):
        entityset.entity_from_dataframe('test_entity', df, 'id',
                                        time_index='time',
                                        variable_types=vtypes)
def test_unknown_index():
    """An ``index`` name missing from the dataframe ends up as a 0..2 column."""
    frame = pd.DataFrame({'category': ['a', 'b', 'a']})
    vtypes = {'category': variable_types.Categorical}

    es = EntitySet(id='test')
    es.entity_from_dataframe(entity_id='test_entity',
                             index='id',
                             variable_types=vtypes,
                             dataframe=frame)

    entity = es['test_entity']
    assert entity.index == 'id'
    index_values = es['test_entity'].df['id'].tolist()
    assert index_values == list(range(3))
def test_doesnt_remake_index():
    """``make_index=True`` with an index column already in the dataframe raises."""
    df = pd.DataFrame({'id': [0, 1, 2], 'category': ['a', 'b', 'a']})
    error_text = "Cannot make index: index variable already present"

    # Build the EntitySet outside the ``raises`` block so that only the call
    # under test is allowed to produce the expected exception.
    entityset = EntitySet(id='test')
    with pytest.raises(RuntimeError, match=error_text):
        entityset.entity_from_dataframe(entity_id='test_entity',
                                        index='id',
                                        make_index=True,
                                        dataframe=df)
def test_datetime64_conversion():
    """Converting a tz-aware datetime64 column to a time index must not raise.

    There is no assertion here; the test passes when the conversion call
    completes without an exception.
    """
    frame = pd.DataFrame({'id': [0, 1, 2], 'ints': ['1', '2', '1']})
    frame["time"] = pd.Timestamp.now()
    frame["time"] = frame["time"].astype("datetime64[ns, UTC]")

    es = EntitySet(id='test')
    es.entity_from_dataframe(entity_id='test_entity', index='id', dataframe=frame)

    time_index_type = variable_types.variable.DatetimeTimeIndex
    entity = es['test_entity']
    entity.convert_variable_type('time', time_index_type)
def test_extra_variable_type():
    """A variable type for a column that is not in the dataframe raises LookupError."""
    df = pd.DataFrame({'id': [0, 1, 2], 'category': ['a', 'b', 'a']})
    # 'category2' has no matching column in ``df``.
    vtypes = {'id': variable_types.Categorical,
              'category': variable_types.Categorical,
              'category2': variable_types.Categorical}

    # Build the EntitySet outside the ``raises`` block so that only the call
    # under test is allowed to produce the expected exception.
    entityset = EntitySet(id='test')
    with pytest.raises(LookupError):
        entityset.entity_from_dataframe(entity_id='test_entity',
                                        index='id',
                                        variable_types=vtypes,
                                        dataframe=df)
def test_make_index_variable_ordering():
    """An index created with ``make_index=True`` is the first dataframe column."""
    frame = pd.DataFrame({'id': [0, 1, 2], 'category': ['a', 'b', 'a']})
    vtypes = {
        'id': variable_types.Categorical,
        'category': variable_types.Categorical,
    }

    es = EntitySet(id='test')
    es.entity_from_dataframe(entity_id='test_entity',
                             index='id1',
                             make_index=True,
                             variable_types=vtypes,
                             dataframe=frame)

    columns = es.entity_dict['test_entity'].df.columns
    assert columns[0] == 'id1'
def test_make_time_index_keeps_original_sorting():
    """``normalize_entity`` with ``make_time_index=True`` leaves row order alone.

    The ``trip_id`` column of the 'trips' entity is compared against 0..999
    both before and after normalizing out the 'flights' entity.
    """
    n_rows = 1000
    trips = {
        'trip_id': list(range(n_rows - 1, -1, -1)),      # 999, 998, ..., 0
        'flight_time': [datetime(1997, 4, 1)] * n_rows,  # every row shares one time
        'flight_id': [1] * 350 + [2] * 650,
    }
    order = list(range(n_rows))

    frame = pd.DataFrame.from_dict(trips)
    es = EntitySet('flights')
    es.entity_from_dataframe("trips",
                             dataframe=frame,
                             index="trip_id",
                             time_index='flight_time')
    assert (es['trips'].df['trip_id'] == order).all()

    es.normalize_entity(base_entity_id="trips",
                        new_entity_id="flights",
                        index="flight_id",
                        make_time_index=True)
    assert (es['trips'].df['trip_id'] == order).all()
def test_converts_datetime():
    """Date strings declared as Datetime are loaded as ``pd.Timestamp`` values.

    Per the original author's note, this test fails when ``vtypes`` is not
    supplied because the time column is then inferred as numeric.
    """
    stamps = pd.date_range('1/1/2011', periods=3, freq='H')
    frame = pd.DataFrame({
        'id': [0, 1, 2],
        'time': stamps.strftime('%Y-%m-%d'),
    })
    vtypes = {
        'id': variable_types.Categorical,
        'time': variable_types.Datetime,
    }

    es = EntitySet(id='test')
    es.entity_from_dataframe(entity_id='test_entity',
                             index='id',
                             time_index="time",
                             variable_types=vtypes,
                             dataframe=frame)

    time_col = es['test_entity'].df['time']
    # Disabled upstream:
    # assert type(es['test_entity']['time']) == variable_types.Datetime
    first_value = time_col[0]
    assert type(first_value) == pd.Timestamp
def test_converts_datetime():
    """Strings in a Datetime time-index column come back as ``pd.Timestamp``.

    Per the original author's note, omitting ``vtypes`` makes this test fail
    because the time column is then inferred as a numeric type.
    """
    date_strs = pd.date_range('1/1/2011', periods=3, freq='H').strftime('%Y-%m-%d')
    source = pd.DataFrame({'id': [0, 1, 2], 'time': date_strs})
    vtypes = {'id': variable_types.Categorical, 'time': variable_types.Datetime}

    entityset = EntitySet(id='test')
    entityset.entity_from_dataframe(
        entity_id='test_entity',
        index='id',
        time_index="time",
        variable_types=vtypes,
        dataframe=source,
    )

    loaded_times = entityset['test_entity'].df['time']
    # Disabled upstream:
    # assert type(entityset['test_entity']['time']) == variable_types.Datetime
    assert type(loaded_times[0]) == pd.Timestamp
def test_already_sorted_parameter():
    """``already_sorted=True`` keeps the time index in its input order."""
    # Deliberately not in chronological order.
    unsorted_dates = [
        (2014, 4, 6),
        (2012, 4, 8),
        (2012, 4, 8),
        (2013, 4, 8),
        (2015, 4, 8),
        (2016, 4, 9),
    ]
    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5, 6],
        "transaction_time": [datetime(*ymd) for ymd in unsorted_dates],
    })

    entityset = EntitySet(id='test')
    entityset.entity_from_dataframe('t',
                                    transactions_df,
                                    index='id',
                                    time_index="transaction_time",
                                    already_sorted=True)

    loaded_times = entityset["t"].df.transaction_time.tolist()
    assert loaded_times == transactions_df.transaction_time.tolist()
def test_create_entity_from_dask_df(pd_es):
    """An entity built from a dask copy of the 'log' dataframe matches the original."""
    es = EntitySet(id="dask_es")
    pandas_log = pd_es["log"]
    dask_log_df = dd.from_pandas(pandas_log.df, npartitions=2)

    es = es.entity_from_dataframe(entity_id="log_dask",
                                  dataframe=dask_log_df,
                                  index="id",
                                  time_index="datetime",
                                  variable_types=pd_es["log"].variable_types)

    computed = es["log_dask"].df.compute()
    # check_like=True: row/column order is not part of the comparison.
    pd.testing.assert_frame_equal(pd_es["log"].df, computed, check_like=True)
def test_already_sorted_parameter():
    """Loading with ``already_sorted=True`` does not reorder the time index."""
    ids = [1, 2, 3, 4, 5, 6]
    # (year, month, day) per row; deliberately not chronological.
    ymd_rows = ((2014, 4, 6), (2012, 4, 8), (2012, 4, 8),
                (2013, 4, 8), (2015, 4, 8), (2016, 4, 9))
    timestamps = [datetime(year, month, day) for year, month, day in ymd_rows]
    transactions_df = pd.DataFrame({"id": ids, "transaction_time": timestamps})

    entityset = EntitySet(id='test')
    entityset.entity_from_dataframe(
        't',
        transactions_df,
        index='id',
        time_index="transaction_time",
        already_sorted=True,
    )

    expected = transactions_df.transaction_time.tolist()
    actual = entityset["t"].df.transaction_time.tolist()
    assert actual == expected
def test_calculates_statistics_on_init(self):
    """Check the per-variable statistics that are available right after loading."""
    frame = pd.DataFrame({
        'id': [0, 1, 2],
        'time': [datetime(2011, 4, 9, 10, 31, seconds) for seconds in (0, 3, 6)],
        'category': ['a', 'b', 'a'],
        'number': [4, 5, 6],
        'boolean': [True, False, True],
        'boolean_with_nan': [True, False, np.nan],
    })
    vtypes = {
        'id': variable_types.Categorical,
        'time': variable_types.Datetime,
        'category': variable_types.Categorical,
        'number': variable_types.Numeric,
        'boolean': variable_types.Boolean,
        'boolean_with_nan': variable_types.Boolean,
    }

    es = EntitySet(id='test')
    es.entity_from_dataframe('stats_test_entity', frame, 'id', variable_types=vtypes)
    entity = es["stats_test_entity"]

    # Every one of these variables reports a count.
    for name in ('time', 'category', 'number'):
        assert entity[name].count == 3

    # Datetime and numeric variables do not define nunique or percent_unique.
    for name in ('time', 'number'):
        for missing_stat in ('nunique', 'percent_unique'):
            with pytest.raises(AttributeError):
                getattr(entity[name], missing_stat)

    # The 'id' column is automatically treated as the id.
    assert entity['id'].count == 3

    # Categoricals define nunique and percent_unique.
    category = entity['category']
    assert category.nunique == 2
    assert entity['category'].percent_unique == 2. / 3

    # Booleans define count plus the number of true and false labels.
    assert entity['boolean'].count == 3
    assert entity['boolean'].num_true == 2
    assert entity['boolean'].num_false == 1
def test_single_table_ks_entityset_dates_not_sorted():
    """DFS on a koalas entity with unsorted dates matches the pandas result."""
    ks_es = EntitySet(id="ks_es")
    # Dates are deliberately out of chronological order.
    date_strs = ('2019-01-10', '2019-02-03', '2019-01-01', '2017-08-25')
    df = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "values": [1, 12, -34, 27],
        "dates": [pd.to_datetime(day) for day in date_strs],
    })
    primitives_list = ['absolute', 'is_weekend', 'year', 'day']

    koalas_df = ks.from_pandas(df)
    vtypes = {
        "id": ft.variable_types.Id,
        "values": ft.variable_types.Numeric,
        "dates": ft.variable_types.Datetime,
    }
    ks_es.entity_from_dataframe(entity_id="data",
                                dataframe=koalas_df,
                                index="id",
                                time_index="dates",
                                variable_types=vtypes)
    ks_fm, _ = ft.dfs(entityset=ks_es,
                      target_entity="data",
                      trans_primitives=primitives_list,
                      max_depth=1)

    pd_es = ft.EntitySet(id="pd_es")
    pd_es.entity_from_dataframe(entity_id="data",
                                dataframe=df,
                                index="id",
                                time_index="dates")
    fm, _ = ft.dfs(entityset=pd_es,
                   target_entity="data",
                   trans_primitives=primitives_list,
                   max_depth=1)

    # Put the koalas rows into the pandas matrix's index order before comparing.
    ks_as_pandas = ks_fm.to_pandas().set_index('id')
    pd.testing.assert_frame_equal(fm, ks_as_pandas.loc[fm.index])
def test_create_entity_from_ks_df(pd_es):
    """An entity built from a koalas copy of the cleaned 'log' data round-trips."""
    cleaned_df = pd_to_ks_clean(pd_es["log"].df)
    koalas_log = ks.from_pandas(cleaned_df)

    es = EntitySet(id="ks_es")
    es = es.entity_from_dataframe(entity_id="log_ks",
                                  dataframe=koalas_log,
                                  index="id",
                                  time_index="datetime",
                                  variable_types=pd_es["log"].variable_types)

    round_tripped = es["log_ks"].df.to_pandas()
    # check_like=True: row/column order is not part of the comparison.
    pd.testing.assert_frame_equal(cleaned_df, round_tripped, check_like=True)
def test_converts_variable_types_on_init():
    """String columns declared Numeric are converted to numeric dtypes on load."""
    frame = pd.DataFrame({
        'id': [0, 1, 2],
        'category': ['a', 'b', 'a'],
        'category_int': [1, 2, 3],
        'ints': ['1', '2', '3'],
        'floats': ['1', '2', '3.0'],
    })
    frame["category_int"] = frame["category_int"].astype("category")

    vtypes = {
        'id': variable_types.Categorical,
        'ints': variable_types.Numeric,
        'floats': variable_types.Numeric,
    }

    entityset = EntitySet(id='test')
    entityset.entity_from_dataframe(entity_id='test_entity',
                                    index='id',
                                    variable_types=vtypes,
                                    dataframe=frame)

    loaded = entityset['test_entity'].df
    for numeric_col in ('ints', 'floats'):
        assert loaded[numeric_col].dtype.name in variable_types.PandasTypes._pandas_numerics

    # 'category_int' has no entry in vtypes; its type is inferred from the pandas dtype.
    entity = entityset["test_entity"]
    assert isinstance(entity['category_int'], variable_types.Categorical)
def test_converts_variable_types_on_init():
    """Loading with Numeric vtypes converts the string 'ints' and 'floats' columns."""
    source = pd.DataFrame({'id': [0, 1, 2],
                           'category': ['a', 'b', 'a'],
                           'category_int': [1, 2, 3],
                           'ints': ['1', '2', '3'],
                           'floats': ['1', '2', '3.0']})
    source["category_int"] = source["category_int"].astype("category")
    vtypes = {'id': variable_types.Categorical}
    vtypes['ints'] = variable_types.Numeric
    vtypes['floats'] = variable_types.Numeric

    es = EntitySet(id='test')
    es.entity_from_dataframe(
        entity_id='test_entity',
        index='id',
        variable_types=vtypes,
        dataframe=source,
    )

    entity_df = es['test_entity'].df
    ints_dtype = entity_df['ints'].dtype.name
    assert ints_dtype in variable_types.PandasTypes._pandas_numerics
    floats_dtype = entity_df['floats'].dtype.name
    assert floats_dtype in variable_types.PandasTypes._pandas_numerics

    # Not listed in vtypes: this variable's type is inferred from the pandas dtype.
    category_int_var = es["test_entity"]['category_int']
    assert isinstance(category_int_var, variable_types.Categorical)
def datetime_es():
    """Build the "fraud_data" EntitySet used by datetime-based tests.

    It holds a 'transactions' entity (time index ``transaction_time``) and a
    'cards' entity, related through ``cards.id`` -> ``transactions.card_id``,
    with last time indexes added.

    Returns:
        The populated EntitySet.
    """
    cards_df = pd.DataFrame({"id": [1, 2, 3, 4, 5]})
    transaction_times = pd.to_datetime([
        '2011-2-28 04:00',
        '2012-2-28 05:00',
        '2012-2-29 06:00',
        '2012-3-1 08:00',
        '2014-4-1 10:00',
    ])
    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5],
        "card_id": [1, 1, 5, 1, 5],
        "transaction_time": transaction_times,
        "fraud": [True, False, False, False, True],
    })

    es = EntitySet(id="fraud_data")
    es = es.entity_from_dataframe(entity_id="transactions",
                                  dataframe=transactions_df,
                                  index="id",
                                  time_index="transaction_time")
    es = es.entity_from_dataframe(entity_id="cards",
                                  dataframe=cards_df,
                                  index="id")

    parent_variable = es["cards"]["id"]
    child_variable = es["transactions"]["card_id"]
    es = es.add_relationship(Relationship(parent_variable, child_variable))
    es.add_last_time_indexes()
    return es
def test_custom_variable_descriptions():
    """Variables, including a custom subclass, survive a description round trip.

    Each variable of the entity is serialized with ``to_data_description`` and
    rebuilt with ``deserialize.description_to_variable``; the rebuilt variable
    must compare equal to the original.
    """

    class ItemList(Categorical):
        type_string = "item_list"
        _default_pandas_dtype = list

    es = EntitySet()
    variables = {
        'item_list': ItemList,
        'time_index': TimeIndex,
        'index': Index
    }
    # An empty frame is enough: only the column names and types matter here.
    dataframe = pd.DataFrame(columns=list(variables))
    es.entity_from_dataframe('custom_variable',
                             dataframe,
                             index='index',
                             time_index='time_index',
                             variable_types=variables)
    entity = es['custom_variable']
    for variable in entity.variables:
        description = variable.to_data_description()
        _variable = deserialize.description_to_variable(description, entity=entity)
        # Use ``==`` rather than calling ``__eq__`` directly: a direct call may
        # return ``NotImplemented``, which is truthy and would make the
        # assertion pass without any comparison having taken place.
        assert variable == _variable
def test_sets_time_when_adding_entity():
    """``time_type`` is fixed by the first time index and then enforced.

    A numeric time index sets the EntitySet's ``time_type``; adding entities
    with a datetime or an all-string time index afterwards raises TypeError.
    """
    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5, 6],
        "card_id": [1, 2, 1, 3, 4, 5],
        "transaction_time": [10, 12, 13, 20, 21, 20],
        "fraud": [True, False, False, False, True, True],
    })
    signup_dates = [datetime(2002, 5, 1), datetime(2006, 3, 20), datetime(2011, 11, 11)]
    accounts_df = pd.DataFrame({"id": [3, 4, 5], "signup_date": signup_dates})
    accounts_df_string = pd.DataFrame({
        "id": [3, 4, 5],
        "signup_date": ["element", "exporting", "editable"],
    })

    # A fresh EntitySet has no time type yet.
    fraud_es = EntitySet("fraud")
    assert getattr(fraud_es, "time_type", None) is None

    # The first entity with a time index sets it.
    fraud_es.entity_from_dataframe("transactions",
                                   transactions_df,
                                   index="id",
                                   time_index="transaction_time")
    assert fraud_es.time_type == variable_types.NumericTimeIndex

    # Normalizing out another entity leaves it unchanged.
    fraud_es.normalize_entity("transactions", "cards", "card_id", make_time_index=True)
    assert fraud_es.time_type == variable_types.NumericTimeIndex

    # A datetime time index conflicts with the numeric one already in place.
    mismatch_text = "accounts time index is <class 'featuretools.variable_types.variable.DatetimeTimeIndex'> type which differs from other entityset time indexes"
    with pytest.raises(TypeError, match=mismatch_text):
        fraud_es.entity_from_dataframe("accounts",
                                       accounts_df,
                                       index="id",
                                       time_index="signup_date")

    # A column of plain strings cannot serve as the time index.
    non_time_text = "Attempted to convert all string column signup_date to numeric"
    with pytest.raises(TypeError, match=non_time_text):
        fraud_es.entity_from_dataframe("accounts",
                                       accounts_df_string,
                                       index="id",
                                       time_index="signup_date")
def test_sets_time_when_adding_entity(self):
    """The first time index determines ``time_type``; mismatches raise TypeError."""
    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5, 6],
        "card_id": [1, 2, 1, 3, 4, 5],
        "transaction_time": [10, 12, 13, 20, 21, 20],
        "fraud": [True, False, False, False, True, True],
    })
    account_ids = [3, 4, 5]
    accounts_df = pd.DataFrame({
        "id": account_ids,
        "signup_date": [datetime(2002, 5, 1),
                        datetime(2006, 3, 20),
                        datetime(2011, 11, 11)],
    })
    accounts_df_string = pd.DataFrame({
        "id": [3, 4, 5],
        "signup_date": ["element", "exporting", "editable"],
    })

    # Nothing is set on an empty EntitySet.
    es = EntitySet("fraud")
    assert getattr(es, "time_type", None) is None

    # Adding an entity with a numeric time index sets the time type.
    es.entity_from_dataframe("transactions",
                             transactions_df,
                             index="id",
                             time_index="transaction_time")
    assert es.time_type == variable_types.NumericTimeIndex

    # A normalized entity does not change it.
    es.normalize_entity("transactions", "cards", "card_id", make_time_index=True)
    assert es.time_type == variable_types.NumericTimeIndex

    # First a datetime time index, then an all-string one: both are rejected.
    for bad_accounts in (accounts_df, accounts_df_string):
        with pytest.raises(TypeError):
            es.entity_from_dataframe("accounts",
                                     bad_accounts,
                                     index="id",
                                     time_index="signup_date")
def test_sets_time_when_adding_entity():
    """Check that ``time_type`` follows the first time index and rejects others."""
    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5, 6],
        "card_id": [1, 2, 1, 3, 4, 5],
        "transaction_time": [10, 12, 13, 20, 21, 20],
        "fraud": [True, False, False, False, True, True],
    })
    accounts_df = pd.DataFrame({
        "id": [3, 4, 5],
        "signup_date": [
            datetime(2002, 5, 1),
            datetime(2006, 3, 20),
            datetime(2011, 11, 11),
        ],
    })
    accounts_df_string = pd.DataFrame({
        "id": [3, 4, 5],
        "signup_date": ["element", "exporting", "editable"],
    })
    # Shared arguments for both attempts to add an 'accounts' entity.
    accounts_kwargs = {"index": "id", "time_index": "signup_date"}

    # Empty EntitySet: no time type.
    es = EntitySet("fraud")
    assert getattr(es, "time_type", None) is None

    # Numeric time index on the first entity sets the time type.
    es.entity_from_dataframe("transactions",
                             transactions_df,
                             index="id",
                             time_index="transaction_time")
    assert es.time_type == variable_types.NumericTimeIndex

    # Still numeric after normalizing out 'cards'.
    es.normalize_entity("transactions", "cards", "card_id", make_time_index=True)
    assert es.time_type == variable_types.NumericTimeIndex

    # Datetime time index: wrong time type for this EntitySet.
    error_text = "accounts time index is <class 'featuretools.variable_types.variable.DatetimeTimeIndex'> type which differs from other entityset time indexes"
    with pytest.raises(TypeError, match=error_text):
        es.entity_from_dataframe("accounts", accounts_df, **accounts_kwargs)

    # All-string column: not a time type at all.
    error_text = "Attempted to convert all string column signup_date to numeric"
    with pytest.raises(TypeError, match=error_text):
        es.entity_from_dataframe("accounts", accounts_df_string, **accounts_kwargs)
def test_sets_time_when_adding_entity(self):
    """``time_type`` is unset at first, numeric after loading, and then enforced."""
    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5, 6],
        "card_id": [1, 2, 1, 3, 4, 5],
        "transaction_time": [10, 12, 13, 20, 21, 20],
        "fraud": [True, False, False, False, True, True],
    })
    signup_ymd = [(2002, 5, 1), (2006, 3, 20), (2011, 11, 11)]
    accounts_df = pd.DataFrame({
        "id": [3, 4, 5],
        "signup_date": [datetime(*ymd) for ymd in signup_ymd],
    })
    accounts_df_string = pd.DataFrame({
        "id": [3, 4, 5],
        "signup_date": ["element", "exporting", "editable"],
    })

    fraud_es = EntitySet("fraud")
    # Not set before any entity is added.
    assert getattr(fraud_es, "time_type", None) is None

    fraud_es.entity_from_dataframe("transactions",
                                   transactions_df,
                                   index="id",
                                   time_index="transaction_time")
    # Set by the numeric time index of the first entity.
    assert fraud_es.time_type == variable_types.NumericTimeIndex

    fraud_es.normalize_entity("transactions", "cards", "card_id",
                              make_time_index=True)
    # Unchanged by the normalized entity.
    assert fraud_es.time_type == variable_types.NumericTimeIndex

    # Entity whose time index has the wrong (datetime) type.
    with pytest.raises(TypeError):
        fraud_es.entity_from_dataframe("accounts",
                                       accounts_df,
                                       index="id",
                                       time_index="signup_date")

    # Entity whose time index column holds non-time strings.
    with pytest.raises(TypeError):
        fraud_es.entity_from_dataframe("accounts",
                                       accounts_df_string,
                                       index="id",
                                       time_index="signup_date")
def test_single_table_ks_entityset_cutoff_time_df():
    """DFS with a cutoff-time dataframe gives the same matrix for koalas and pandas."""
    primitives_list = ['absolute', 'is_weekend', 'year', 'day',
                       'num_characters', 'num_words']
    ks_es = EntitySet(id="ks_es")
    date_strs = ('2019-01-10', '2019-02-03', '2019-01-01')
    df = pd.DataFrame({
        "id": [0, 1, 2],
        "values": [1, 12, -34],
        "dates": [pd.to_datetime(day) for day in date_strs],
        "strings": ["I am a string", "23", "abcdef ghijk"],
    })
    koalas_df = ks.from_pandas(df)
    vtypes = {
        "id": ft.variable_types.Id,
        "values": ft.variable_types.Numeric,
        "dates": ft.variable_types.Datetime,
        "strings": ft.variable_types.NaturalLanguage,
    }
    ks_es.entity_from_dataframe(entity_id="data",
                                dataframe=koalas_df,
                                index="id",
                                time_index="dates",
                                variable_types=vtypes)

    # Instance id 0 appears twice, with two different cutoff times.
    early_cutoff = pd.Timestamp("2019-01-05 04:00")
    late_cutoff = pd.Timestamp("2019-01-15 04:00")
    cutoff_times = pd.DataFrame(
        {
            "id": [0, 1, 2, 0],
            "time": [early_cutoff, early_cutoff, early_cutoff, late_cutoff],
            "labels": [True, False, True, False],
        },
        columns=["id", "time", "labels"],
    )

    ks_fm, _ = ft.dfs(entityset=ks_es,
                      target_entity="data",
                      trans_primitives=primitives_list,
                      cutoff_time=cutoff_times)

    pd_es = ft.EntitySet(id="pd_es")
    pd_es.entity_from_dataframe(
        entity_id="data",
        dataframe=df,
        index="id",
        time_index="dates",
        variable_types={"strings": ft.variable_types.NaturalLanguage},
    )
    fm, _ = ft.dfs(entityset=pd_es,
                   target_entity="data",
                   trans_primitives=primitives_list,
                   cutoff_time=cutoff_times)

    # Row ordering with koalas is not guaranteed, so sort on two columns to
    # make sure the values for instance id 0 are compared correctly. Also
    # make sure the boolean column has the same dtype in both frames.
    sort_keys = ['id', 'labels']
    weekend_col = 'IS_WEEKEND(dates)'
    fm = fm.sort_values(sort_keys)
    ks_fm = ks_fm.to_pandas().set_index('id').sort_values(sort_keys)
    ks_fm[weekend_col] = ks_fm[weekend_col].astype(fm[weekend_col].dtype)
    pd.testing.assert_frame_equal(fm, ks_fm)
def test_add_last_time_indexes():
    """``add_last_time_indexes`` gives the same 'sessions' result for pandas and dask.

    Two EntitySets are built from the same sessions/transactions data, one on
    pandas and one on dask dataframes. ``last_time_index`` must be None before
    the call and equal across both backends after it.
    """
    pd_es = EntitySet(id="pd_es")
    dask_es = EntitySet(id="dask_es")

    session_days = ('2019-01-10', '2019-02-03', '2019-01-01', '2017-08-25')
    sessions = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "user": [1, 2, 1, 3],
        "time": [pd.to_datetime(day) for day in session_days],
        "strings": ["I am a string", "23", "abcdef ghijk", ""],
    })
    sessions_dask = dd.from_pandas(sessions, npartitions=2)
    sessions_vtypes = {
        "id": ft.variable_types.Id,
        "user": ft.variable_types.Id,
        "time": ft.variable_types.DatetimeTimeIndex,
        "strings": ft.variable_types.NaturalLanguage,
    }

    transaction_stamps = (
        '2019-01-10 03:53',
        '2019-01-10 04:12',
        '2019-02-03 10:34',
        '2019-01-01 12:35',
        '2019-01-01 12:49',
        '2017-08-25 04:53',
    )
    transactions = pd.DataFrame({
        "id": [0, 1, 2, 3, 4, 5],
        "session_id": [0, 0, 1, 2, 2, 3],
        "amount": [1.23, 5.24, 123.52, 67.93, 40.34, 50.13],
        "time": [pd.to_datetime(stamp) for stamp in transaction_stamps],
    })
    transactions_dask = dd.from_pandas(transactions, npartitions=2)
    transactions_vtypes = {
        "id": ft.variable_types.Id,
        "session_id": ft.variable_types.Id,
        "amount": ft.variable_types.Numeric,
        "time": ft.variable_types.DatetimeTimeIndex,
    }

    # Only the dask entities are given explicit variable types.
    pd_es.entity_from_dataframe(entity_id="sessions",
                                dataframe=sessions,
                                index="id",
                                time_index="time")
    dask_es.entity_from_dataframe(entity_id="sessions",
                                  dataframe=sessions_dask,
                                  index="id",
                                  time_index="time",
                                  variable_types=sessions_vtypes)
    pd_es.entity_from_dataframe(entity_id="transactions",
                                dataframe=transactions,
                                index="id",
                                time_index="time")
    dask_es.entity_from_dataframe(entity_id="transactions",
                                  dataframe=transactions_dask,
                                  index="id",
                                  time_index="time",
                                  variable_types=transactions_vtypes)

    pandas_rel = Relationship(pd_es["sessions"]["id"],
                              pd_es["transactions"]["session_id"])
    dask_rel = Relationship(dask_es["sessions"]["id"],
                            dask_es["transactions"]["session_id"])
    pd_es = pd_es.add_relationship(pandas_rel)
    dask_es = dask_es.add_relationship(dask_rel)

    # No last time index exists until it is explicitly requested.
    assert pd_es['sessions'].last_time_index is None
    assert dask_es['sessions'].last_time_index is None

    pd_es.add_last_time_indexes()
    dask_es.add_last_time_indexes()

    expected_lti = pd_es['sessions'].last_time_index.sort_index()
    computed_lti = dask_es['sessions'].last_time_index.compute()
    pd.testing.assert_series_equal(expected_lti, computed_lti, check_names=False)