def test_accepts_cutoff_time_df(dataframes, relationships):
    """Run dfs with a cutoff-time DataFrame and check the matrix shape.

    The resulting feature matrix must have three rows and one column per
    returned feature definition.
    """
    cutoffs = pd.DataFrame({"instance_id": [1, 2, 3], "time": [10, 12, 15]})
    dfs_kwargs = dict(
        dataframes=dataframes,
        relationships=relationships,
        target_dataframe_name="transactions",
        cutoff_time=cutoffs,
    )
    matrix, feature_defs = dfs(**dfs_kwargs)
    matrix = to_pandas(matrix, index="id", sort_index=True)

    assert len(matrix.index) == 3
    assert len(matrix.columns) == len(feature_defs)
def test_replace_inf_values(divide_by_zero_es):
    """Check that replace_inf_values strips +/-inf from a dfs feature matrix.

    For each division primitive, the raw feature matrix must contain an
    infinite value. After ``replace_inf_values`` neither ``np.inf`` nor
    ``-np.inf`` may remain, both with the default replacement and with a
    custom ``replacement_value`` (which must then appear in the output).
    """
    div_by_scalar = DivideNumericScalar(value=0)
    div_by_feature = DivideByFeature(value=1)
    div_by_feature_neg = DivideByFeature(value=-1)
    primitives = [
        'divide_numeric',
        div_by_scalar,
        div_by_feature,
        div_by_feature_neg,
    ]
    for primitive in primitives:
        fm, _ = ft.dfs(entityset=divide_by_zero_es,
                       target_dataframe_name='zero',
                       trans_primitives=[primitive])
        raw_values = to_pandas(fm).values
        assert np.inf in raw_values or -np.inf in raw_values

        replaced_fm = replace_inf_values(fm)
        replaced_fm = to_pandas(replaced_fm)
        assert np.inf not in replaced_fm.values
        assert -np.inf not in replaced_fm.values

        custom_value_fm = replace_inf_values(fm, replacement_value='custom_val')
        custom_value_fm = to_pandas(custom_value_fm)
        assert np.inf not in custom_value_fm.values
        # Previously this re-checked replaced_fm, leaving -inf in the
        # custom-value result unverified.
        assert -np.inf not in custom_value_fm.values
        assert 'custom_val' in custom_value_fm.values
def test_accepts_cutoff_time_compose(entities, relationships):
    """Feed labels produced by a compose LabelMaker to dfs as cutoff times.

    The feature matrix must have six rows and one more column than there
    are feature definitions.
    """
    def fraud_occured(df):
        return df['fraud'].any()

    label_maker = cp.LabelMaker(
        target_entity='card_id',
        time_index='transaction_time',
        labeling_function=fraud_occured,
        window_size=1,
    )
    transactions = to_pandas(entities['transactions'][0])
    labels = label_maker.search(transactions, num_examples_per_instance=-1)

    # Convert the label times to numeric and rename the instance column
    # to 'id' before handing the labels to dfs.
    labels['time'] = pd.to_numeric(labels['time'])
    labels.rename({'card_id': 'id'}, axis=1, inplace=True)

    matrix, feature_defs = dfs(
        entities=entities,
        relationships=relationships,
        target_entity="cards",
        cutoff_time=labels,
    )
    matrix = to_pandas(matrix, index='id')

    assert len(matrix.index) == 6
    assert len(matrix.columns) == len(feature_defs) + 1
def test_with_features_built_from_es_metadata(es):
    """Build a Count feature from ``es.metadata`` and compute it on ``es``.

    The value calculated for instance 0 must equal 10.
    """
    es_metadata = es.metadata
    log_count = ft.Feature(
        es_metadata['log']['id'],
        parent_entity=es_metadata['customers'],
        primitive=Count,
    )
    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([log_count]),
    )
    result = to_pandas(calculator.run(np.array([0])), index='id')
    first_value = result[log_count.get_name()].values[0]
    assert first_value == 10
def test_ignores_instance_ids_if_cutoff_df(entities, relationships):
    """Pass both a cutoff-time DataFrame and ``instance_ids`` to dfs.

    Five instance ids are supplied but the cutoff DataFrame only has
    three rows; the feature matrix is expected to have three rows.
    """
    cutoffs = pd.DataFrame({"instance_id": [1, 2, 3], "time": [10, 12, 15]})
    requested_ids = [1, 2, 3, 4, 5]

    matrix, feature_defs = dfs(
        entities=entities,
        relationships=relationships,
        target_entity="transactions",
        cutoff_time=cutoffs,
        instance_ids=requested_ids,
    )
    matrix = to_pandas(matrix, index='id')

    assert len(matrix.index) == 3
    assert len(matrix.columns) == len(feature_defs)
def lt(es):
    """Return compose label times for the log data, keyed by a 'time' column.

    Labels are generated with a LabelMaker over the pandas version of
    ``es['log'].df``; the 'cutoff_time' column is renamed to 'time'.
    """
    def label_func(df):
        return df['value'].sum() > 10

    label_maker = cp.LabelMaker(
        target_entity='id',
        time_index='datetime',
        labeling_function=label_func,
        window_size='1m',
    )
    log_df = to_pandas(es['log'].df)
    label_times = label_maker.search(log_df, num_examples_per_instance=-1)
    return label_times.rename(columns={'cutoff_time': 'time'})
def test_parent(self, values_es, true_values_lti):
    """Compare the 'values' entity's last time index with the expected one.

    Marked xfail unless every entity in ``values_es`` is backed by a
    pandas DataFrame.
    """
    # test entity with time index and all instances in child entity
    has_non_pandas_entity = any(
        not isinstance(entity.df, pd.DataFrame)
        for entity in values_es.entities
    )
    if has_non_pandas_entity:
        pytest.xfail(
            'possible issue with either normalize_entity or add_last_time_indexes'
        )

    values_es.add_last_time_indexes()
    values_entity = values_es['values']
    assert len(values_entity.last_time_index) == 11

    ordered_lti = to_pandas(values_entity.last_time_index).sort_index()
    for actual, expected in zip(ordered_lti, true_values_lti):
        # Two null values count as a match.
        both_null = pd.isnull(actual) and pd.isnull(expected)
        assert both_null or actual == expected
def test_override_cmp_from_column(es):
    """Check ``>`` applied to a feature built directly from a column.

    The comparison feature is calculated for instances 0, 1 and 2 and
    compared position by position with the expected flags.
    """
    count_lo = ft.Feature(es['log'].ww['value']) > 1
    expected_flags = [False, True, True]

    matrix = ft.calculate_feature_matrix(
        entityset=es,
        features=[count_lo],
        instance_ids=[0, 1, 2],
    )
    matrix = to_pandas(matrix, index='id', sort_index=True)
    column_values = matrix[count_lo.get_name()].tolist()

    for position, expected in enumerate(expected_flags):
        assert column_values[position] == expected
def extra_session_df(es):
    """Return the sessions data with one extra session row (id 6) added.

    The sessions dataframe is converted to pandas, the new row is added
    and the frame is sorted by index. If ``es['sessions'].df`` is a Dask
    or Koalas DataFrame, the result is converted back to that library.
    """
    row_values = {
        'customer_id': 2,
        'device_name': 'PC',
        'device_type': 0,
        'id': 6
    }
    row = pd.DataFrame(row_values, index=pd.Index([6], name='id'))
    df = to_pandas(es['sessions'].df)
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat with the same sort flag is the supported equivalent.
    df = pd.concat([df, row], sort=True).sort_index()
    if isinstance(es['sessions'].df, dd.DataFrame):
        df = dd.from_pandas(df, npartitions=3)
    if ks and isinstance(es['sessions'].df, ks.DataFrame):
        df = ks.from_pandas(df)
    return df
def test_eq(es):
    """Exercise deep equality between entities.

    The 'log' entity must equal itself and the 'log' entity of a freshly
    built entityset. It must stop being equal once interesting values are
    added to the other one. Afterwards 'log' is mutated step by step and,
    after each step, must not compare equal to 'customers'.
    """
    twin_es = make_ecommerce_entityset()
    latlong_copy = es['log'].df['latlong'].copy()

    assert es['log'].__eq__(es['log'], deep=True)
    assert es['log'].__eq__(twin_es['log'], deep=True)
    latlong_matches = to_pandas(es['log'].df['latlong']).eq(
        to_pandas(latlong_copy))
    assert all(latlong_matches)

    twin_es['log'].add_interesting_values()
    assert not es['log'].__eq__(twin_es['log'], deep=True)

    def customers_equals_log():
        # Look both entities up again on every call.
        return es['customers'].__eq__(es['log'], deep=True)

    es['log'].id = 'customers'
    es['log'].index = 'notid'
    assert not customers_equals_log()

    es['log'].index = 'id'
    assert not customers_equals_log()

    es['log'].time_index = 'signup_date'
    assert not customers_equals_log()

    es['log'].secondary_time_index = {
        'cancel_date': ['cancel_reason', 'cancel_date'],
    }
    assert not customers_equals_log()
def test_make_agg_feat_where_count(es):
    """Count log ids per session, restricted to product_id 'coke zero'.

    The value calculated for instance 0 must equal 3.
    """
    is_coke_zero = IdentityFeature(es['log']['product_id']) == 'coke zero'
    filtered_count = ft.Feature(
        es['log']['id'],
        parent_entity=es['sessions'],
        where=is_coke_zero,
        primitive=Count,
    )
    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([filtered_count]),
    )
    result = to_pandas(calculator.run(np.array([0])))
    observed = result[filtered_count.get_name()][0]
    assert observed == 3
def test_use_previous_pd_dateoffset(es):
    """Use a ``pd.DateOffset`` as ``use_previous`` for a Count feature.

    The first column of the feature matrix for instances 0, 1 and 2 at
    the given cutoff time must equal [1, 5, 2].
    """
    window = pd.DateOffset(hours=47, minutes=60)
    total_events_pd = ft.Feature(
        es["log"]["id"],
        parent_entity=es["customers"],
        use_previous=window,
        primitive=Count,
    )
    cutoff = pd.Timestamp('2011-04-11 10:31:30')

    matrix = ft.calculate_feature_matrix(
        [total_events_pd],
        es,
        cutoff_time=cutoff,
        instance_ids=[0, 1, 2],
    )
    matrix = to_pandas(matrix, index='id', sort_index=True)

    column_names = list(matrix.head().keys())
    first_column = column_names[0]
    assert (matrix[first_column] == [1, 5, 2]).all()
def test_parent_no_time_index_missing(self, es, extra_session_df, true_sessions_lti):
    """Last time index of sessions after adding a session with no log rows.

    The expected value for the added session (id 6) is NaT.
    """
    # test entity without time index and not all instance have children
    sessions_entity = es['sessions']

    # add session instance with no associated log instances
    sessions_entity.update_data(extra_session_df)
    es.add_last_time_indexes()

    # since sessions has no time index, default value is NaT
    true_sessions_lti[6] = pd.NaT

    assert len(sessions_entity.last_time_index) == 7
    ordered_lti = to_pandas(sessions_entity.last_time_index).sort_index()
    for actual, expected in zip(ordered_lti, true_sessions_lti):
        both_null = pd.isnull(actual) and pd.isnull(expected)
        assert both_null or actual == expected
def test_make_agg_feat_using_prev_time(es):
    """Count log ids per session with a 10 second ``use_previous`` window.

    The count for instance 0 is checked at two different ``time_last``
    values: it must be 2 at 10:30:10 and 1 at 10:30:30.
    """
    agg_feat = ft.Feature(
        es['log']['id'],
        parent_entity=es['sessions'],
        use_previous=Timedelta(10, 's'),
        primitive=Count,
    )
    feature_set = FeatureSet([agg_feat])

    cases = (
        (datetime(2011, 4, 9, 10, 30, 10), 2),
        (datetime(2011, 4, 9, 10, 30, 30), 1),
    )
    for time_last, expected_count in cases:
        calculator = FeatureSetCalculator(
            es, time_last=time_last, feature_set=feature_set)
        result = to_pandas(calculator.run(np.array([0])))
        observed = result[agg_feat.get_name()][0]
        assert observed == expected_count
def test_with_features_built_from_es_metadata(es):
    """Build a Count feature from ``es.metadata`` and compute it on ``es``.

    The value calculated for instance 0 must equal 10.
    """
    es_metadata = es.metadata
    log_count = ft.Feature(
        es_metadata["log"].ww["id"],
        parent_dataframe_name="customers",
        primitive=Count,
    )
    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([log_count]),
    )
    result = to_pandas(calculator.run(np.array([0])), index="id")
    first_value = result[log_count.get_name()].values[0]
    assert first_value == 10
def test_multiple_children_left_missing(
    self, es, extra_session_df, wishlist_df, true_sessions_lti
):
    """Last time index of sessions when a second child covers every session.

    Sessions gets an extra row that has no log instances. A wishlist_log
    dataframe with a row for that extra session is added as a second
    child. The sessions last time index is then compared with the
    adjusted expected values. Marked xfail for Spark entitysets.
    """
    if es.dataframe_type == Library.SPARK.value:
        pytest.xfail("Cannot make index on a Spark DataFrame")

    # add row to sessions so not all session instances are in log
    es.replace_dataframe(dataframe_name="sessions", df=extra_session_df)

    # add row to wishlist df so new session instance is in wishlist_log
    row_values = {
        "session_id": 6,
        "datetime": pd.Timestamp("2011-04-11 11:11:11"),
        "product_id": "toothpaste",
    }
    row = pd.DataFrame(row_values, index=pd.RangeIndex(start=7, stop=8))
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the supported equivalent.
    df = pd.concat([wishlist_df, row])
    if es.dataframe_type == Library.DASK.value:
        df = dd.from_pandas(df, npartitions=2)

    logical_types = {
        "session_id": Integer,
        "datetime": Datetime,
        "product_id": Categorical,
    }
    es.add_dataframe(
        dataframe_name="wishlist_log",
        dataframe=df,
        index="id",
        make_index=True,
        time_index="datetime",
        logical_types=logical_types,
    )
    es.add_relationship("sessions", "id", "wishlist_log", "session_id")
    es.add_last_time_indexes()

    # test all instances in right child
    sessions = es["sessions"]

    # now wishlist_log has newer events for 3 session ids
    true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30")
    true_sessions_lti[3] = pd.Timestamp("2011-4-10 10:41:00")
    true_sessions_lti[6] = pd.Timestamp("2011-04-11 11:11:11")

    lti_name = sessions.ww.metadata.get("last_time_index")
    assert len(sessions[lti_name]) == 7
    sorted_lti = to_pandas(sessions[lti_name]).sort_index()
    for v1, v2 in zip(sorted_lti, true_sessions_lti):
        assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2
def test_multiple_children_all_combined(self, es, extra_session_df,
                                        wishlist_df, true_sessions_lti):
    """Last time index of sessions when two children only cover it together.

    Sessions gets an extra row that has no log instances. A wishlist_log
    entity is added with a row for that extra session and with instance 4
    dropped. The sessions last time index is then compared with the
    adjusted expected values.
    """
    # test some instances in right, some in left, all when combined
    sessions = es['sessions']

    # add row to sessions so not all session instances are in log
    sessions.update_data(extra_session_df)

    # add row to wishlist_log so extra session has child instance
    row_values = {
        'session_id': 6,
        'datetime': pd.Timestamp("2011-04-11 11:11:11"),
        'product_id': 'toothpaste'
    }
    row = pd.DataFrame(row_values, index=pd.RangeIndex(start=7, stop=8))
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the supported equivalent.
    df = pd.concat([wishlist_df, row])

    # drop instance 4 so wishlist_log does not have session id 3 instance
    df.drop(4, inplace=True)
    if isinstance(es.entities[0].df, dd.DataFrame):
        df = dd.from_pandas(df, npartitions=2)
    if ks and isinstance(es.entities[0].df, ks.DataFrame):
        df = ks.from_pandas(df)

    variable_types = {
        'id': ft.variable_types.variable.Index,
        'session_id': ft.variable_types.variable.Numeric,
        'datetime': ft.variable_types.variable.DatetimeTimeIndex,
        'product_id': ft.variable_types.variable.Categorical
    }
    es.entity_from_dataframe(entity_id="wishlist_log",
                             dataframe=df,
                             index='id',
                             make_index=True,
                             time_index='datetime',
                             variable_types=variable_types)
    relationship = Relationship(es['sessions']['id'],
                                es['wishlist_log']['session_id'])
    es.add_relationship(relationship)
    es.add_last_time_indexes()

    # wishlist has newer events for 2 sessions
    true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30")
    true_sessions_lti[6] = pd.Timestamp("2011-04-11 11:11:11")

    assert len(sessions.last_time_index) == 7
    sorted_lti = to_pandas(sessions.last_time_index).sort_index()
    for v1, v2 in zip(sorted_lti, true_sessions_lti):
        assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2
def test_override_cmp(es):
    """Check the overloaded comparison operators on features.

    Each of ``>``, ``>=``, ``<``, ``<=`` and ``!=`` is applied between a
    Count feature and both a scalar and a Sum feature. The resulting
    features are calculated for instances 0, 1 and 2 and compared with
    the expected boolean rows.
    """
    count = ft.Feature(es["log"].ww["id"],
                       parent_dataframe_name="sessions",
                       primitive=Count)
    _sum = ft.Feature(es["log"].ww["value"],
                      parent_dataframe_name="sessions",
                      primitive=Sum)
    gt_lo = count > 1
    gt_other = count > _sum
    ge_lo = count >= 1
    ge_other = count >= _sum
    lt_hi = count < 10
    lt_other = count < _sum
    le_hi = count <= 10
    le_other = count <= _sum
    ne_lo = count != 1
    ne_other = count != _sum

    features = [
        gt_lo,
        gt_other,
        ge_lo,
        ge_other,
        lt_hi,
        lt_other,
        le_hi,
        le_other,
        ne_lo,
        ne_other,
    ]
    # One expected row per feature, in the same order as `features`.
    # The original list stopped after le_other, so the two `!=` features
    # were computed but never checked.
    to_test = [
        [True, True, False],   # gt_lo
        [False, False, True],  # gt_other
        [True, True, True],    # ge_lo
        [False, False, True],  # ge_other
        [True, True, True],    # lt_hi
        [True, True, False],   # lt_other
        [True, True, True],    # le_hi
        [True, True, False],   # le_other
        # ne_lo: count >= 1 holds everywhere (ge_lo), so count != 1 is
        # true exactly where count > 1 is true (gt_lo).
        [True, True, False],
        # ne_other: every instance is strictly greater (gt_other) or
        # strictly less (lt_other) than the sum, so none are equal.
        [True, True, True],
    ]
    # Guard against the two lists drifting apart again.
    assert len(to_test) == len(features)

    df = ft.calculate_feature_matrix(entityset=es,
                                     features=features,
                                     instance_ids=[0, 1, 2])
    df = to_pandas(df, index="id", sort_index=True)
    for feature, test in zip(features, to_test):
        v = df[feature.get_name()].tolist()
        assert v == test
def test_direct_from_identity(es):
    """Calculate a DirectFeature of sessions.device_type on the log dataframe.

    Values for instances 0 and 5 are compared with [0, 1], or with their
    string forms when the entityset is Koalas-backed.
    """
    device = Feature(es['sessions'].ww['device_type'])
    direct = DirectFeature(base_feature=device, child_dataframe_name='log')
    calculator = FeatureSetCalculator(
        es,
        feature_set=FeatureSet([direct]),
        time_last=None,
    )
    result = to_pandas(
        calculator.run(np.array([0, 5])),
        index='id',
        sort_index=True,
    )
    observed = result[direct.get_name()].tolist()

    is_koalas = es.dataframe_type == Library.KOALAS.value
    expected = ['0', '1'] if is_koalas else [0, 1]
    assert observed == expected
def test_make_agg_feat_where_count(es):
    """Count log ids per session, restricted to product_id 'coke zero'.

    The value calculated for instance 0 must equal 3.
    """
    is_coke_zero = IdentityFeature(es["log"].ww["product_id"]) == "coke zero"
    filtered_count = ft.Feature(
        es["log"].ww["id"],
        parent_dataframe_name="sessions",
        where=is_coke_zero,
        primitive=Count,
    )
    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([filtered_count]),
    )
    result = to_pandas(calculator.run(np.array([0])))
    observed = result[filtered_count.get_name()][0]
    assert observed == 3
def test_equal_categorical(simple_es):
    """Apply Equal to two columns whose category sets differ.

    The category-set check is skipped for Koalas entitysets. The
    calculated 'value = value2' column must be [True, False, False, True].
    """
    base_features = [
        ft.IdentityFeature(simple_es['values'].ww['value']),
        ft.IdentityFeature(simple_es['values'].ww['value2']),
    ]
    f1 = ft.Feature(base_features, primitive=Equal)

    df = ft.calculate_feature_matrix(entityset=simple_es, features=[f1])

    if simple_es.dataframe_type != Library.KOALAS.value:
        # Koalas does not support categorical dtype
        value_categories = set(simple_es['values']['value'].cat.categories)
        value2_categories = set(simple_es['values']['value2'].cat.categories)
        assert value_categories != value2_categories

    result = to_pandas(df, index='id', sort_index=True)
    assert result['value = value2'].to_list() == [True, False, False, True]
def test_direct_from_identity(es):
    """Calculate a DirectFeature of sessions.device_type on the log dataframe.

    Values for instances 0 and 5 are compared with [0, 1], or with their
    string forms when the entityset is Spark-backed.
    """
    device_type = Feature(es["sessions"].ww["device_type"])
    direct = DirectFeature(base_feature=device_type, child_dataframe_name="log")
    features = FeatureSet([direct])

    calculator = FeatureSetCalculator(es, feature_set=features, time_last=None)
    raw_result = calculator.run(np.array([0, 5]))
    result = to_pandas(raw_result, index="id", sort_index=True)
    observed = result[direct.get_name()].tolist()

    if es.dataframe_type != Library.SPARK.value:
        expected = [0, 1]
    else:
        expected = ["0", "1"]
    assert observed == expected
def lt(es):
    """Return compose label times for the log data, keyed by a 'time' column.

    Labels are generated with a LabelMaker over the pandas version of
    ``es["log"]``; the 'cutoff_time' column is renamed to 'time'.
    """
    def label_func(df):
        return df["value"].sum() > 10

    label_maker = cp.LabelMaker(
        target_dataframe_name="id",
        time_index="datetime",
        labeling_function=label_func,
        window_size="1m",
    )
    log_df = to_pandas(es["log"])
    label_times = label_maker.search(log_df, num_examples_per_instance=-1)
    return label_times.rename(columns={"cutoff_time": "time"})
def test_parent_no_time_index_missing(self, es, extra_session_df, true_sessions_lti):
    """Last time index of sessions after adding a session with no log rows.

    The expected value for the added session (id 6) is NaT.
    """
    # test dataframe without time index and not all instance have children

    # add session instance with no associated log instances
    es.replace_dataframe(dataframe_name='sessions', df=extra_session_df)
    es.add_last_time_indexes()

    # since sessions has no time index, default value is NaT
    true_sessions_lti[6] = pd.NaT

    sessions_df = es['sessions']
    lti_column = sessions_df.ww.metadata.get('last_time_index')
    assert len(sessions_df[lti_column]) == 7

    ordered_lti = to_pandas(sessions_df[lti_column]).sort_index()
    for actual, expected in zip(ordered_lti, true_sessions_lti):
        both_null = pd.isnull(actual) and pd.isnull(expected)
        assert both_null or actual == expected
def test_equal_categorical(simple_es):
    """Apply Equal to two columns whose category sets differ.

    The category-set check only runs when every entity is backed by a
    pandas or Dask DataFrame. The calculated 'value = value2' column must
    be [True, False, False, True].
    """
    compared = [simple_es['values']['value'], simple_es['values']['value2']]
    f1 = ft.Feature(compared, primitive=Equal)

    df = ft.calculate_feature_matrix(entityset=simple_es, features=[f1])

    has_unsupported_entity = any(
        not isinstance(e.df, (pd.DataFrame, dd.DataFrame))
        for e in simple_es.entities
    )
    if not has_unsupported_entity:
        # Koalas does not support categorical dtype
        value_categories = set(simple_es['values'].df['value'].cat.categories)
        value2_categories = set(simple_es['values'].df['value2'].cat.categories)
        assert value_categories != value2_categories

    result = to_pandas(df, index='id', sort_index=True)
    assert result['value = value2'].to_list() == [True, False, False, True]
def test_multiple_children_all_combined(self, es, extra_session_df,
                                        wishlist_df, true_sessions_lti):
    """Last time index of sessions when two children only cover it together.

    Sessions gets an extra row that has no log instances. A wishlist_log
    dataframe is added with a row for that extra session and with
    instance 4 dropped. The sessions last time index is then compared
    with the adjusted expected values. Marked xfail for Koalas.
    """
    if ks and isinstance(es.dataframes[0], ks.DataFrame):
        pytest.xfail('Cannot make index on a Koalas DataFrame')

    # add row to sessions so not all session instances are in log
    es.replace_dataframe(dataframe_name='sessions', df=extra_session_df)

    # add row to wishlist_log so extra session has child instance
    row_values = {
        'session_id': 6,
        'datetime': pd.Timestamp("2011-04-11 11:11:11"),
        'product_id': 'toothpaste'
    }
    row = pd.DataFrame(row_values, index=pd.RangeIndex(start=7, stop=8))
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the supported equivalent.
    df = pd.concat([wishlist_df, row])

    # drop instance 4 so wishlist_log does not have session id 3 instance
    df.drop(4, inplace=True)
    if es.dataframe_type == Library.DASK.value:
        df = dd.from_pandas(df, npartitions=2)

    logical_types = {
        'session_id': Integer,
        'datetime': Datetime,
        'product_id': Categorical
    }
    es.add_dataframe(dataframe_name="wishlist_log",
                     dataframe=df,
                     index='id',
                     make_index=True,
                     time_index='datetime',
                     logical_types=logical_types)
    es.add_relationship('sessions', 'id', 'wishlist_log', 'session_id')
    es.add_last_time_indexes()

    # test some instances in right, some in left, all when combined
    sessions = es['sessions']

    # wishlist has newer events for 2 sessions
    true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30")
    true_sessions_lti[6] = pd.Timestamp("2011-04-11 11:11:11")

    lti_name = sessions.ww.metadata.get('last_time_index')
    assert len(sessions[lti_name]) == 7
    sorted_lti = to_pandas(sessions[lti_name]).sort_index()
    for v1, v2 in zip(sorted_lti, true_sessions_lti):
        assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2
def test_make_agg_feat_of_agg_feat(es):
    """Stack a Sum (per customer) on top of a Count (of log ids per session).

    The value calculated for instance 0 must equal 10.
    """
    logs_per_session = ft.Feature(
        es['log']['id'],
        parent_entity=es['sessions'],
        primitive=Count,
    )
    logs_per_customer = ft.Feature(
        logs_per_session,
        parent_entity=es['customers'],
        primitive=Sum,
    )
    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([logs_per_customer]),
    )
    result = to_pandas(calculator.run(np.array([0])), index='id')
    observed = result[logs_per_customer.get_name()].values[0]
    assert observed == 10
def extra_session_df(es):
    """Return the sessions data with one extra session row (id 6) added.

    The sessions dataframe is converted to pandas, the new row is added
    and the frame is sorted by index. For Dask or Koalas entitysets the
    result is converted back to that library.
    """
    row_values = {
        'customer_id': 2,
        'device_name': 'PC',
        'device_type': 0,
        'id': 6
    }
    row = pd.DataFrame(row_values, index=pd.Index([6], name='id'))
    df = to_pandas(es['sessions'])
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat with the same sort flag is the supported equivalent.
    df = pd.concat([df, row], sort=True).sort_index()
    if es.dataframe_type == Library.DASK.value:
        df = dd.from_pandas(df, npartitions=3)
    elif es.dataframe_type == Library.KOALAS.value:
        # Koalas can't handle object dtypes
        df = df.astype('string')
        df = ks.from_pandas(df)
    return df
def test_make_agg_feat_of_agg_feat(es):
    """Stack a Sum (per customer) on top of a Count (of log ids per session).

    The value calculated for instance 0 must equal 10.
    """
    logs_per_session = ft.Feature(
        es["log"].ww["id"],
        parent_dataframe_name="sessions",
        primitive=Count,
    )
    logs_per_customer = ft.Feature(
        logs_per_session,
        parent_dataframe_name="customers",
        primitive=Sum,
    )
    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([logs_per_customer]),
    )
    result = to_pandas(calculator.run(np.array([0])), index="id")
    observed = result[logs_per_customer.get_name()].values[0]
    assert observed == 10
def test_override_boolean(es):
    """Check OR, AND and ``~`` on boolean features built from a Count.

    The three combined features are calculated for instances 0, 1 and 2
    and each column is compared with its expected boolean row.
    """
    count = ft.Feature(es['log']['id'],
                       parent_entity=es['sessions'],
                       primitive=Count)
    count_lo = ft.Feature(count, primitive=GreaterThanScalar(1))
    count_hi = ft.Feature(count, primitive=LessThanScalar(10))

    features = [
        count_lo.OR(count_hi),
        count_lo.AND(count_hi),
        ~(count_lo.AND(count_hi)),
    ]
    expected_rows = [
        [True, True, True],
        [True, True, False],
        [False, False, True],
    ]

    matrix = ft.calculate_feature_matrix(entityset=es,
                                         features=features,
                                         instance_ids=[0, 1, 2])
    matrix = to_pandas(matrix, index='id', sort_index=True)

    for feature, expected in zip(features, expected_rows):
        observed = matrix[feature.get_name()].values.tolist()
        assert observed == expected