def test_dfs_empty_features():
    error_text = "No features can be generated from the specified primitives. Please make sure the primitives you are using are compatible with the variable types in your data."
    teams = pd.DataFrame({"id": range(3), "name": ["Breakers", "Spirit", "Thorns"]})
    games = pd.DataFrame(
        {
            "id": range(5),
            "home_team_id": [2, 2, 1, 0, 1],
            "away_team_id": [1, 0, 2, 1, 0],
            "home_team_score": [3, 0, 1, 0, 4],
            "away_team_score": [2, 1, 2, 0, 0],
        }
    )
    dataframes = {
        "teams": (teams, "id", None, {"name": "natural_language"}),
        "games": (games, "id"),
    }
    relationships = [("teams", "id", "games", "home_team_id")]

    with patch.object(DeepFeatureSynthesis, "build_features", return_value=[]):
        features = dfs(
            dataframes, relationships, target_dataframe_name="teams", features_only=True
        )
    assert features == []

    with pytest.raises(AssertionError, match=error_text), patch.object(
        DeepFeatureSynthesis, "build_features", return_value=[]
    ):
        dfs(
            dataframes,
            relationships,
            target_dataframe_name="teams",
            features_only=False,
        )


def test_no_warns_with_camel_and_title_case(es):
    for trans_primitive in ["isNull", "IsNull"]:
        # Should not raise an UnusedPrimitiveWarning
        with pytest.warns(None) as record:
            dfs(
                entityset=es,
                target_dataframe_name="customers",
                trans_primitives=[trans_primitive],
                max_depth=1,
                features_only=True,
            )
        assert not record

    for agg_primitive in ["numUnique", "NumUnique"]:
        # Should not raise an UnusedPrimitiveWarning
        with pytest.warns(None) as record:
            dfs(
                entityset=es,
                target_dataframe_name="customers",
                agg_primitives=[agg_primitive],
                max_depth=2,
                features_only=True,
            )
        assert not record


def test_warns_with_unused_primitives(es):
    if es.dataframe_type == Library.KOALAS.value:
        pytest.skip('Koalas throws extra warnings')

    trans_primitives = ['num_characters', 'num_words', 'add_numeric']
    agg_primitives = [Max, 'min']
    warning_text = "Some specified primitives were not used during DFS:\n" + \
        " trans_primitives: ['add_numeric']\n agg_primitives: ['max', 'min']\n" + \
        "This may be caused by a using a value of max_depth that is too small, not setting interesting values, " + \
        "or it may indicate no compatible columns for the primitive were found in the data."

    with pytest.warns(UnusedPrimitiveWarning) as record:
        dfs(entityset=es,
            target_dataframe_name='customers',
            trans_primitives=trans_primitives,
            agg_primitives=agg_primitives,
            max_depth=1,
            features_only=True)

    assert record[0].message.args[0] == warning_text

    # Should not raise a warning
    with pytest.warns(None) as record:
        dfs(entityset=es,
            target_dataframe_name='customers',
            trans_primitives=trans_primitives,
            agg_primitives=agg_primitives,
            max_depth=2,
            features_only=True)

    assert not record


def test_calls_progress_callback(entities, relationships):
    class MockProgressCallback:
        def __init__(self):
            self.total_update = 0
            self.total_progress_percent = 0

        def __call__(self, update, progress_percent, time_elapsed):
            self.total_update += update
            self.total_progress_percent = progress_percent

    mock_progress_callback = MockProgressCallback()

    feature_matrix, features = dfs(entities=entities,
                                   relationships=relationships,
                                   target_entity="transactions",
                                   progress_callback=mock_progress_callback)

    assert np.isclose(mock_progress_callback.total_update, 100.0)
    assert np.isclose(mock_progress_callback.total_progress_percent, 100.0)

    # test with multiple jobs
    mock_progress_callback = MockProgressCallback()

    with cluster() as (scheduler, [a, b]):
        dkwargs = {'cluster': scheduler['address']}
        feature_matrix, features = dfs(entities=entities,
                                       relationships=relationships,
                                       target_entity="transactions",
                                       progress_callback=mock_progress_callback,
                                       dask_kwargs=dkwargs)

    assert np.isclose(mock_progress_callback.total_update, 100.0)
    assert np.isclose(mock_progress_callback.total_progress_percent, 100.0)


def test_calls_progress_callback_cluster(pd_dataframes, relationships, dask_cluster):
    class MockProgressCallback:
        def __init__(self):
            self.progress_history = []
            self.total_update = 0
            self.total_progress_percent = 0

        def __call__(self, update, progress_percent, time_elapsed):
            self.total_update += update
            self.total_progress_percent = progress_percent
            self.progress_history.append(progress_percent)

    mock_progress_callback = MockProgressCallback()

    dkwargs = {"cluster": dask_cluster.scheduler.address}
    dfs(
        dataframes=pd_dataframes,
        relationships=relationships,
        target_dataframe_name="transactions",
        progress_callback=mock_progress_callback,
        dask_kwargs=dkwargs,
    )

    assert np.isclose(mock_progress_callback.total_update, 100.0)
    assert np.isclose(mock_progress_callback.total_progress_percent, 100.0)


def test_calls_progress_callback(dataframes, relationships):
    class MockProgressCallback:
        def __init__(self):
            self.progress_history = []
            self.total_update = 0
            self.total_progress_percent = 0

        def __call__(self, update, progress_percent, time_elapsed):
            self.total_update += update
            self.total_progress_percent = progress_percent
            self.progress_history.append(progress_percent)

    mock_progress_callback = MockProgressCallback()

    dfs(
        dataframes=dataframes,
        relationships=relationships,
        target_dataframe_name="transactions",
        progress_callback=mock_progress_callback,
    )

    # second to last entry is the last update from feature calculation
    assert np.isclose(
        mock_progress_callback.progress_history[-2],
        FEATURE_CALCULATION_PERCENTAGE * 100,
    )
    assert np.isclose(mock_progress_callback.total_update, 100.0)
    assert np.isclose(mock_progress_callback.total_progress_percent, 100.0)


def test_warns_with_unused_groupby_primitives(pd_es):
    warning_text = (
        "Some specified primitives were not used during DFS:\n"
        + " groupby_trans_primitives: ['cum_sum']\n"
        + "This may be caused by a using a value of max_depth that is too small, not setting interesting values, "
        + "or it may indicate no compatible columns for the primitive were found in the data. If the DFS call "
        + "contained multiple instances of a primitive in the list above, none of them were used."
    )

    with pytest.warns(UnusedPrimitiveWarning) as record:
        dfs(
            entityset=pd_es,
            target_dataframe_name="sessions",
            groupby_trans_primitives=["cum_sum"],
            max_depth=1,
            features_only=True,
        )

    assert record[0].message.args[0] == warning_text

    # Should not raise a warning
    with pytest.warns(None) as record:
        dfs(
            entityset=pd_es,
            target_dataframe_name="customers",
            groupby_trans_primitives=["cum_sum"],
            max_depth=1,
            features_only=True,
        )

    assert not record


def test_warns_with_unused_primitives(es):
    if ks and any(isinstance(e.df, ks.DataFrame) for e in es.entities):
        pytest.skip('Koalas throws extra warnings')

    trans_primitives = ['num_characters', 'num_words', 'add_numeric']
    agg_primitives = [Max, 'min']
    warning_text = "Some specified primitives were not used during DFS:\n" + \
        " trans_primitives: ['add_numeric']\n agg_primitives: ['max', 'min']\n" + \
        "This may be caused by a using a value of max_depth that is too small, not setting interesting values, " + \
        "or it may indicate no compatible variable types for the primitive were found in the data."

    with pytest.warns(UnusedPrimitiveWarning) as record:
        dfs(entityset=es,
            target_entity='customers',
            trans_primitives=trans_primitives,
            agg_primitives=agg_primitives,
            max_depth=1)

    assert record[0].message.args[0] == warning_text

    # Should not raise a warning
    with pytest.warns(None) as record:
        dfs(entityset=es,
            target_entity='customers',
            trans_primitives=trans_primitives,
            agg_primitives=agg_primitives,
            max_depth=2)

    assert not record


def test_dask_kwargs(pd_dataframes, relationships, dask_cluster):
    cutoff_times_df = pd.DataFrame({
        "instance_id": [1, 2, 3],
        "time": [10, 12, 15]
    })
    feature_matrix, features = dfs(
        dataframes=pd_dataframes,
        relationships=relationships,
        target_dataframe_name="transactions",
        cutoff_time=cutoff_times_df,
    )

    dask_kwargs = {"cluster": dask_cluster.scheduler.address}
    feature_matrix_2, features_2 = dfs(
        dataframes=pd_dataframes,
        relationships=relationships,
        target_dataframe_name="transactions",
        cutoff_time=cutoff_times_df,
        dask_kwargs=dask_kwargs,
    )

    assert all(f1.unique_name() == f2.unique_name()
               for f1, f2 in zip(features, features_2))
    for column in feature_matrix:
        for x, y in zip(feature_matrix[column], feature_matrix_2[column]):
            assert (pd.isnull(x) and pd.isnull(y)) or (x == y)


def test_accepts_relative_training_window(datetime_es):
    feature_matrix, features = dfs(entityset=datetime_es,
                                   target_entity="transactions")

    feature_matrix_2, features_2 = dfs(entityset=datetime_es,
                                       target_entity="transactions",
                                       cutoff_time=pd.Timestamp("2012-4-1 04:00"))

    feature_matrix_3, features_3 = dfs(entityset=datetime_es,
                                       target_entity="transactions",
                                       cutoff_time=pd.Timestamp("2012-4-1 04:00"),
                                       training_window=Timedelta("3 months"))

    feature_matrix_4, features_4 = dfs(entityset=datetime_es,
                                       target_entity="transactions",
                                       cutoff_time=pd.Timestamp("2012-4-1 04:00"),
                                       training_window="3 months")

    # Test case for leap years
    feature_matrix_5, features_5 = dfs(entityset=datetime_es,
                                       target_entity="transactions",
                                       cutoff_time=pd.Timestamp("2012-2-29 04:00"),
                                       training_window=Timedelta("1 year"))

    assert (feature_matrix.index == [1, 2, 3, 4, 5]).all()
    assert (feature_matrix_2.index == [1, 2, 3, 4]).all()
    assert (feature_matrix_3.index == [2, 3, 4]).all()
    assert (feature_matrix_4.index == [2, 3, 4]).all()
    assert (feature_matrix_5.index == [1, 2]).all()


def fit(self, cuttof_time_ids, y=None):
    """Wrapper for DFS

    Calculates a feature matrix and features given a dictionary of
    entities and a list of relationships.

    Args:
        cuttof_time_ids (list | DataFrame): Instances filtered to
            calculate features on.

    See Also:
        :func:`synthesis.dfs`
    """
    if isinstance(cuttof_time_ids, (list, np.ndarray, pd.Series)):
        self.feature_defs = dfs(entities=self.entities,
                                relationships=self.relationships,
                                entityset=self.entityset,
                                target_entity=self.target_entity,
                                instance_ids=cuttof_time_ids,
                                agg_primitives=self.agg_primitives,
                                trans_primitives=self.trans_primitives,
                                allowed_paths=self.allowed_paths,
                                max_depth=self.max_depth,
                                ignore_entities=self.ignore_entities,
                                ignore_variables=self.ignore_variables,
                                seed_features=self.seed_features,
                                drop_contains=self.drop_contains,
                                drop_exact=self.drop_exact,
                                where_primitives=self.where_primitives,
                                max_features=self.max_features,
                                features_only=True,
                                verbose=self.verbose)
    elif isinstance(cuttof_time_ids, pd.DataFrame):
        self.feature_defs = dfs(entities=self.entities,
                                relationships=self.relationships,
                                entityset=self.entityset,
                                target_entity=self.target_entity,
                                cutoff_time=cuttof_time_ids,
                                agg_primitives=self.agg_primitives,
                                trans_primitives=self.trans_primitives,
                                allowed_paths=self.allowed_paths,
                                max_depth=self.max_depth,
                                ignore_entities=self.ignore_entities,
                                ignore_variables=self.ignore_variables,
                                seed_features=self.seed_features,
                                drop_contains=self.drop_contains,
                                drop_exact=self.drop_exact,
                                where_primitives=self.where_primitives,
                                max_features=self.max_features,
                                features_only=True,
                                verbose=self.verbose)
    else:
        raise TypeError(
            'instance_ids must be a list, np.ndarray, pd.Series, or pd.DataFrame'
        )

    return self


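# Usage sketch for the fit() wrapper above. This is a minimal sketch assuming
# the method belongs to a DFSTransformer-style scikit-learn wrapper (as in
# featuretools.wrappers) whose constructor stores the DFS settings that fit()
# forwards; the wrapper import and demo entityset below are illustrative
# assumptions, not taken from this file.
def _example_dfs_transformer_fit():
    import featuretools as ft
    from featuretools.wrappers import DFSTransformer  # assumed wrapper class

    es = ft.demo.load_mock_customer(return_entityset=True)
    transformer = DFSTransformer(entityset=es, target_entity="customers")

    # A list (or np.ndarray / pd.Series) of instance ids takes the
    # instance_ids branch; a cutoff-time DataFrame takes the cutoff_time
    # branch of fit().
    transformer.fit([1, 2, 3])
    return transformer.feature_defs

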
def test_groupby_with_multioutput_primitive(pd_es):
    class MultiCumSum(TransformPrimitive):
        name = "multi_cum_sum"
        input_types = [ColumnSchema(semantic_tags={"numeric"})]
        return_type = ColumnSchema(semantic_tags={"numeric"})
        number_output_features = 3

        def get_function(self):
            def multi_cum_sum(x):
                return x.cumsum(), x.cummax(), x.cummin()

            return multi_cum_sum

    fm, _ = dfs(
        entityset=pd_es,
        target_dataframe_name="customers",
        trans_primitives=[],
        agg_primitives=[],
        groupby_trans_primitives=[MultiCumSum, CumSum, CumMax, CumMin],
    )

    # Calculate output in a separate DFS call to make sure the multi-output
    # code does not alter any values
    fm2, _ = dfs(
        entityset=pd_es,
        target_dataframe_name="customers",
        trans_primitives=[],
        agg_primitives=[],
        groupby_trans_primitives=[CumSum, CumMax, CumMin],
    )

    answer_cols = [
        ["CUM_SUM(age) by cohort", "CUM_SUM(age) by région_id"],
        ["CUM_MAX(age) by cohort", "CUM_MAX(age) by région_id"],
        ["CUM_MIN(age) by cohort", "CUM_MIN(age) by région_id"],
    ]

    for i in range(3):
        # Check that multi-output gives correct answers
        f = "MULTI_CUM_SUM(age)[%d] by cohort" % i
        assert f in fm.columns
        for x, y in zip(fm[f].values, fm[answer_cols[i][0]].values):
            assert x == y
        f = "MULTI_CUM_SUM(age)[%d] by région_id" % i
        assert f in fm.columns
        for x, y in zip(fm[f].values, fm[answer_cols[i][1]].values):
            assert x == y
        # Verify single output results are unchanged by inclusion of
        # multi-output primitive
        for x, y in zip(fm[answer_cols[i][0]], fm2[answer_cols[i][0]]):
            assert x == y
        for x, y in zip(fm[answer_cols[i][1]], fm2[answer_cols[i][1]]):
            assert x == y


def test_warns_cutoff_time_dask(dataframes, relationships):
    cutoff_times_df = pd.DataFrame({
        "instance_id": [1, 2, 3],
        "time": [10, 12, 15]
    })
    cutoff_times_df = dd.from_pandas(cutoff_times_df, npartitions=2)
    match = "cutoff_time should be a Pandas DataFrame: " \
            "computing cutoff_time, this may take a while"
    with pytest.warns(UserWarning, match=match):
        dfs(dataframes=dataframes,
            relationships=relationships,
            target_dataframe_name="transactions",
            cutoff_time=cutoff_times_df)


def test_accepts_pd_dateoffset_training_window(datetime_es):
    # TODO: Update to use Dask dataframes when issue #882 is closed
    feature_matrix, _ = dfs(entityset=datetime_es,
                            target_dataframe_name="transactions",
                            cutoff_time=pd.Timestamp("2012-3-31 04:00"),
                            training_window=pd.DateOffset(months=2))
    feature_matrix_2, _ = dfs(entityset=datetime_es,
                              target_dataframe_name="transactions",
                              cutoff_time=pd.Timestamp("2012-3-31 04:00"),
                              training_window=pd.offsets.BDay(44))

    assert (feature_matrix.index == [2, 3, 4]).all()
    assert (feature_matrix.index == feature_matrix_2.index).all()


def test_accepts_pd_dateoffset_training_window(datetime_es):
    feature_matrix, features = dfs(entityset=datetime_es,
                                   target_entity="transactions",
                                   cutoff_time=pd.Timestamp("2012-3-31 04:00"),
                                   training_window=pd.DateOffset(months=2))
    feature_matrix_2, features_2 = dfs(entityset=datetime_es,
                                       target_entity="transactions",
                                       cutoff_time=pd.Timestamp("2012-3-31 04:00"),
                                       training_window=pd.offsets.BDay(44))

    assert (feature_matrix.index == [2, 3, 4]).all()
    assert (feature_matrix.index == feature_matrix_2.index).all()


def test_does_not_warn_with_stacking_feature(pd_es):
    with pytest.warns(None) as record:
        dfs(entityset=pd_es,
            target_entity='régions',
            agg_primitives=['percent_true'],
            trans_primitives=[GreaterThanScalar(5)],
            primitive_options={
                'greater_than_scalar': {
                    'include_entities': ['stores']
                }
            },
            features_only=True)

    assert not record


def test_does_not_warn_with_stacking_feature(pd_es):
    with pytest.warns(None) as record:
        dfs(
            entityset=pd_es,
            target_dataframe_name="régions",
            agg_primitives=["percent_true"],
            trans_primitives=[GreaterThanScalar(5)],
            primitive_options={
                "greater_than_scalar": {"include_dataframes": ["stores"]}
            },
            features_only=True,
        )

    assert not record


def test_warns_with_unused_where_primitives(es):
    warning_text = "Some specified primitives were not used during DFS:\n" + \
        " where_primitives: ['count', 'sum']\n" + \
        "This may be caused by a using a value of max_depth that is too small, not setting interesting values, " + \
        "or it may indicate no compatible variable types for the primitive were found in the data."

    with pytest.warns(UnusedPrimitiveWarning) as record:
        dfs(entityset=es,
            target_entity='customers',
            agg_primitives=['count'],
            where_primitives=['sum', 'count'],
            max_depth=1)

    assert record[0].message.args[0] == warning_text


def test_warns_with_unused_custom_primitives(pd_es):
    def above_ten(column):
        return column > 10

    AboveTen = make_trans_primitive(function=above_ten,
                                    input_types=[Numeric],
                                    return_type=Numeric)

    trans_primitives = [AboveTen]
    warning_text = "Some specified primitives were not used during DFS:\n" + \
        " trans_primitives: ['above_ten']\n" + \
        "This may be caused by a using a value of max_depth that is too small, not setting interesting values, " + \
        "or it may indicate no compatible variable types for the primitive were found in the data."

    with pytest.warns(UnusedPrimitiveWarning) as record:
        dfs(entityset=pd_es,
            target_entity='sessions',
            trans_primitives=trans_primitives,
            max_depth=1)

    assert record[0].message.args[0] == warning_text

    # Should not raise a warning
    with pytest.warns(None) as record:
        dfs(entityset=pd_es,
            target_entity='customers',
            trans_primitives=trans_primitives,
            max_depth=1)

    assert not record

    def max_above_ten(column):
        return max(column) > 10

    MaxAboveTen = make_agg_primitive(function=max_above_ten,
                                     input_types=[Numeric],
                                     return_type=Numeric)

    agg_primitives = [MaxAboveTen]
    warning_text = "Some specified primitives were not used during DFS:\n" + \
        " agg_primitives: ['max_above_ten']\n" + \
        "This may be caused by a using a value of max_depth that is too small, not setting interesting values, " + \
        "or it may indicate no compatible variable types for the primitive were found in the data."

    with pytest.warns(UnusedPrimitiveWarning) as record:
        dfs(entityset=pd_es,
            target_entity='stores',
            agg_primitives=agg_primitives,
            max_depth=1)

    assert record[0].message.args[0] == warning_text

    # Should not raise a warning
    with pytest.warns(None) as record:
        dfs(entityset=pd_es,
            target_entity='sessions',
            agg_primitives=agg_primitives,
            max_depth=1)

    assert not record


def test_accepts_cutoff_time_compose(entities, relationships):
    def fraud_occured(df):
        return df['fraud'].any()

    lm = cp.LabelMaker(target_entity='card_id',
                       time_index='transaction_time',
                       labeling_function=fraud_occured,
                       window_size=1)

    transactions_df = entities['transactions'][0]
    if isinstance(transactions_df, dd.DataFrame):
        transactions_df = transactions_df.compute()

    labels = lm.search(transactions_df, num_examples_per_instance=-1)
    labels['time'] = pd.to_numeric(labels['time'])
    labels.rename({'card_id': 'id'}, axis=1, inplace=True)

    feature_matrix, features = dfs(entities=entities,
                                   relationships=relationships,
                                   target_entity="cards",
                                   cutoff_time=labels)
    if isinstance(feature_matrix, dd.DataFrame):
        feature_matrix = feature_matrix.compute().set_index('id')

    assert len(feature_matrix.index) == 6
    assert len(feature_matrix.columns) == len(features) + 1


def test_direct_of_multi_output_transform_feat(es):
    class TestTime(TransformPrimitive):
        name = "test_time"
        input_types = [Datetime]
        return_type = Numeric
        number_output_features = 6

        def get_function(self):
            def test_f(x):
                times = pd.Series(x)
                units = ["year", "month", "day", "hour", "minute", "second"]
                return [times.apply(lambda x: getattr(x, unit)) for unit in units]

            return test_f

    join_time_split = Feature(es["customers"]["signup_date"], primitive=TestTime)
    alt_features = [Feature(es["customers"]["signup_date"], primitive=Year),
                    Feature(es["customers"]["signup_date"], primitive=Month),
                    Feature(es["customers"]["signup_date"], primitive=Day),
                    Feature(es["customers"]["signup_date"], primitive=Hour),
                    Feature(es["customers"]["signup_date"], primitive=Minute),
                    Feature(es["customers"]["signup_date"], primitive=Second)]
    fm, fl = dfs(
        entityset=es,
        target_entity="sessions",
        trans_primitives=[TestTime, Year, Month, Day, Hour, Minute, Second])

    # Get column names for the multi-output feature and the normal features
    subnames = DirectFeature(join_time_split, es["sessions"]).get_feature_names()
    altnames = [DirectFeature(f, es["sessions"]).get_name() for f in alt_features]

    # Check values are equal between the multi-output and single-output columns
    for col1, col2 in zip(subnames, altnames):
        assert (fm[col1] == fm[col2]).all()


def test_accepts_single_cutoff_time(entities, relationships):
    feature_matrix, features = dfs(entities=entities,
                                   relationships=relationships,
                                   target_entity="transactions",
                                   cutoff_time=20)
    assert len(feature_matrix.index) == 5
    assert len(feature_matrix.columns) == len(features)


def test_accepts_pandas_training_window(datetime_es):
    feature_matrix, features = dfs(entityset=datetime_es,
                                   target_entity="transactions",
                                   cutoff_time=pd.Timestamp("2012-4-1 04:00"),
                                   training_window=pd.Timedelta(90, "D"))
    assert (feature_matrix.index == [2, 3, 4]).all()


def test_accepts_no_cutoff_time(entities, relationships):
    feature_matrix, features = dfs(entities=entities,
                                   relationships=relationships,
                                   target_entity="transactions",
                                   instance_ids=[1, 2, 3, 5, 6])
    assert len(feature_matrix.index) == 5
    assert len(feature_matrix.columns) == len(features)


def test_passing_strings_to_variable_types_dfs():
    variable_types = find_variable_types()
    teams = pd.DataFrame({
        'id': range(3),
        'name': ['Breakers', 'Spirit', 'Thorns']
    })
    games = pd.DataFrame({
        'id': range(5),
        'home_team_id': [2, 2, 1, 0, 1],
        'away_team_id': [1, 0, 2, 1, 0],
        'home_team_score': [3, 0, 1, 0, 4],
        'away_team_score': [2, 1, 2, 0, 0]
    })
    entities = {
        'teams': (teams, 'id', None, {'name': 'natural_language'}),
        'games': (games, 'id')
    }
    relationships = [('teams', 'id', 'games', 'home_team_id')]

    features = dfs(entities, relationships, target_entity="teams",
                   features_only=True)
    name_class = features[0].entity['name'].__class__
    assert name_class == variable_types['natural_language']


def test_accepts_cutoff_time_compose(dataframes, relationships):
    def fraud_occured(df):
        return df["fraud"].any()

    lm = cp.LabelMaker(
        target_dataframe_name="card_id",
        time_index="transaction_time",
        labeling_function=fraud_occured,
        window_size=1,
    )

    transactions_df = to_pandas(dataframes["transactions"][0])

    labels = lm.search(transactions_df, num_examples_per_instance=-1)
    labels["time"] = pd.to_numeric(labels["time"])
    labels.rename({"card_id": "id"}, axis=1, inplace=True)

    feature_matrix, features = dfs(
        dataframes=dataframes,
        relationships=relationships,
        target_dataframe_name="cards",
        cutoff_time=labels,
    )
    feature_matrix = to_pandas(feature_matrix, index="id")

    assert len(feature_matrix.index) == 6
    assert len(feature_matrix.columns) == len(features) + 1


def test_passing_strings_to_logical_types_dfs():
    teams = pd.DataFrame({
        "id": range(3),
        "name": ["Breakers", "Spirit", "Thorns"]
    })
    games = pd.DataFrame({
        "id": range(5),
        "home_team_id": [2, 2, 1, 0, 1],
        "away_team_id": [1, 0, 2, 1, 0],
        "home_team_score": [3, 0, 1, 0, 4],
        "away_team_score": [2, 1, 2, 0, 0],
    })
    dataframes = {
        "teams": (teams, "id", None, {"name": "natural_language"}),
        "games": (games, "id"),
    }
    relationships = [("teams", "id", "games", "home_team_id")]

    features = dfs(dataframes, relationships, target_dataframe_name="teams",
                   features_only=True)
    name_logical_type = features[0].dataframe["name"].ww.logical_type
    assert isinstance(name_logical_type, NaturalLanguage)


def test_accepts_single_cutoff_time(entities, relationships):
    feature_matrix, features = dfs(entities=entities,
                                   relationships=relationships,
                                   target_entity="transactions",
                                   cutoff_time=20)
    assert len(feature_matrix.index) == 6
    assert len(feature_matrix.columns) == len(features)


def test_direct_features_of_multi_output_agg_primitives(pd_es):
    class ThreeMostCommonCat(AggregationPrimitive):
        name = "n_most_common_categorical"
        input_types = [ColumnSchema(semantic_tags={"category"})]
        return_type = ColumnSchema(semantic_tags={"category"})
        number_output_features = 3

        def get_function(self, agg_type="pandas"):
            def pd_top3(x):
                counts = x.value_counts()
                counts = counts[counts > 0]
                array = np.array(counts.index[:3])
                if len(array) < 3:
                    filler = np.full(3 - len(array), np.nan)
                    array = np.append(array, filler)
                return array

            return pd_top3

    fm, fl = dfs(
        entityset=pd_es,
        target_dataframe_name="log",
        agg_primitives=[ThreeMostCommonCat],
        trans_primitives=[],
        max_depth=3,
    )

    has_nmost_as_base = []
    for feature in fl:
        is_base = False
        if len(feature.base_features) > 0 and isinstance(
            feature.base_features[0].primitive, ThreeMostCommonCat
        ):
            is_base = True
        has_nmost_as_base.append(is_base)
    assert any(has_nmost_as_base)

    true_result_rows = []
    session_data = {
        0: ["coke zero", "car", np.nan],
        1: ["toothpaste", "brown bag", np.nan],
        2: ["brown bag", np.nan, np.nan],
        3: set(["Haribo sugar-free gummy bears", "coke zero", np.nan]),
        4: ["coke zero", np.nan, np.nan],
        5: ["taco clock", np.nan, np.nan],
    }
    for i, count in enumerate([5, 4, 1, 2, 3, 2]):
        while count > 0:
            true_result_rows.append(session_data[i])
            count -= 1

    tempname = "sessions.N_MOST_COMMON_CATEGORICAL(log.product_id)[%s]"
    for i, row in enumerate(true_result_rows):
        for j in range(3):
            value = fm[tempname % (j)][i]
            if isinstance(row, set):
                assert pd.isnull(value) or value in row
            else:
                assert (pd.isnull(value) and pd.isnull(row[j])) or value == row[j]


def test_accepts_pd_timedelta_training_window(datetime_es):
    # TODO: Update to use Dask entities when issue #882 is closed
    feature_matrix, features = dfs(entityset=datetime_es,
                                   target_entity="transactions",
                                   cutoff_time=pd.Timestamp("2012-3-31 04:00"),
                                   training_window=pd.Timedelta(61, "D"))
    assert (feature_matrix.index == [2, 3, 4]).all()


def test_accepts_cutoff_time_df(entities, relationships):
    cutoff_times_df = pd.DataFrame({"instance_id": [1, 2, 3],
                                    "time": [10, 12, 15]})
    feature_matrix, features = dfs(entities=entities,
                                   relationships=relationships,
                                   target_entity="transactions",
                                   cutoff_time=cutoff_times_df)
    assert len(feature_matrix.index) == 3
    assert len(feature_matrix.columns) == len(features)


def test_dask_kwargs(entities, relationships):
    cutoff_times_df = pd.DataFrame({"instance_id": [1, 2, 3],
                                    "time": [10, 12, 15]})
    feature_matrix, features = dfs(entities=entities,
                                   relationships=relationships,
                                   target_entity="transactions",
                                   cutoff_time=cutoff_times_df)

    with cluster() as (scheduler, [a, b]):
        dask_kwargs = {'cluster': scheduler['address']}
        feature_matrix_2, features_2 = dfs(entities=entities,
                                           relationships=relationships,
                                           target_entity="transactions",
                                           cutoff_time=cutoff_times_df,
                                           dask_kwargs=dask_kwargs)

    assert features == features_2
    for column in feature_matrix:
        for x, y in zip(feature_matrix[column], feature_matrix_2[column]):
            assert ((pd.isnull(x) and pd.isnull(y)) or (x == y))


def test_ignores_instance_ids_if_cutoff_df(entities, relationships):
    cutoff_times_df = pd.DataFrame({"instance_id": [1, 2, 3],
                                    "time": [10, 12, 15]})
    instance_ids = [1, 2, 3, 4, 5]
    feature_matrix, features = dfs(entities=entities,
                                   relationships=relationships,
                                   target_entity="transactions",
                                   cutoff_time=cutoff_times_df,
                                   instance_ids=instance_ids)
    assert len(feature_matrix.index) == 3
    assert len(feature_matrix.columns) == len(features)


def test_all_variables(entities, relationships):
    cutoff_times_df = pd.DataFrame({"instance_id": [1, 2, 3],
                                    "time": [10, 12, 15]})
    instance_ids = [1, 2, 3, 4, 5]
    feature_matrix, features = dfs(entities=entities,
                                   relationships=relationships,
                                   target_entity="transactions",
                                   cutoff_time=cutoff_times_df,
                                   instance_ids=instance_ids,
                                   agg_primitives=[Max, Mean, Min, Sum],
                                   trans_primitives=[],
                                   max_depth=3,
                                   allowed_paths=None,
                                   ignore_entities=None,
                                   ignore_variables=None,
                                   seed_features=None)
    assert len(feature_matrix.index) == 3
    assert len(feature_matrix.columns) == len(features)


def test_approximate_features(entities, relationships):
    cutoff_times_df = pd.DataFrame({"instance_id": [1, 3, 1, 5, 3, 6],
                                    "time": [11, 16, 16, 26, 17, 22]})
    feature_matrix, features = dfs(entities=entities,
                                   relationships=relationships,
                                   target_entity="transactions",
                                   cutoff_time=cutoff_times_df,
                                   approximate=5,
                                   cutoff_time_in_index=True)
    direct_agg_feat_name = 'cards.PERCENT_TRUE(transactions.fraud)'
    assert len(feature_matrix.index) == 6
    assert len(feature_matrix.columns) == len(features)

    truth_index = pd.MultiIndex.from_arrays([[1, 3, 1, 5, 3, 6],
                                             [11, 16, 16, 26, 17, 22]],
                                            names=('id', 'time'))
    truth_values = pd.Series(data=[1.0, 0.5, 0.5, 1.0, 0.5, 1.0],
                             index=truth_index)
    truth_values.sort_index(level='time', kind='mergesort', inplace=True)

    assert (feature_matrix[direct_agg_feat_name] == truth_values).all()


def test_mock_customer():
    es = load_mock_customer(return_entityset=True)
    fm, fl = dfs(entityset=es, target_entity="customers", max_depth=3)
    for feature in fl:
        assert feature.get_name() in fm.columns


def test_features_only(entities, relationships):
    features = dfs(entities=entities,
                   relationships=relationships,
                   target_entity="transactions",
                   features_only=True)
    assert len(features) > 0