# Excerpted Featuretools test-suite snippets; they assume the suite's
# module-level imports. A representative (not exhaustive) header, given as an
# assumption since exact import paths vary across Featuretools versions:
import dask.dataframe as dd
import numpy as np
import pandas as pd
import pytest
from unittest.mock import patch

from featuretools import dfs
from featuretools.synthesis.deep_feature_synthesis import DeepFeatureSynthesis


def test_dfs_empty_features():
    error_text = "No features can be generated from the specified primitives. Please make sure the primitives you are using are compatible with the variable types in your data."
    teams = pd.DataFrame({"id": range(3), "name": ["Breakers", "Spirit", "Thorns"]})
    games = pd.DataFrame(
        {
            "id": range(5),
            "home_team_id": [2, 2, 1, 0, 1],
            "away_team_id": [1, 0, 2, 1, 0],
            "home_team_score": [3, 0, 1, 0, 4],
            "away_team_score": [2, 1, 2, 0, 0],
        }
    )
    dataframes = {
        "teams": (teams, "id", None, {"name": "natural_language"}),
        "games": (games, "id"),
    }
    relationships = [("teams", "id", "games", "home_team_id")]
    with patch.object(DeepFeatureSynthesis, "build_features", return_value=[]):
        features = dfs(
            dataframes, relationships, target_dataframe_name="teams", features_only=True
        )
        assert features == []
    with pytest.raises(AssertionError, match=error_text), patch.object(
        DeepFeatureSynthesis, "build_features", return_value=[]
    ):
        dfs(
            dataframes,
            relationships,
            target_dataframe_name="teams",
            features_only=False,
        )
def test_no_warns_with_camel_and_title_case(es):
    for trans_primitive in ["isNull", "IsNull"]:
        # Should not raise an UnusedPrimitiveWarning
        with pytest.warns(None) as record:
            dfs(
                entityset=es,
                target_dataframe_name="customers",
                trans_primitives=[trans_primitive],
                max_depth=1,
                features_only=True,
            )

        assert not record

    for agg_primitive in ["numUnique", "NumUnique"]:
        # Should not raise an UnusedPrimitiveWarning
        with pytest.warns(None) as record:
            dfs(
                entityset=es,
                target_dataframe_name="customers",
                agg_primitives=[agg_primitive],
                max_depth=2,
                features_only=True,
            )

        assert not record
def test_warns_with_unused_primitives(es):
    if es.dataframe_type == Library.KOALAS.value:
        pytest.skip('Koalas throws extra warnings')
    trans_primitives = ['num_characters', 'num_words', 'add_numeric']
    agg_primitives = [Max, 'min']

    warning_text = "Some specified primitives were not used during DFS:\n" + \
        "  trans_primitives: ['add_numeric']\n  agg_primitives: ['max', 'min']\n" + \
        "This may be caused by a using a value of max_depth that is too small, not setting interesting values, " + \
        "or it may indicate no compatible columns for the primitive were found in the data."

    with pytest.warns(UnusedPrimitiveWarning) as record:
        dfs(entityset=es,
            target_dataframe_name='customers',
            trans_primitives=trans_primitives,
            agg_primitives=agg_primitives,
            max_depth=1,
            features_only=True)

    assert record[0].message.args[0] == warning_text

    # Should not raise a warning
    with pytest.warns(None) as record:
        dfs(entityset=es,
            target_dataframe_name='customers',
            trans_primitives=trans_primitives,
            agg_primitives=agg_primitives,
            max_depth=2,
            features_only=True)

    assert not record
def test_calls_progress_callback(entities, relationships):
    class MockProgressCallback:
        def __init__(self):
            self.total_update = 0
            self.total_progress_percent = 0

        def __call__(self, update, progress_percent, time_elapsed):
            self.total_update += update
            self.total_progress_percent = progress_percent

    mock_progress_callback = MockProgressCallback()

    feature_matrix, features = dfs(entities=entities,
                                   relationships=relationships,
                                   target_entity="transactions",
                                   progress_callback=mock_progress_callback)

    assert np.isclose(mock_progress_callback.total_update, 100.0)
    assert np.isclose(mock_progress_callback.total_progress_percent, 100.0)

    # test with multiple jobs
    mock_progress_callback = MockProgressCallback()

    with cluster() as (scheduler, [a, b]):
        dkwargs = {'cluster': scheduler['address']}
        feature_matrix, features = dfs(entities=entities,
                                       relationships=relationships,
                                       target_entity="transactions",
                                       progress_callback=mock_progress_callback,
                                       dask_kwargs=dkwargs)

    assert np.isclose(mock_progress_callback.total_update, 100.0)
    assert np.isclose(mock_progress_callback.total_progress_percent, 100.0)
def test_calls_progress_callback_cluster(pd_dataframes, relationships, dask_cluster):
    class MockProgressCallback:
        def __init__(self):
            self.progress_history = []
            self.total_update = 0
            self.total_progress_percent = 0

        def __call__(self, update, progress_percent, time_elapsed):
            self.total_update += update
            self.total_progress_percent = progress_percent
            self.progress_history.append(progress_percent)

    mock_progress_callback = MockProgressCallback()

    dkwargs = {"cluster": dask_cluster.scheduler.address}
    dfs(
        dataframes=pd_dataframes,
        relationships=relationships,
        target_dataframe_name="transactions",
        progress_callback=mock_progress_callback,
        dask_kwargs=dkwargs,
    )

    assert np.isclose(mock_progress_callback.total_update, 100.0)
    assert np.isclose(mock_progress_callback.total_progress_percent, 100.0)
def test_calls_progress_callback(dataframes, relationships):
    class MockProgressCallback:
        def __init__(self):
            self.progress_history = []
            self.total_update = 0
            self.total_progress_percent = 0

        def __call__(self, update, progress_percent, time_elapsed):
            self.total_update += update
            self.total_progress_percent = progress_percent
            self.progress_history.append(progress_percent)

    mock_progress_callback = MockProgressCallback()

    dfs(
        dataframes=dataframes,
        relationships=relationships,
        target_dataframe_name="transactions",
        progress_callback=mock_progress_callback,
    )

    # second to last entry is the last update from feature calculation
    assert np.isclose(
        mock_progress_callback.progress_history[-2],
        FEATURE_CALCULATION_PERCENTAGE * 100,
    )
    assert np.isclose(mock_progress_callback.total_update, 100.0)
    assert np.isclose(mock_progress_callback.total_progress_percent, 100.0)
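# The progress_callback protocol exercised above is just a callable accepting
# (update, progress_percent, time_elapsed), so a plain function works as well
# as a class. A minimal sketch (hypothetical helper; `dataframes` and
# `relationships` are the same fixtures these tests use):
def run_dfs_with_printed_progress(dataframes, relationships):
    def print_progress(update, progress_percent, time_elapsed):
        # `update` is the increment for this step, `progress_percent` the
        # running total, `time_elapsed` seconds since the dfs call started
        print("+%.1f -> %.1f%% after %.1fs" % (update, progress_percent, time_elapsed))

    return dfs(
        dataframes=dataframes,
        relationships=relationships,
        target_dataframe_name="transactions",
        progress_callback=print_progress,
    )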
def test_warns_with_unused_groupby_primitives(pd_es):
    warning_text = (
        "Some specified primitives were not used during DFS:\n"
        + "  groupby_trans_primitives: ['cum_sum']\n"
        + "This may be caused by a using a value of max_depth that is too small, not setting interesting values, "
        + "or it may indicate no compatible columns for the primitive were found in the data. If the DFS call "
        + "contained multiple instances of a primitive in the list above, none of them were used."
    )

    with pytest.warns(UnusedPrimitiveWarning) as record:
        dfs(
            entityset=pd_es,
            target_dataframe_name="sessions",
            groupby_trans_primitives=["cum_sum"],
            max_depth=1,
            features_only=True,
        )

    assert record[0].message.args[0] == warning_text

    # Should not raise a warning
    with pytest.warns(None) as record:
        dfs(
            entityset=pd_es,
            target_dataframe_name="customers",
            groupby_trans_primitives=["cum_sum"],
            max_depth=1,
            features_only=True,
        )

    assert not record
def test_warns_with_unused_primitives(es):
    if ks and any(isinstance(e.df, ks.DataFrame) for e in es.entities):
        pytest.skip('Koalas throws extra warnings')
    trans_primitives = ['num_characters', 'num_words', 'add_numeric']
    agg_primitives = [Max, 'min']

    warning_text = "Some specified primitives were not used during DFS:\n" + \
        "  trans_primitives: ['add_numeric']\n  agg_primitives: ['max', 'min']\n" + \
        "This may be caused by a using a value of max_depth that is too small, not setting interesting values, " + \
        "or it may indicate no compatible variable types for the primitive were found in the data."

    with pytest.warns(UnusedPrimitiveWarning) as record:
        dfs(entityset=es,
            target_entity='customers',
            trans_primitives=trans_primitives,
            agg_primitives=agg_primitives,
            max_depth=1)

    assert record[0].message.args[0] == warning_text

    # Should not raise a warning
    with pytest.warns(None) as record:
        dfs(entityset=es,
            target_entity='customers',
            trans_primitives=trans_primitives,
            agg_primitives=agg_primitives,
            max_depth=2)

    assert not record
def test_dask_kwargs(pd_dataframes, relationships, dask_cluster):
    cutoff_times_df = pd.DataFrame({
        "instance_id": [1, 2, 3],
        "time": [10, 12, 15]
    })
    feature_matrix, features = dfs(
        dataframes=pd_dataframes,
        relationships=relationships,
        target_dataframe_name="transactions",
        cutoff_time=cutoff_times_df,
    )

    dask_kwargs = {"cluster": dask_cluster.scheduler.address}
    feature_matrix_2, features_2 = dfs(
        dataframes=pd_dataframes,
        relationships=relationships,
        target_dataframe_name="transactions",
        cutoff_time=cutoff_times_df,
        dask_kwargs=dask_kwargs,
    )

    assert all(f1.unique_name() == f2.unique_name()
               for f1, f2 in zip(features, features_2))
    for column in feature_matrix:
        for x, y in zip(feature_matrix[column], feature_matrix_2[column]):
            assert (pd.isnull(x) and pd.isnull(y)) or (x == y)
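# Note: the element-wise loop above treats a pair of NaNs as equal. With
# matching dtypes and column order, pandas offers the same NaN-aware
# comparison in one call:
#
#     pd.testing.assert_frame_equal(feature_matrix, feature_matrix_2)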
def test_accepts_relative_training_window(datetime_es):
    feature_matrix, features = dfs(entityset=datetime_es,
                                   target_entity="transactions")

    feature_matrix_2, features_2 = dfs(entityset=datetime_es,
                                       target_entity="transactions",
                                       cutoff_time=pd.Timestamp("2012-4-1 04:00"))

    feature_matrix_3, features_3 = dfs(entityset=datetime_es,
                                       target_entity="transactions",
                                       cutoff_time=pd.Timestamp("2012-4-1 04:00"),
                                       training_window=Timedelta("3 months"))

    feature_matrix_4, features_4 = dfs(entityset=datetime_es,
                                       target_entity="transactions",
                                       cutoff_time=pd.Timestamp("2012-4-1 04:00"),
                                       training_window="3 months")

    # Test case for leap years
    feature_matrix_5, features_5 = dfs(entityset=datetime_es,
                                       target_entity="transactions",
                                       cutoff_time=pd.Timestamp("2012-2-29 04:00"),
                                       training_window=Timedelta("1 year"))

    assert (feature_matrix.index == [1, 2, 3, 4, 5]).all()
    assert (feature_matrix_2.index == [1, 2, 3, 4]).all()
    assert (feature_matrix_3.index == [2, 3, 4]).all()
    assert (feature_matrix_4.index == [2, 3, 4]).all()
    assert (feature_matrix_5.index == [1, 2]).all()
    def fit(self, cutoff_time_ids, y=None):
        """Wrapper for DFS

            Computes feature definitions given a dictionary of
            entities and a list of relationships.

            Args:
                cutoff_time_ids (list | DataFrame): Instances to
                    calculate features on.

            See Also:
                :func:`synthesis.dfs`
        """
        if isinstance(cutoff_time_ids, (list, np.ndarray, pd.Series)):
            self.feature_defs = dfs(entities=self.entities,
                                    relationships=self.relationships,
                                    entityset=self.entityset,
                                    target_entity=self.target_entity,
                                    instance_ids=cutoff_time_ids,
                                    agg_primitives=self.agg_primitives,
                                    trans_primitives=self.trans_primitives,
                                    allowed_paths=self.allowed_paths,
                                    max_depth=self.max_depth,
                                    ignore_entities=self.ignore_entities,
                                    ignore_variables=self.ignore_variables,
                                    seed_features=self.seed_features,
                                    drop_contains=self.drop_contains,
                                    drop_exact=self.drop_exact,
                                    where_primitives=self.where_primitives,
                                    max_features=self.max_features,
                                    features_only=True,
                                    verbose=self.verbose)

        elif isinstance(cutoff_time_ids, pd.DataFrame):
            self.feature_defs = dfs(entities=self.entities,
                                    relationships=self.relationships,
                                    entityset=self.entityset,
                                    target_entity=self.target_entity,
                                    cutoff_time=cutoff_time_ids,
                                    agg_primitives=self.agg_primitives,
                                    trans_primitives=self.trans_primitives,
                                    allowed_paths=self.allowed_paths,
                                    max_depth=self.max_depth,
                                    ignore_entities=self.ignore_entities,
                                    ignore_variables=self.ignore_variables,
                                    seed_features=self.seed_features,
                                    drop_contains=self.drop_contains,
                                    drop_exact=self.drop_exact,
                                    where_primitives=self.where_primitives,
                                    max_features=self.max_features,
                                    features_only=True,
                                    verbose=self.verbose)
        else:
            raise TypeError(
                'cutoff_time_ids must be a list, np.ndarray, pd.Series, or pd.DataFrame'
            )

        return self
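# A minimal usage sketch for the `fit` wrapper above (hypothetical
# DFSTransformer-style class; the constructor arguments mirror the attributes
# that `fit` reads from `self`):
#
#     est = DFSTransformer(entityset=es,
#                          target_entity="transactions",
#                          agg_primitives=["sum", "max"],
#                          max_depth=2)
#     est.fit([1, 2, 3])         # list-like input -> dfs(instance_ids=...)
#     est.fit(cutoff_times_df)   # DataFrame input  -> dfs(cutoff_time=...)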
def test_groupby_with_multioutput_primitive(pd_es):
    class MultiCumSum(TransformPrimitive):
        name = "multi_cum_sum"
        input_types = [ColumnSchema(semantic_tags={"numeric"})]
        return_type = ColumnSchema(semantic_tags={"numeric"})
        number_output_features = 3

        def get_function(self):
            def multi_cum_sum(x):
                return x.cumsum(), x.cummax(), x.cummin()

            return multi_cum_sum

    fm, _ = dfs(
        entityset=pd_es,
        target_dataframe_name="customers",
        trans_primitives=[],
        agg_primitives=[],
        groupby_trans_primitives=[MultiCumSum, CumSum, CumMax, CumMin],
    )

    # Calculate output in a separate DFS call to make sure the multi-output code
    # does not alter any values
    fm2, _ = dfs(
        entityset=pd_es,
        target_dataframe_name="customers",
        trans_primitives=[],
        agg_primitives=[],
        groupby_trans_primitives=[CumSum, CumMax, CumMin],
    )

    answer_cols = [
        ["CUM_SUM(age) by cohort", "CUM_SUM(age) by région_id"],
        ["CUM_MAX(age) by cohort", "CUM_MAX(age) by région_id"],
        ["CUM_MIN(age) by cohort", "CUM_MIN(age) by région_id"],
    ]

    for i in range(3):
        # Check that multi-output gives correct answers
        f = "MULTI_CUM_SUM(age)[%d] by cohort" % i
        assert f in fm.columns
        for x, y in zip(fm[f].values, fm[answer_cols[i][0]].values):
            assert x == y
        f = "MULTI_CUM_SUM(age)[%d] by région_id" % i
        assert f in fm.columns
        for x, y in zip(fm[f].values, fm[answer_cols[i][1]].values):
            assert x == y
        # Verify single output results are unchanged by inclusion of
        # multi-output primitive
        for x, y in zip(fm[answer_cols[i][0]], fm2[answer_cols[i][0]]):
            assert x == y
        for x, y in zip(fm[answer_cols[i][1]], fm2[answer_cols[i][1]]):
            assert x == y
def test_warns_cutoff_time_dask(dataframes, relationships):
    cutoff_times_df = pd.DataFrame({
        "instance_id": [1, 2, 3],
        "time": [10, 12, 15]
    })
    cutoff_times_df = dd.from_pandas(cutoff_times_df, npartitions=2)
    match = "cutoff_time should be a Pandas DataFrame: " \
            "computing cutoff_time, this may take a while"
    with pytest.warns(UserWarning, match=match):
        dfs(dataframes=dataframes,
            relationships=relationships,
            target_dataframe_name="transactions",
            cutoff_time=cutoff_times_df)
def test_accepts_pd_dateoffset_training_window(datetime_es):
    # TODO: Update to use Dask dataframes when issue #882 is closed
    feature_matrix, _ = dfs(entityset=datetime_es,
                            target_dataframe_name="transactions",
                            cutoff_time=pd.Timestamp("2012-3-31 04:00"),
                            training_window=pd.DateOffset(months=2))

    feature_matrix_2, _ = dfs(entityset=datetime_es,
                              target_dataframe_name="transactions",
                              cutoff_time=pd.Timestamp("2012-3-31 04:00"),
                              training_window=pd.offsets.BDay(44))

    assert (feature_matrix.index == [2, 3, 4]).all()
    assert (feature_matrix.index == feature_matrix_2.index).all()
def test_accepts_pd_dateoffset_training_window(datetime_es):
    feature_matrix, features = dfs(entityset=datetime_es,
                                   target_entity="transactions",
                                   cutoff_time=pd.Timestamp("2012-3-31 04:00"),
                                   training_window=pd.DateOffset(months=2))

    feature_matrix_2, features_2 = dfs(
        entityset=datetime_es,
        target_entity="transactions",
        cutoff_time=pd.Timestamp("2012-3-31 04:00"),
        training_window=pd.offsets.BDay(44))

    assert (feature_matrix.index == [2, 3, 4]).all()
    assert (feature_matrix.index == feature_matrix_2.index).all()
def test_does_not_warn_with_stacking_feature(pd_es):
    with pytest.warns(None) as record:
        dfs(entityset=pd_es,
            target_entity='régions',
            agg_primitives=['percent_true'],
            trans_primitives=[GreaterThanScalar(5)],
            primitive_options={
                'greater_than_scalar': {
                    'include_entities': ['stores']
                }
            },
            features_only=True)

    assert not record
def test_does_not_warn_with_stacking_feature(pd_es):
    with pytest.warns(None) as record:
        dfs(
            entityset=pd_es,
            target_dataframe_name="régions",
            agg_primitives=["percent_true"],
            trans_primitives=[GreaterThanScalar(5)],
            primitive_options={
                "greater_than_scalar": {"include_dataframes": ["stores"]}
            },
            features_only=True,
        )

    assert not record
def test_warns_with_unused_where_primitives(es):
    warning_text = "Some specified primitives were not used during DFS:\n" + \
        "  where_primitives: ['count', 'sum']\n" + \
        "This may be caused by a using a value of max_depth that is too small, not setting interesting values, " + \
        "or it may indicate no compatible variable types for the primitive were found in the data."

    with pytest.warns(UnusedPrimitiveWarning) as record:
        dfs(entityset=es,
            target_entity='customers',
            agg_primitives=['count'],
            where_primitives=['sum', 'count'],
            max_depth=1)

    assert record[0].message.args[0] == warning_text
def test_warns_with_unused_custom_primitives(pd_es):
    def above_ten(column):
        return column > 10

    AboveTen = make_trans_primitive(function=above_ten,
                                    input_types=[Numeric],
                                    return_type=Numeric)

    trans_primitives = [AboveTen]

    warning_text = "Some specified primitives were not used during DFS:\n" + \
        "  trans_primitives: ['above_ten']\n" + \
        "This may be caused by a using a value of max_depth that is too small, not setting interesting values, " + \
        "or it may indicate no compatible variable types for the primitive were found in the data."

    with pytest.warns(UnusedPrimitiveWarning) as record:
        dfs(entityset=pd_es,
            target_entity='sessions',
            trans_primitives=trans_primitives,
            max_depth=1)

    assert record[0].message.args[0] == warning_text

    # Should not raise a warning
    with pytest.warns(None) as record:
        dfs(entityset=pd_es,
            target_entity='customers',
            trans_primitives=trans_primitives,
            max_depth=1)

    def max_above_ten(column):
        return max(column) > 10

    MaxAboveTen = make_agg_primitive(function=max_above_ten,
                                     input_types=[Numeric],
                                     return_type=Numeric)

    agg_primitives = [MaxAboveTen]

    warning_text = "Some specified primitives were not used during DFS:\n" + \
        "  agg_primitives: ['max_above_ten']\n" + \
        "This may be caused by a using a value of max_depth that is too small, not setting interesting values, " + \
        "or it may indicate no compatible variable types for the primitive were found in the data."

    with pytest.warns(UnusedPrimitiveWarning) as record:
        dfs(entityset=pd_es,
            target_entity='stores',
            agg_primitives=agg_primitives,
            max_depth=1)

    assert record[0].message.args[0] == warning_text

    # Should not raise a warning
    with pytest.warns(None) as record:
        dfs(entityset=pd_es,
            target_entity='sessions',
            agg_primitives=agg_primitives,
            max_depth=1)
def test_accepts_cutoff_time_compose(entities, relationships):
    def fraud_occurred(df):
        return df['fraud'].any()

    lm = cp.LabelMaker(target_entity='card_id',
                       time_index='transaction_time',
                       labeling_function=fraud_occurred,
                       window_size=1)

    transactions_df = entities['transactions'][0]
    if isinstance(transactions_df, dd.DataFrame):
        transactions_df = transactions_df.compute()

    labels = lm.search(transactions_df, num_examples_per_instance=-1)

    labels['time'] = pd.to_numeric(labels['time'])
    labels.rename({'card_id': 'id'}, axis=1, inplace=True)

    feature_matrix, features = dfs(entities=entities,
                                   relationships=relationships,
                                   target_entity="cards",
                                   cutoff_time=labels)
    if isinstance(feature_matrix, dd.DataFrame):
        feature_matrix = feature_matrix.compute().set_index('id')
    assert len(feature_matrix.index) == 6
    assert len(feature_matrix.columns) == len(features) + 1
def test_direct_of_multi_output_transform_feat(es):
    class TestTime(TransformPrimitive):
        name = "test_time"
        input_types = [Datetime]
        return_type = Numeric
        number_output_features = 6

        def get_function(self):
            def test_f(x):
                times = pd.Series(x)
                units = ["year", "month", "day", "hour", "minute", "second"]
                return [times.apply(lambda x: getattr(x, unit)) for unit in units]
            return test_f

    join_time_split = Feature(es["customers"]["signup_date"],
                              primitive=TestTime)
    alt_features = [Feature(es["customers"]["signup_date"], primitive=Year),
                    Feature(es["customers"]["signup_date"], primitive=Month),
                    Feature(es["customers"]["signup_date"], primitive=Day),
                    Feature(es["customers"]["signup_date"], primitive=Hour),
                    Feature(es["customers"]["signup_date"], primitive=Minute),
                    Feature(es["customers"]["signup_date"], primitive=Second)]
    fm, fl = dfs(
        entityset=es,
        target_entity="sessions",
        trans_primitives=[TestTime, Year, Month, Day, Hour, Minute, Second])

    # Get column names for the multi-output feature and the normal features
    subnames = DirectFeature(join_time_split, es["sessions"]).get_feature_names()
    altnames = [DirectFeature(f, es["sessions"]).get_name() for f in alt_features]

    # Check that the corresponding column values are equal
    for col1, col2 in zip(subnames, altnames):
        assert (fm[col1] == fm[col2]).all()
def test_accepts_single_cutoff_time(entities, relationships):
    feature_matrix, features = dfs(entities=entities,
                                   relationships=relationships,
                                   target_entity="transactions",
                                   cutoff_time=20)
    assert len(feature_matrix.index) == 5
    assert len(feature_matrix.columns) == len(features)
def test_accepts_pandas_training_window(datetime_es):
    feature_matrix, features = dfs(entityset=datetime_es,
                                   target_entity="transactions",
                                   cutoff_time=pd.Timestamp("2012-4-1 04:00"),
                                   training_window=pd.Timedelta(90, "D"))

    assert (feature_matrix.index == [2, 3, 4]).all()
def test_accepts_no_cutoff_time(entities, relationships):
    feature_matrix, features = dfs(entities=entities,
                                   relationships=relationships,
                                   target_entity="transactions",
                                   instance_ids=[1, 2, 3, 5, 6])
    assert len(feature_matrix.index) == 5
    assert len(feature_matrix.columns) == len(features)
def test_passing_strings_to_variable_types_dfs():
    variable_types = find_variable_types()
    teams = pd.DataFrame({
        'id': range(3),
        'name': ['Breakers', 'Spirit', 'Thorns']
    })
    games = pd.DataFrame({
        'id': range(5),
        'home_team_id': [2, 2, 1, 0, 1],
        'away_team_id': [1, 0, 2, 1, 0],
        'home_team_score': [3, 0, 1, 0, 4],
        'away_team_score': [2, 1, 2, 0, 0]
    })
    entities = {
        'teams': (teams, 'id', None, {
            'name': 'natural_language'
        }),
        'games': (games, 'id')
    }
    relationships = [('teams', 'id', 'games', 'home_team_id')]

    features = dfs(entities,
                   relationships,
                   target_entity="teams",
                   features_only=True)
    name_class = features[0].entity['name'].__class__
    assert name_class == variable_types['natural_language']
def test_accepts_cutoff_time_compose(dataframes, relationships):
    def fraud_occurred(df):
        return df["fraud"].any()

    lm = cp.LabelMaker(
        target_dataframe_name="card_id",
        time_index="transaction_time",
        labeling_function=fraud_occurred,
        window_size=1,
    )

    transactions_df = to_pandas(dataframes["transactions"][0])

    labels = lm.search(transactions_df, num_examples_per_instance=-1)

    labels["time"] = pd.to_numeric(labels["time"])
    labels.rename({"card_id": "id"}, axis=1, inplace=True)

    feature_matrix, features = dfs(
        dataframes=dataframes,
        relationships=relationships,
        target_dataframe_name="cards",
        cutoff_time=labels,
    )
    feature_matrix = to_pandas(feature_matrix, index="id")
    assert len(feature_matrix.index) == 6
    assert len(feature_matrix.columns) == len(features) + 1
def test_passing_strings_to_logical_types_dfs():
    teams = pd.DataFrame({
        "id": range(3),
        "name": ["Breakers", "Spirit", "Thorns"]
    })
    games = pd.DataFrame({
        "id": range(5),
        "home_team_id": [2, 2, 1, 0, 1],
        "away_team_id": [1, 0, 2, 1, 0],
        "home_team_score": [3, 0, 1, 0, 4],
        "away_team_score": [2, 1, 2, 0, 0],
    })
    dataframes = {
        "teams": (teams, "id", None, {
            "name": "natural_language"
        }),
        "games": (games, "id"),
    }
    relationships = [("teams", "id", "games", "home_team_id")]

    features = dfs(dataframes,
                   relationships,
                   target_dataframe_name="teams",
                   features_only=True)

    name_logical_type = features[0].dataframe["name"].ww.logical_type
    assert isinstance(name_logical_type, NaturalLanguage)
def test_accepts_single_cutoff_time(entities, relationships):
    feature_matrix, features = dfs(entities=entities,
                                   relationships=relationships,
                                   target_entity="transactions",
                                   cutoff_time=20)
    assert len(feature_matrix.index) == 6
    assert len(feature_matrix.columns) == len(features)
def test_direct_features_of_multi_output_agg_primitives(pd_es):
    class ThreeMostCommonCat(AggregationPrimitive):
        name = "n_most_common_categorical"
        input_types = [ColumnSchema(semantic_tags={"category"})]
        return_type = ColumnSchema(semantic_tags={"category"})
        number_output_features = 3

        def get_function(self, agg_type="pandas"):
            def pd_top3(x):
                counts = x.value_counts()
                counts = counts[counts > 0]
                array = np.array(counts.index[:3])
                if len(array) < 3:
                    filler = np.full(3 - len(array), np.nan)
                    array = np.append(array, filler)
                return array

            return pd_top3

    fm, fl = dfs(
        entityset=pd_es,
        target_dataframe_name="log",
        agg_primitives=[ThreeMostCommonCat],
        trans_primitives=[],
        max_depth=3,
    )

    has_nmost_as_base = []
    for feature in fl:
        is_base = False
        if len(feature.base_features) > 0 and isinstance(
                feature.base_features[0].primitive, ThreeMostCommonCat):
            is_base = True
        has_nmost_as_base.append(is_base)
    assert any(has_nmost_as_base)

    true_result_rows = []
    session_data = {
        0: ["coke zero", "car", np.nan],
        1: ["toothpaste", "brown bag", np.nan],
        2: ["brown bag", np.nan, np.nan],
        3: set(["Haribo sugar-free gummy bears", "coke zero", np.nan]),
        4: ["coke zero", np.nan, np.nan],
        5: ["taco clock", np.nan, np.nan],
    }
    for i, count in enumerate([5, 4, 1, 2, 3, 2]):
        while count > 0:
            true_result_rows.append(session_data[i])
            count -= 1

    tempname = "sessions.N_MOST_COMMON_CATEGORICAL(log.product_id)[%s]"
    for i, row in enumerate(true_result_rows):
        for j in range(3):
            value = fm[tempname % (j)][i]
            if isinstance(row, set):
                assert pd.isnull(value) or value in row
            else:
                assert (pd.isnull(value)
                        and pd.isnull(row[j])) or value == row[j]
def test_accepts_pd_timedelta_training_window(datetime_es):
    # TODO: Update to use Dask entities when issue #882 is closed
    feature_matrix, features = dfs(entityset=datetime_es,
                                   target_entity="transactions",
                                   cutoff_time=pd.Timestamp("2012-3-31 04:00"),
                                   training_window=pd.Timedelta(61, "D"))

    assert (feature_matrix.index == [2, 3, 4]).all()
def test_accepts_cutoff_time_df(entities, relationships):
    cutoff_times_df = pd.DataFrame({"instance_id": [1, 2, 3],
                                    "time": [10, 12, 15]})
    feature_matrix, features = dfs(entities=entities,
                                   relationships=relationships,
                                   target_entity="transactions",
                                   cutoff_time=cutoff_times_df)
    assert len(feature_matrix.index) == 3
    assert len(feature_matrix.columns) == len(features)
def test_dask_kwargs(entities, relationships):
    cutoff_times_df = pd.DataFrame({"instance_id": [1, 2, 3],
                                    "time": [10, 12, 15]})
    feature_matrix, features = dfs(entities=entities,
                                   relationships=relationships,
                                   target_entity="transactions",
                                   cutoff_time=cutoff_times_df)

    with cluster() as (scheduler, [a, b]):
        dask_kwargs = {'cluster': scheduler['address']}
        feature_matrix_2, features_2 = dfs(entities=entities,
                                           relationships=relationships,
                                           target_entity="transactions",
                                           cutoff_time=cutoff_times_df,
                                           dask_kwargs=dask_kwargs)
    assert features == features_2
    for column in feature_matrix:
        for x, y in zip(feature_matrix[column], feature_matrix_2[column]):
            assert ((pd.isnull(x) and pd.isnull(y)) or (x == y))
def test_ignores_instance_ids_if_cutoff_df(entities, relationships):
    cutoff_times_df = pd.DataFrame({"instance_id": [1, 2, 3],
                                    "time": [10, 12, 15]})
    instance_ids = [1, 2, 3, 4, 5]
    feature_matrix, features = dfs(entities=entities,
                                   relationships=relationships,
                                   target_entity="transactions",
                                   cutoff_time=cutoff_times_df,
                                   instance_ids=instance_ids)
    assert len(feature_matrix.index) == 3
    assert len(feature_matrix.columns) == len(features)
def test_all_variables(entities, relationships):
    cutoff_times_df = pd.DataFrame({"instance_id": [1, 2, 3],
                                    "time": [10, 12, 15]})
    instance_ids = [1, 2, 3, 4, 5]
    feature_matrix, features = dfs(entities=entities,
                                   relationships=relationships,
                                   target_entity="transactions",
                                   cutoff_time=cutoff_times_df,
                                   instance_ids=instance_ids,
                                   agg_primitives=[Max, Mean, Min, Sum],
                                   trans_primitives=[],
                                   max_depth=3,
                                   allowed_paths=None,
                                   ignore_entities=None,
                                   ignore_variables=None,
                                   seed_features=None)
    assert len(feature_matrix.index) == 3
    assert len(feature_matrix.columns) == len(features)
def test_approximate_features(entities, relationships):
    cutoff_times_df = pd.DataFrame({"instance_id": [1, 3, 1, 5, 3, 6],
                                    "time": [11, 16, 16, 26, 17, 22]})
    feature_matrix, features = dfs(entities=entities,
                                   relationships=relationships,
                                   target_entity="transactions",
                                   cutoff_time=cutoff_times_df,
                                   approximate=5,
                                   cutoff_time_in_index=True)
    direct_agg_feat_name = 'cards.PERCENT_TRUE(transactions.fraud)'
    assert len(feature_matrix.index) == 6
    assert len(feature_matrix.columns) == len(features)
    truth_index = pd.MultiIndex.from_arrays([[1, 3, 1, 5, 3, 6],
                                             [11, 16, 16, 26, 17, 22]],
                                            names=('id', 'time'))
    truth_values = pd.Series(data=[1.0, 0.5, 0.5, 1.0, 0.5, 1.0],
                             index=truth_index)
    truth_values.sort_index(level='time', kind='mergesort', inplace=True)

    assert (feature_matrix[direct_agg_feat_name] == truth_values).all()
def test_mock_customer():
    es = load_mock_customer(return_entityset=True)
    fm, fl = dfs(entityset=es, target_entity="customers", max_depth=3)
    for feature in fl:
        assert feature.get_name() in fm.columns
def test_features_only(entities, relationships):
    features = dfs(entities=entities,
                   relationships=relationships,
                   target_entity="transactions",
                   features_only=True)
    assert len(features) > 0