def test_direct(es):
    """Check the graph source produced for a direct feature of a direct feature.

    ``engagement_level`` from ``customers`` is brought onto ``sessions`` and
    that feature is in turn brought onto ``log``.  The graph source must
    contain both join nodes, every connecting edge, and exactly one table per
    dataframe whose rows account for the expected labels.
    """
    inner = DirectFeature(
        IdentityFeature(es['customers'].ww['engagement_level']), 'sessions')
    outer = DirectFeature(inner, 'log')
    source = graph_feature(outer).source

    inner_name = inner.get_name()
    outer_name = outer.get_name()
    inner_join = '1_{}_join'.format(inner_name)
    outer_join = '0_{}_join'.format(outer_name)

    log_label = '\u2605 log (target)'
    sessions_label = 'sessions'
    customers_label = 'customers'

    expected_fragments = [
        inner_name,
        outer_name,
        inner_join,
        outer_join,
        log_label,
        sessions_label,
        customers_label,
        # edges from each join node to the key column it joins on
        '"{}" -> sessions:customer_id'.format(inner_join),
        '"{}" -> log:session_id'.format(outer_join),
        # edges feeding the input column into each join node
        'customers:engagement_level -> "{}"'.format(inner_join),
        'sessions:"{}" -> "{}"'.format(inner_name, outer_join),
        # edges from each join node to the feature it produces
        '"{}" -> sessions:"{}"'.format(inner_join, inner_name),
        '"{}" -> log:"{}"'.format(outer_join, outer_name),
    ]
    absent = [piece for piece in expected_fragments if piece not in source]
    assert not absent

    expected_rows = {
        'customers': [customers_label, 'engagement_level'],
        'sessions': [sessions_label, 'customer_id', inner_name],
        'log': [log_label, 'session_id', outer_name],
    }

    for table_name, remaining in expected_rows.items():
        pattern = r"{} \[label=<\n<TABLE.*?</TABLE>>".format(table_name)
        tables = re.findall(pattern, source, re.DOTALL)
        assert len(tables) == 1

        table_rows = re.findall(r"<TR.*?</TR>", tables[0], re.DOTALL)
        assert len(table_rows) == len(remaining)
        for table_row in table_rows:
            # Each row must use up the first still-unclaimed label it contains.
            hit = next(
                (label for label in remaining if label in table_row), None)
            assert hit is not None
            remaining.remove(hit)
Exemple #2
0
def test_make_dfeat_of_agg_feat_on_self(es):
    """
    Direct feature on customers of a customer count aggregated per region.

    Relationship sketch:

        R       R = Regions, a parent of customers
        |
        C       C = Customers, the dataframe being predicted on
        |
       etc.

    The Count is aggregated from C up to R and then brought back down to C;
    the value for instance 0 is expected to be 3.
    """
    count_per_region = ft.Feature(es["customers"].ww["id"],
                                  parent_dataframe_name="régions",
                                  primitive=Count)
    direct_count = DirectFeature(count_per_region,
                                 child_dataframe_name="customers")

    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=FeatureSet([direct_count]))
    result = to_pandas(calculator.run(np.array([0])), index="id")

    column = direct_count.get_name()
    assert result[column].values[0] == 3
def test_dfs_builds_on_seed_features_more_than_max_depth(es):
    """DFS builds one level on top of seed features but no deeper.

    With ``max_depth=1`` the seed Count feature and a Last aggregation of the
    seed Hour feature must be produced, while a direct feature of a further
    Mode aggregation must not.
    """
    seed_feature_sessions = ft.Feature(es['log']["id"],
                                       parent_entity=es['sessions'],
                                       primitive=Count)
    seed_feature_log = ft.Feature(es['log']['datetime'], primitive=Hour)
    session_agg = ft.Feature(seed_feature_log,
                             parent_entity=es['sessions'],
                             primitive=Last)

    # This feature sits two levels above session_agg (the feature built on
    # the seed), which exceeds max_depth, so DFS should leave it out.
    customers_mode = ft.Feature(session_agg,
                                parent_entity=es['customers'],
                                primitive=Mode)
    session_agg_trans = DirectFeature(customers_mode, es['sessions'])

    synthesis = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[Last, Count],
        trans_primitives=[],
        max_depth=1,
        seed_features=[seed_feature_sessions, seed_feature_log])
    built_names = [built.get_name() for built in synthesis.build_features()]

    assert seed_feature_sessions.get_name() in built_names
    assert session_agg.get_name() in built_names
    assert session_agg_trans.get_name() not in built_names
Exemple #4
0
def test_make_dfeat_of_agg_feat_through_parent(es, backend):
    """
    Direct feature on customers of a store count aggregated per region.

    Relationship sketch:

        R       R = Regions, a parent of customers
       / \\     S = Stores, a child of regions
      S   C     C = Customers, the entity being predicted on
          |
         etc.

    The Count is aggregated from S up to R and then brought down to C;
    the value for instance 0 is expected to be 3.
    """
    stores_per_region = ft.Feature(IdentityFeature(es['stores']['id']),
                                   parent_entity=es[u'régions'],
                                   primitive=Count)
    direct_store_count = DirectFeature(stores_per_region,
                                       child_entity=es['customers'])

    calculator = backend([direct_store_count])
    result = calculator.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    assert result[direct_store_count.get_name()][0] == 3
def test_make_dfeat_of_agg_feat_on_self(es):
    """
    Direct feature on customers of a customer count aggregated per region.

    Relationship sketch:

        R       R = Regions, a parent of customers
        |
        C       C = Customers, the entity being predicted on
        |
       etc.

    The Count is aggregated from C up to R and then brought back down to C;
    the value for instance 0 is expected to be 3.
    """
    count_per_region = ft.Feature(es['customers']['id'],
                                  parent_entity=es[u'régions'],
                                  primitive=Count)
    direct_count = DirectFeature(count_per_region,
                                 child_entity=es['customers'])

    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=FeatureSet([direct_count]))
    result = calculator.run(np.array([0]))
    assert result[direct_count.get_name()][0] == 3
Exemple #6
0
def test_make_dfeat_of_agg_feat_through_parent(es):
    """
    Direct feature on customers of a store count aggregated per region.

    Relationship sketch:

        R       R = Regions, a parent of customers
       / \\     S = Stores, a child of regions
      S   C     C = Customers, the entity being predicted on
          |
         etc.

    The Count is aggregated from S up to R and then brought down to C;
    the value for instance 0 is expected to be 3.
    """
    stores_per_region = ft.Feature(IdentityFeature(es['stores']['id']),
                                   parent_entity=es[u'régions'],
                                   primitive=Count)
    direct_store_count = DirectFeature(stores_per_region,
                                       child_entity=es['customers'])

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([direct_store_count]))
    result = to_pandas(calculator.run(np.array([0])), index='id')

    column = direct_store_count.get_name()
    assert result[column].values[0] == 3
Exemple #7
0
def test_make_dfeat(es, backend):
    """Customer age brought onto sessions is 33 for session 0."""
    direct_age = DirectFeature(es['customers']['age'],
                               child_entity=es['sessions'])

    calculator = backend([direct_age])
    result = calculator.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    assert result[direct_age.get_name()][0] == 33
Exemple #8
0
def test_direct_from_identity(es):
    """Session device_type brought onto log yields [0, 1] for rows 0 and 5."""
    direct_device = DirectFeature(base_feature=es['sessions']['device_type'],
                                  child_entity=es['log'])

    calculator = PandasBackend(es, [direct_device])
    result = calculator.calculate_all_features(instance_ids=[0, 5],
                                               time_last=None)
    values = result[direct_device.get_name()].tolist()
    assert values == [0, 1]
Exemple #9
0
def test_direct_rename(es):
    """A renamed direct feature differs in hash and name only.

    The base feature's generated name and the entity must be unchanged.
    """
    # Built from the same column and child entity as test_direct_from_identity.
    original = DirectFeature(base_feature=es['sessions']['device_type'],
                             child_entity=es['log'])
    renamed = original.rename("session_test")

    assert original.hash() != renamed.hash()
    assert original.get_name() != renamed.get_name()

    original_base = original.base_features[0]
    renamed_base = renamed.base_features[0]
    assert original_base.generate_name() == renamed_base.generate_name()
    assert original.entity == renamed.entity
def test_direct_from_identity(es):
    """Session device_type brought onto log yields [0, 1] for rows 0 and 5."""
    direct_device = DirectFeature(base_feature=es['sessions']['device_type'],
                                  child_entity=es['log'])

    calculator = FeatureSetCalculator(es,
                                      feature_set=FeatureSet([direct_device]),
                                      time_last=None)
    result = calculator.run([0, 5])
    values = result[direct_device.get_name()].tolist()
    assert values == [0, 1]
def test_direct_rename_multioutput(es):
    """Renaming a direct feature of a multi-output aggregation.

    The unique name and display name must change; the base feature's
    generated name and the entity must not.
    """
    most_common = ft.Feature(es['log']['product_id'],
                             parent_entity=es['customers'],
                             primitive=NMostCommon(n=2))
    original = DirectFeature(most_common, es['sessions'])
    renamed = original.rename("session_test")

    assert original.unique_name() != renamed.unique_name()
    assert original.get_name() != renamed.get_name()

    original_base = original.base_features[0]
    renamed_base = renamed.base_features[0]
    assert original_base.generate_name() == renamed_base.generate_name()
    assert original.entity == renamed.entity
def test_make_dfeat(es):
    """Customer age brought onto sessions is 33 for session 0."""
    direct_age = DirectFeature(es['customers']['age'],
                               child_entity=es['sessions'])

    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=FeatureSet([direct_age]))
    result = calculator.run(np.array([0]))
    assert result[direct_age.get_name()][0] == 33
def test_direct_rename(es):
    """A renamed direct feature differs in unique name and name only.

    The base feature's generated name and the dataframe name must be
    unchanged.
    """
    # Built from the same column and child dataframe as
    # test_direct_from_identity.
    device = IdentityFeature(es['sessions'].ww['device_type'])
    original = DirectFeature(base_feature=device, child_dataframe_name='log')
    renamed = original.rename("session_test")

    assert original.unique_name() != renamed.unique_name()
    assert original.get_name() != renamed.get_name()

    original_base = original.base_features[0]
    renamed_base = renamed.base_features[0]
    assert original_base.generate_name() == renamed_base.generate_name()
    assert original.dataframe_name == renamed.dataframe_name
def test_direct_rename_multioutput(es):
    """Renaming a direct feature of a multi-output aggregation.

    The unique name and display name must change; the base feature's
    generated name and the dataframe name must not.
    """
    most_common = Feature(es['log'].ww['product_id'],
                          parent_dataframe_name='customers',
                          primitive=NMostCommon(n=2))
    original = DirectFeature(most_common, 'sessions')
    renamed = original.rename("session_test")

    assert original.unique_name() != renamed.unique_name()
    assert original.get_name() != renamed.get_name()

    original_base = original.base_features[0]
    renamed_base = renamed.base_features[0]
    assert original_base.generate_name() == renamed_base.generate_name()
    assert original.dataframe_name == renamed.dataframe_name
def test_direct_from_variable(es):
    """A direct feature built straight from a variable yields [0, 1].

    The session ``device_type`` variable is passed as the base feature and
    evaluated on log rows 0 and 5.
    """
    # Expected values match those asserted in test_direct_from_identity.
    direct_device = DirectFeature(base_feature=es['sessions']['device_type'],
                                  child_entity=es['log'])

    calculator = FeatureSetCalculator(es,
                                      feature_set=FeatureSet([direct_device]),
                                      time_last=None)
    result = calculator.run(np.array([0, 5]))
    values = result[direct_device.get_name()].tolist()
    assert values == [0, 1]
def test_make_dfeat(es):
    """Customer age brought onto sessions is 33 for session 0."""
    age = ft.Feature(es['customers'].ww['age'])
    direct_age = DirectFeature(age, child_dataframe_name='sessions')

    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=FeatureSet([direct_age]))
    raw_result = calculator.run(np.array([0]))
    result = to_pandas(raw_result)

    assert result[direct_age.get_name()][0] == 33
def test_groupby_transform_direct_groupby(es):
    """Check the graph source of a groupby transform grouped by a direct feature.

    ``cohort_name`` from ``cohorts`` is brought onto ``customers`` and used as
    the groupby for a ``CumMax`` of customer ``age``.  The graph source must
    contain the join, groupby and primitive nodes, every connecting edge, and
    exactly one table per dataframe whose rows account for the expected
    labels.
    """
    cohort_direct = DirectFeature(
        IdentityFeature(es['cohorts'].ww['cohort_name']), 'customers')
    cum_max_age = GroupByTransformFeature(
        IdentityFeature(es['customers'].ww['age']), CumMax, cohort_direct)
    source = graph_feature(cum_max_age).source

    direct_name = cohort_direct.get_name()
    transform_name = cum_max_age.get_name()
    join_id = '1_{}_join'.format(direct_name)
    primitive_id = "0_{}_cum_max".format(transform_name)
    groupby_id = '{}_groupby_customers--{}'.format(transform_name,
                                                   direct_name)
    customers_label = '\u2605 customers (target)'
    cohorts_label = 'cohorts'

    expected_fragments = [
        direct_name,
        transform_name,
        join_id,
        primitive_id,
        groupby_id,
        customers_label,
        cohorts_label,
        # edges around the join node
        '"{}" -> customers:cohort'.format(join_id),
        'cohorts:cohort_name -> "{}"'.format(join_id),
        '"{}" -> customers:"{}"'.format(join_id, direct_name),
        # edges into the groupby node
        'customers:"{}" -> "{}"'.format(direct_name, groupby_id),
        'customers:age -> "{}"'.format(groupby_id),
        # groupby node -> primitive node -> resulting feature
        '"{}" -> "{}"'.format(groupby_id, primitive_id),
        '"{}" -> customers:"{}"'.format(primitive_id, transform_name),
    ]
    absent = [piece for piece in expected_fragments if piece not in source]
    assert not absent

    expected_rows = {
        'cohorts': [cohorts_label, 'cohort_name'],
        'customers': [
            customers_label, 'cohort', 'age', direct_name, transform_name
        ],
    }
    for table_name, remaining in expected_rows.items():
        pattern = r"{} \[label=<\n<TABLE.*?</TABLE>>".format(table_name)
        tables = re.findall(pattern, source, re.DOTALL)
        assert len(tables) == 1

        table_rows = re.findall(r"<TR.*?</TR>", tables[0], re.DOTALL)
        assert len(table_rows) == len(remaining)

        for table_row in table_rows:
            # Each row must use up the first still-unclaimed label it contains.
            hit = next(
                (label for label in remaining if label in table_row), None)
            assert hit is not None
            remaining.remove(hit)
Exemple #18
0
def test_direct_rename_multioutput(es):
    """Renaming a direct feature of a multi-output aggregation.

    The unique name and display name must change; the base feature's
    generated name and the dataframe name must not.
    """
    most_common = Feature(es["log"].ww["product_id"],
                          parent_dataframe_name="customers",
                          primitive=NMostCommon(n=2))
    original = DirectFeature(most_common, "sessions")
    renamed = original.rename("session_test")

    assert original.unique_name() != renamed.unique_name()
    assert original.get_name() != renamed.get_name()

    original_base = original.base_features[0]
    renamed_base = renamed.base_features[0]
    assert original_base.generate_name() == renamed_base.generate_name()
    assert original.dataframe_name == renamed.dataframe_name
def test_direct_from_identity(es):
    """Session device_type brought onto log yields [0, 1] for rows 0 and 5.

    A dask result is computed, indexed by ``id`` and sorted before the values
    are compared.
    """
    direct_device = DirectFeature(base_feature=es['sessions']['device_type'],
                                  child_entity=es['log'])

    calculator = FeatureSetCalculator(es,
                                      feature_set=FeatureSet([direct_device]),
                                      time_last=None)
    result = calculator.run(np.array([0, 5]))
    if isinstance(result, dd.DataFrame):
        computed = result.compute()
        result = computed.set_index('id').sort_index()

    values = result[direct_device.get_name()].tolist()
    assert values == [0, 1]
def test_direct_with_multiple_possible_paths(games_es):
    """A direct feature over an ambiguous path needs an explicit relationship.

    Building a direct feature of ``teams.name`` on ``games`` without naming a
    relationship is expected to raise ``RuntimeError``.  Passing the
    relationship whose child column is ``home_team_id`` succeeds, and that
    column appears in the relationship path name and the feature name.
    """
    # Function-scope import so the test does not rely on a module-level one.
    import re

    error_text = "There are multiple relationships to the base dataframe. " \
                 "You must specify a relationship."
    # ``match`` is treated as a regular expression; escape the message so its
    # "." characters are matched literally instead of as wildcards.
    with pytest.raises(RuntimeError, match=re.escape(error_text)):
        DirectFeature(IdentityFeature(games_es['teams'].ww['name']), 'games')

    # Does not raise if path specified.
    relationship = next(r for r in games_es.get_forward_relationships('games')
                        if r._child_column_name == 'home_team_id')
    feat = DirectFeature(IdentityFeature(games_es['teams'].ww['name']),
                         'games',
                         relationship=relationship)
    assert feat.relationship_path_name() == 'teams[home_team_id]'
    assert feat.get_name() == 'teams[home_team_id].name'
Exemple #21
0
def test_serialization(es):
    """Round-trip a direct feature through its argument dictionary.

    ``get_arguments`` must equal the hand-built dictionary, and
    ``from_dictionary`` must rebuild a feature equal to the original.
    """
    rating = ft.IdentityFeature(es["products"].ww["rating"])
    direct = DirectFeature(rating, "log")

    # First forward relationship from log whose parent dataframe is products.
    log_to_products = next(
        filter(lambda rel: rel.parent_dataframe.ww.name == "products",
               es.get_forward_relationships("log")))

    dictionary = dict(
        name=direct.get_name(),
        base_feature=rating.unique_name(),
        relationship=log_to_products.to_dictionary(),
    )
    assert dictionary == direct.get_arguments()

    dependencies = {rating.unique_name(): rating}
    restored = DirectFeature.from_dictionary(dictionary, es, dependencies,
                                             PrimitivesDeserializer())
    assert direct == restored
Exemple #22
0
def test_direct_with_multiple_possible_paths(games_es):
    """A direct feature over an ambiguous path needs an explicit relationship.

    Building a direct feature of ``teams.name`` on ``games`` without naming a
    relationship is expected to raise ``RuntimeError``.  Passing the
    relationship whose child column is ``home_team_id`` succeeds, and that
    column appears in the relationship path name and the feature name.
    """
    # Function-scope import so the test does not rely on a module-level one.
    import re

    error_text = ("There are multiple relationships to the base dataframe. "
                  "You must specify a relationship.")
    # ``match`` is treated as a regular expression; escape the message so its
    # "." characters are matched literally instead of as wildcards.
    with pytest.raises(RuntimeError, match=re.escape(error_text)):
        DirectFeature(IdentityFeature(games_es["teams"].ww["name"]), "games")

    # Does not raise if path specified.
    relationship = next(r for r in games_es.get_forward_relationships("games")
                        if r._child_column_name == "home_team_id")
    feat = DirectFeature(
        IdentityFeature(games_es["teams"].ww["name"]),
        "games",
        relationship=relationship,
    )
    assert feat.relationship_path_name() == "teams[home_team_id]"
    assert feat.get_name() == "teams[home_team_id].name"
Exemple #23
0
def test_direct_from_identity(es):
    """Session device_type brought onto log, evaluated for rows 0 and 5.

    The expected values are the strings "0" and "1" when the entityset's
    dataframe type is Spark, and the integers 0 and 1 otherwise.
    """
    direct_device = DirectFeature(
        base_feature=Feature(es["sessions"].ww["device_type"]),
        child_dataframe_name="log")

    calculator = FeatureSetCalculator(es,
                                      feature_set=FeatureSet([direct_device]),
                                      time_last=None)
    raw_result = calculator.run(np.array([0, 5]))
    result = to_pandas(raw_result, index="id", sort_index=True)
    values = result[direct_device.get_name()].tolist()

    is_spark = es.dataframe_type == Library.SPARK.value
    expected = ["0", "1"] if is_spark else [0, 1]
    assert values == expected
def test_direct_from_identity(es):
    """Session device_type brought onto log, evaluated for rows 0 and 5.

    The expected values are the strings '0' and '1' when the entityset's
    dataframe type is Koalas, and the integers 0 and 1 otherwise.
    """
    direct_device = DirectFeature(
        base_feature=Feature(es['sessions'].ww['device_type']),
        child_dataframe_name='log')

    calculator = FeatureSetCalculator(es,
                                      feature_set=FeatureSet([direct_device]),
                                      time_last=None)
    raw_result = calculator.run(np.array([0, 5]))
    result = to_pandas(raw_result, index='id', sort_index=True)
    values = result[direct_device.get_name()].tolist()

    is_koalas = es.dataframe_type == Library.KOALAS.value
    expected = ['0', '1'] if is_koalas else [0, 1]
    assert values == expected
def test_make_dfeat_of_agg_feat_on_self(entityset, backend):
    """
    Direct feature on customers of a customer count aggregated per region.

    Relationship sketch:

        R       R = Regions, a parent of customers
        |
        C       C = Customers, the entity being predicted on
        |
       etc.

    The Count is aggregated from C up to R and then brought back down to C;
    the value for instance 0 is expected to be 3.
    """
    count_per_region = ft.Feature(entityset['customers']['id'],
                                  parent_entity=entityset[u'régions'],
                                  primitive=Count)
    direct_count = DirectFeature(count_per_region,
                                 child_entity=entityset['customers'])

    calculator = backend([direct_count])
    result = calculator.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    assert result[direct_count.get_name()][0] == 3
Exemple #26
0
def test_direct_with_single_possible_path(es):
    """Check the path and feature names of customer age brought onto sessions."""
    age = IdentityFeature(es["customers"].ww["age"])
    direct_age = DirectFeature(age, "sessions")

    assert direct_age.relationship_path_name() == "customers"
    assert direct_age.get_name() == "customers.age"
def test_direct_with_single_possible_path(es):
    """Check the path and feature names of customer age brought onto sessions."""
    age = IdentityFeature(es['customers'].ww['age'])
    direct_age = DirectFeature(age, 'sessions')

    assert direct_age.relationship_path_name() == 'customers'
    assert direct_age.get_name() == 'customers.age'