Example 1
def test_inplace_encodes_features(es):
    f1 = IdentityFeature(es["log"]["product_id"])

    features = [f1]
    feature_matrix = calculate_feature_matrix(features, es, instance_ids=[0, 1, 2, 3, 4, 5])

    feature_matrix_shape = feature_matrix.shape
    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features)
    assert feature_matrix_encoded.shape != feature_matrix_shape
    assert feature_matrix.shape == feature_matrix_shape

    # with inplace=True, feature_matrix itself is encoded, so the shapes match
    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features, inplace=True)
    assert feature_matrix_encoded.shape == feature_matrix.shape
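
For intuition: encode_features one-hot expands each categorical feature column
(capped by top_n, with an "is unknown" catch-all, as Example 19 shows), which
is why the encoded matrix gains columns. A rough pandas sketch of the core
expansion, using hypothetical data rather than the es fixture:

import pandas as pd

fm = pd.DataFrame({"product_id": ["coke zero", "car", "coke zero"]})
# pd.get_dummies is the closest pandas analogue of the one-hot step
encoded = pd.get_dummies(fm, columns=["product_id"])
print(encoded.columns.tolist())  # ['product_id_car', 'product_id_coke zero']
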
def test_to_encode_features(es):
    f1 = IdentityFeature(es["log"]["product_id"])
    f2 = IdentityFeature(es["log"]["value"])

    features = [f1, f2]
    feature_matrix = calculate_feature_matrix(features,
                                              es,
                                              instance_ids=[0, 1, 2, 3, 4, 5])

    feature_matrix_encoded, features_encoded = encode_features(
        feature_matrix, features)
    feature_matrix_encoded_shape = feature_matrix_encoded.shape

    # with an empty to_encode, nothing is encoded: product_id stays a single string column
    to_encode = []
    feature_matrix_encoded, features_encoded = encode_features(
        feature_matrix, features, to_encode=to_encode)
    assert feature_matrix_encoded_shape != feature_matrix_encoded.shape

    to_encode = ['value']
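    # 'value' is numeric, so listing it in to_encode adds no dummy columns;
    # the shape therefore still differs from the fully encoded shape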
    feature_matrix_encoded, features_encoded = encode_features(
        feature_matrix, features, to_encode=to_encode)
    assert feature_matrix_encoded_shape != feature_matrix_encoded.shape
def test_direct(es):
    d1 = DirectFeature(IdentityFeature(es['customers'].ww['engagement_level']),
                       'sessions')
    d2 = DirectFeature(d1, 'log')
    graph = graph_feature(d2).source

    d1_name = d1.get_name()
    d2_name = d2.get_name()
    prim_node1 = '1_{}_join'.format(d1_name)
    prim_node2 = '0_{}_join'.format(d2_name)

    log_table = '\u2605 log (target)'
    sessions_table = 'sessions'
    customers_table = 'customers'
    groupby_edge1 = '"{}" -> sessions:customer_id'.format(prim_node1)
    groupby_edge2 = '"{}" -> log:session_id'.format(prim_node2)
    groupby_input1 = 'customers:engagement_level -> "{}"'.format(prim_node1)
    groupby_input2 = 'sessions:"{}" -> "{}"'.format(d1_name, prim_node2)
    d1_edge = '"{}" -> sessions:"{}"'.format(prim_node1, d1_name)
    d2_edge = '"{}" -> log:"{}"'.format(prim_node2, d2_name)

    graph_components = [
        d1_name, d2_name, prim_node1, prim_node2, log_table, sessions_table,
        customers_table, groupby_edge1, groupby_edge2, groupby_input1,
        groupby_input2, d1_edge, d2_edge
    ]
    for component in graph_components:
        assert component in graph

    dataframes = {
        'customers': [customers_table, 'engagement_level'],
        'sessions': [sessions_table, 'customer_id', d1_name],
        'log': [log_table, 'session_id', d2_name]
    }

    for dataframe in dataframes:
        regex = r"{} \[label=<\n<TABLE.*?</TABLE>>".format(dataframe)
        matches = re.findall(regex, graph, re.DOTALL)
        assert len(matches) == 1

        rows = re.findall(r"<TR.*?</TR>", matches[0], re.DOTALL)
        assert len(rows) == len(dataframes[dataframe])
        for row in rows:
            matched = False
            for i in dataframes[dataframe]:
                if i in row:
                    matched = True
                    dataframes[dataframe].remove(i)
                    break
            assert matched
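
The assertions above string-match graphviz DOT source. A minimal sketch of
producing comparable source with the graphviz package (a hypothetical
two-table graph, not graph_feature's actual output):

from graphviz import Digraph

g = Digraph()
# HTML-like labels carry the <TABLE> markup the regexes above scan for
g.node("customers", label="<<TABLE><TR><TD>customers</TD></TR></TABLE>>")
g.node("sessions", label="<<TABLE><TR><TD>sessions</TD></TR></TABLE>>")
g.edge("customers", "sessions")
print(g.source)  # DOT text with lines like: customers -> sessions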
Example 4
def test_copy_features_does_not_copy_entityset(es):
    agg = ft.Feature(es['log']['value'],
                     parent_entity=es['sessions'],
                     primitive=Sum)
    agg_where = ft.Feature(es['log']['value'],
                           parent_entity=es['sessions'],
                           where=IdentityFeature(es['log']['value']) == 2,
                           primitive=Sum)
    agg_use_previous = ft.Feature(es['log']['value'],
                                  parent_entity=es['sessions'],
                                  use_previous='4 days',
                                  primitive=Sum)
    agg_use_previous_where = ft.Feature(es['log']['value'],
                                        parent_entity=es['sessions'],
                                        where=IdentityFeature(
                                            es['log']['value']) == 2,
                                        use_previous='4 days',
                                        primitive=Sum)
    features = [agg, agg_where, agg_use_previous, agg_use_previous_where]
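    # asizeof (presumably pympler's) deep-measures everything reachable from
    # locals(); if copy() also duplicated the EntitySet held by each feature,
    # memory use would far more than double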
    in_memory_size = asizeof(locals())
    copied = [f.copy() for f in features]
    new_in_memory_size = asizeof(locals())
    assert new_in_memory_size < 2 * in_memory_size
Example 5
def test_aggregation(es):
    feat = AggregationFeature(IdentityFeature(es["log"].ww["id"]), "sessions", Count)
    graph = graph_feature(feat).source

    feat_name = feat.get_name()
    prim_node = "0_{}_count".format(feat_name)
    groupby_node = "{}_groupby_log--session_id".format(feat_name)

    sessions_table = "\u2605 sessions (target)"
    log_table = "log"
    groupby_edge = 'log:session_id -> "{}"'.format(groupby_node)
    groupby_input = 'log:id -> "{}"'.format(groupby_node)
    prim_input = '"{}" -> "{}"'.format(groupby_node, prim_node)
    feat_edge = '"{}" -> sessions:"{}"'.format(prim_node, feat_name)

    graph_components = [
        feat_name,
        prim_node,
        groupby_node,
        sessions_table,
        log_table,
        groupby_edge,
        groupby_input,
        prim_input,
        feat_edge,
    ]

    for component in graph_components:
        assert component in graph

    dataframes = {
        "log": [log_table, "id", "session_id"],
        "sessions": [sessions_table, feat_name],
    }
    for dataframe in dataframes:
        regex = r"{} \[label=<\n<TABLE.*?</TABLE>>".format(dataframe)
        matches = re.findall(regex, graph, re.DOTALL)
        assert len(matches) == 1

        rows = re.findall(r"<TR.*?</TR>", matches[0], re.DOTALL)
        assert len(rows) == len(dataframes[dataframe])
        for row in rows:
            matched = False
            for i in dataframes[dataframe]:
                if i in row:
                    matched = True
                    dataframes[dataframe].remove(i)
                    break
            assert matched
Example 6
def test_make_agg_feat_where_count(es):
    agg_feat = ft.Feature(es['log']['id'],
                          parent_entity=es['sessions'],
                          where=IdentityFeature(
                              es['log']['product_id']) == 'coke zero',
                          primitive=Count)

    feature_set = FeatureSet([agg_feat])
    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = to_pandas(calculator.run(np.array([0])))

    v = df[agg_feat.get_name()][0]
    assert v == 3
Example 7
def test_make_agg_feat_where_count(es):
    agg_feat = ft.Feature(
        es["log"].ww["id"],
        parent_dataframe_name="sessions",
        where=IdentityFeature(es["log"].ww["product_id"]) == "coke zero",
        primitive=Count,
    )

    feature_set = FeatureSet([agg_feat])
    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = to_pandas(calculator.run(np.array([0])))

    v = df[agg_feat.get_name()][0]
    assert v == 3
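
The where clause filters child rows before aggregating. A rough pandas
equivalent of COUNT(log.id WHERE product_id = 'coke zero') per session, on a
hypothetical frame rather than the es fixture:

import pandas as pd

log = pd.DataFrame({
    "session_id": [0, 0, 0, 1],
    "product_id": ["coke zero", "coke zero", "coke zero", "car"],
})
counts = log[log["product_id"] == "coke zero"].groupby("session_id").size()
assert counts[0] == 3  # mirrors the v == 3 assertion above
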
def test_to_dictionary_direct(es):
    actual = ft.Feature(IdentityFeature(es["sessions"].ww["customer_id"]), "log").to_dictionary()

    expected = {
        'type': 'DirectFeature',
        'dependencies': ['sessions: customer_id'],
        'arguments': {'name': None,
                      'base_feature': 'sessions: customer_id',
                      'relationship': {'parent_dataframe_name': 'sessions',
                                       'child_dataframe_name': 'log',
                                       'parent_column_name': 'id',
                                       'child_column_name': 'session_id'}
                      }
    }

    assert expected == actual
Example 9
def test_direct_of_multi_output_transform_feat(es):
    # TODO: Update to work with Dask and Spark
    if es.dataframe_type != Library.PANDAS.value:
        pytest.xfail("Custom primitive is not compatible with Dask or Spark")

    class TestTime(TransformPrimitive):
        name = "test_time"
        input_types = [ColumnSchema(logical_type=Datetime)]
        return_type = ColumnSchema(semantic_tags={"numeric"})
        number_output_features = 6

        def get_function(self):
            def test_f(x):
                times = pd.Series(x)
                units = ["year", "month", "day", "hour", "minute", "second"]
                return [
                    times.apply(lambda t: getattr(t, unit)) for unit in units
                ]

            return test_f

    base_feature = IdentityFeature(es["customers"].ww["signup_date"])
    join_time_split = Feature(base_feature, primitive=TestTime)
    alt_features = [
        Feature(base_feature, primitive=Year),
        Feature(base_feature, primitive=Month),
        Feature(base_feature, primitive=Day),
        Feature(base_feature, primitive=Hour),
        Feature(base_feature, primitive=Minute),
        Feature(base_feature, primitive=Second),
    ]
    fm, fl = dfs(
        entityset=es,
        target_dataframe_name="sessions",
        trans_primitives=[TestTime, Year, Month, Day, Hour, Minute, Second],
    )

    # Get column names for the multi-output feature and the individual features
    subnames = DirectFeature(join_time_split, "sessions").get_feature_names()
    altnames = [DirectFeature(f, "sessions").get_name() for f in alt_features]

    # Check values are equal between
    for col1, col2 in zip(subnames, altnames):
        assert (fm[col1] == fm[col2]).all()
def test_multioutput(es):
    multioutput = AggregationFeature(IdentityFeature(es['log'].ww['zipcode']),
                                     'sessions', NMostCommon)
    feat = FeatureOutputSlice(multioutput, 0)
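    # FeatureOutputSlice selects a single output of a multi-output feature;
    # indexing the feature (multioutput[0]) is equivalent shorthand, as
    # Example 26 shows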
    graph = graph_feature(feat).source

    feat_name = feat.get_name()
    prim_node = '0_{}_n_most_common'.format(multioutput.get_name())
    groupby_node = '{}_groupby_log--session_id'.format(multioutput.get_name())

    sessions_table = '\u2605 sessions (target)'
    log_table = 'log'
    groupby_edge = 'log:session_id -> "{}"'.format(groupby_node)
    groupby_input = 'log:zipcode -> "{}"'.format(groupby_node)
    prim_input = '"{}" -> "{}"'.format(groupby_node, prim_node)
    feat_edge = '"{}" -> sessions:"{}"'.format(prim_node, feat_name)

    graph_components = [
        feat_name, prim_node, groupby_node, sessions_table, log_table,
        groupby_edge, groupby_input, prim_input, feat_edge
    ]

    for component in graph_components:
        assert component in graph

    dataframes = {
        'log': [log_table, 'zipcode', 'session_id'],
        'sessions': [sessions_table, feat_name]
    }
    for dataframe in dataframes:
        regex = r"{} \[label=<\n<TABLE.*?</TABLE>>".format(dataframe)
        matches = re.findall(regex, graph, re.DOTALL)
        assert len(matches) == 1

        rows = re.findall(r"<TR.*?</TR>", matches[0], re.DOTALL)
        assert len(rows) == len(dataframes[dataframe])
        for row in rows:
            matched = False
            for i in dataframes[dataframe]:
                if i in row:
                    matched = True
                    dataframes[dataframe].remove(i)
                    break
            assert matched
Example 11
def test_to_dictionary_direct(es):
    actual = ft.Feature(
        IdentityFeature(es["sessions"].ww["customer_id"]), "log"
    ).to_dictionary()

    expected = {
        "type": "DirectFeature",
        "dependencies": ["sessions: customer_id"],
        "arguments": {
            "name": "sessions.customer_id",
            "base_feature": "sessions: customer_id",
            "relationship": {
                "parent_dataframe_name": "sessions",
                "child_dataframe_name": "log",
                "parent_column_name": "id",
                "child_column_name": "session_id",
            },
        },
    }

    assert expected == actual
Example 12
def test_make_agg_feat_multiple_dtypes(entityset, backend):
    compare_prod = IdentityFeature(entityset['log']['product_id']) == 'coke zero'

    agg_feat = ft.Feature(entityset['log']['id'],
                          parent_entity=entityset['sessions'],
                          where=compare_prod,
                          primitive=Count)

    agg_feat2 = ft.Feature(entityset['log']['product_id'],
                           parent_entity=entityset['sessions'],
                           where=compare_prod,
                           primitive=Mode)

    pandas_backend = backend([agg_feat, agg_feat2])
    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=None)

    v = df[agg_feat.get_name()][0]
    v2 = df[agg_feat2.get_name()][0]
    assert v == 3
    assert v2 == 'coke zero'
Example 13
def test_make_agg_feat_where_count_or_device_type_feat(entityset, backend):
    """
    Feature we're creating is:
    Number of sessions for each customer where the
    number of logs in the session is greater than 1
    or the device type is 1
    """
    log_count_feat = ft.Feature(entityset['log']['id'], parent_entity=entityset['sessions'], primitive=Count)

    compare_count = log_count_feat > 1
    compare_device_type = IdentityFeature(entityset['sessions']['device_type']) == 1
    or_feat = compare_count.OR(compare_device_type)
    feat = ft.Feature(entityset['sessions']['id'],
                      parent_entity=entityset['customers'],
                      where=or_feat,
                      primitive=Count)

    pandas_backend = backend([feat])
    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    name = feat.get_name()
    instances = df[name]
    assert instances[0] == 3

    def _add_identity_features(self, all_features, entity):
        """converts all variables from the given entity into features

        Args:
            all_features (dict[Entity.id -> dict[str -> BaseFeature]]):
                Dict containing a dict for each entity. Each nested dict
                has features as values with their ids as keys.
            entity (Entity): Entity to calculate features for.
        """
        variables = entity.variables
        for v in variables:
            new_f = IdentityFeature(variable=v)
            self._handle_new_feature(all_features=all_features,
                                     new_feature=new_f)

        # add seed features, if any, for dfs to build on top of
        # if there are any multi output features, this will build on
        # top of each output of the feature.
        for f in self.seed_features:
            if f.entity.id == entity.id:
                self._handle_new_feature(all_features=all_features,
                                         new_feature=f)
def test_make_agg_feat_multiple_dtypes(es):
    compare_prod = IdentityFeature(es['log']['product_id']) == 'coke zero'

    agg_feat = ft.Feature(es['log']['id'],
                          parent_entity=es['sessions'],
                          where=compare_prod,
                          primitive=Count)

    agg_feat2 = ft.Feature(es['log']['product_id'],
                           parent_entity=es['sessions'],
                           where=compare_prod,
                           primitive=Mode)

    feature_set = FeatureSet([agg_feat, agg_feat2])
    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = calculator.run(np.array([0]))

    v = df[agg_feat.get_name()][0]
    v2 = df[agg_feat2.get_name()][0]
    assert v == 3
    assert v2 == 'coke zero'
Example 16
def test_make_dfeat_of_agg_feat_through_parent(entityset, backend):
    """
    The graph looks like this:

        R       C = Customers, the entity we're trying to predict on
       / \\     R = Regions, a parent of customers
      S   C     S = Stores, a child of regions
          |
         etc.

    We're trying to calculate a DFeat from C to R on an agg_feat of R on S.
    """
    store_id_feat = IdentityFeature(entityset['stores']['id'])

    store_count_feat = ft.Feature(store_id_feat, parent_entity=entityset[u'régions'], primitive=Count)

    num_stores_feat = DirectFeature(store_count_feat, child_entity=entityset['customers'])

    pandas_backend = backend([num_stores_feat])
    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    v = df[num_stores_feat.get_name()][0]
    assert v == 3

def test_make_agg_feat_where_count_or_device_type_feat(es):
    """
    Feature we're creating is:
    Number of sessions for each customer where the
    number of logs in the session is greater than 1
    or the device type is 1
    """
    log_count_feat = ft.Feature(es['log']['id'], parent_entity=es['sessions'], primitive=Count)

    compare_count = log_count_feat > 1
    compare_device_type = IdentityFeature(es['sessions']['device_type']) == 1
    or_feat = compare_count.OR(compare_device_type)
    feat = ft.Feature(es['sessions']['id'],
                      parent_entity=es['customers'],
                      where=or_feat,
                      primitive=Count)

    feature_set = FeatureSet([feat])
    calculator = FeatureSetCalculator(es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = calculator.run(np.array([0]))
    name = feat.get_name()
    instances = df[name]
    assert instances[0] == 3

    def _add_identity_features(self, all_features, dataframe):
        """converts all columns from the given dataframe into features

        Args:
            all_features (dict[dataframe name -> dict[str -> BaseFeature]]):
                Dict containing a dict for each dataframe. Each nested dict
                has features as values with their ids as keys.
            dataframe (DataFrame): DataFrame to calculate features for.
        """
        for col in dataframe.columns:
            ignored = self.ignore_columns[dataframe.ww.name]
            if col in ignored or col == LTI_COLUMN_NAME:
                continue
            new_f = IdentityFeature(self.es[dataframe.ww.name].ww[col])
            self._handle_new_feature(all_features=all_features,
                                     new_feature=new_f)

        # add seed features, if any, for dfs to build on top of
        # if there are any multi output features, this will build on
        # top of each output of the feature.
        for f in self.seed_features:
            if f.dataframe_name == dataframe.ww.name:
                self._handle_new_feature(all_features=all_features,
                                         new_feature=f)
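
For reference, a toy sketch of the registry shape this method fills in (names
illustrative; _handle_new_feature roughly keys each feature under its
dataframe by a unique name):

all_features = {
    "log": {},       # dataframe name -> {unique feature name -> feature}
    "sessions": {},
}
# roughly: all_features[feat.dataframe_name][feat.unique_name()] = feat
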
Example 19
def test_encode_features_handles_dictionary_input(es):
    f1 = IdentityFeature(es["log"]["product_id"])
    f2 = IdentityFeature(es["log"]["purchased"])
    f3 = IdentityFeature(es["log"]["session_id"])

    features = [f1, f2, f3]
    feature_matrix = calculate_feature_matrix(features, es, instance_ids=range(16))
    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features)
    true_values = ['product_id = coke zero', 'product_id = toothpaste', 'product_id = car',
                   'product_id = brown bag', 'product_id = taco clock', 'product_id = Haribo sugar-free gummy bears',
                   'product_id is unknown', 'purchased', 'session_id = 0', 'session_id = 1', 'session_id = 4',
                   'session_id = 3', 'session_id = 5', 'session_id = 2', 'session_id is unknown']
    assert len(features_encoded) == 15
    for col in true_values:
        assert col in list(feature_matrix_encoded.columns)

    top_n_dict = {}
    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features, top_n=top_n_dict)
    assert len(features_encoded) == 15
    for col in true_values:
        assert col in list(feature_matrix_encoded.columns)

    top_n_dict = {f1.get_name(): 4, f3.get_name(): 3}
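    # per-feature top_n: keep the 4 most frequent product_id values and the
    # 3 most frequent session_id values; each also gets an "is unknown" column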
    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features, top_n=top_n_dict)
    assert len(features_encoded) == 10
    true_values = ['product_id = coke zero', 'product_id = toothpaste', 'product_id = car',
                   'product_id = brown bag', 'product_id is unknown', 'purchased',
                   'session_id = 0', 'session_id = 1', 'session_id = 4', 'session_id is unknown']
    for col in true_values:
        assert col in list(feature_matrix_encoded.columns)

    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features, top_n=top_n_dict, include_unknown=False)
    true_values = ['product_id = coke zero', 'product_id = toothpaste', 'product_id = car',
                   'product_id = brown bag', 'purchased', 'session_id = 0', 'session_id = 1', 'session_id = 4']
    assert len(features_encoded) == 8
    for col in true_values:
        assert col in list(feature_matrix_encoded.columns)
Example 20
def test_variable_description(es):
    variable_description = 'the name of the device used for each session'
    es['sessions']['device_name'].description = variable_description
    identity_feat = IdentityFeature(es['sessions']['device_name'])
    assert describe_feature(identity_feat) == (
        variable_description[0].upper() + variable_description[1:] + '.')
Example 21
def test_return_type_inference_direct_feature(es):
    mode = ft.Feature(es["log"].ww["priority_level"], parent_dataframe_name="customers", primitive=Mode)
    mode_session = ft.Feature(mode, "sessions")
    assert mode_session.column_schema == IdentityFeature(es["log"].ww["priority_level"]).column_schema
def test_aggregation_description_use_previous(es):
    feature = AggregationFeature(IdentityFeature(es['log'].ww['value']), 'sessions',
                                 Mean, use_previous='5d')
    description = 'The average of the "value" of the previous 5 days of "log" for each "id" in "sessions".'

    assert describe_feature(feature) == description
def test_groupby_transform_description(es):
    feature = GroupByTransformFeature(IdentityFeature(es['log'].ww['value']), CumMean, IdentityFeature(es['log'].ww['session_id']))
    description = 'The cumulative mean of the "value" for each "session_id".'

    assert describe_feature(feature) == description
def test_transform_description(es):
    feature = TransformFeature(IdentityFeature(es['log'].ww['value']), Absolute)
    description = 'The absolute value of the "value".'
    assert describe_feature(feature) == description
def test_direct_with_single_possible_path(es):
    feat = DirectFeature(IdentityFeature(es['customers'].ww['age']),
                         'sessions')
    assert feat.relationship_path_name() == 'customers'
    assert feat.get_name() == 'customers.age'
Example 26
def test_multioutput_description(es):
    n_most_common = NMostCommon(2)
    n_most_common_feature = AggregationFeature(
        IdentityFeature(es["log"].ww["zipcode"]), "sessions", n_most_common
    )
    first_most_common_slice = n_most_common_feature[0]
    second_most_common_slice = n_most_common_feature[1]

    n_most_common_base = 'The 2 most common values of the "zipcode" of all instances of "log" for each "id" in "sessions".'
    n_most_common_first = (
        'The most common value of the "zipcode" of all instances of "log" '
        'for each "id" in "sessions".'
    )
    n_most_common_second = (
        'The 2nd most common value of the "zipcode" of all instances of '
        '"log" for each "id" in "sessions".'
    )

    assert describe_feature(n_most_common_feature) == n_most_common_base
    assert describe_feature(first_most_common_slice) == n_most_common_first
    assert describe_feature(second_most_common_slice) == n_most_common_second

    class CustomMultiOutput(TransformPrimitive):
        name = "custom_multioutput"
        input_types = [ColumnSchema(semantic_tags={"category"})]
        return_type = ColumnSchema(semantic_tags={"category"})

        number_output_features = 4

    custom_feat = TransformFeature(
        IdentityFeature(es["log"].ww["zipcode"]), CustomMultiOutput
    )

    generic_base = 'The result of applying CUSTOM_MULTIOUTPUT to the "zipcode".'
    generic_first = 'The 1st output from applying CUSTOM_MULTIOUTPUT to the "zipcode".'
    generic_second = 'The 2nd output from applying CUSTOM_MULTIOUTPUT to the "zipcode".'

    assert describe_feature(custom_feat) == generic_base
    assert describe_feature(custom_feat[0]) == generic_first
    assert describe_feature(custom_feat[1]) == generic_second

    CustomMultiOutput.description_template = [
        "the multioutput of {}",
        "the {nth_slice} multioutput part of {}",
    ]
    template_base = 'The multioutput of the "zipcode".'
    template_first_slice = 'The 1st multioutput part of the "zipcode".'
    template_second_slice = 'The 2nd multioutput part of the "zipcode".'
    template_third_slice = 'The 3rd multioutput part of the "zipcode".'
    template_fourth_slice = 'The 4th multioutput part of the "zipcode".'
    assert describe_feature(custom_feat) == template_base
    assert describe_feature(custom_feat[0]) == template_first_slice
    assert describe_feature(custom_feat[1]) == template_second_slice
    assert describe_feature(custom_feat[2]) == template_third_slice
    assert describe_feature(custom_feat[3]) == template_fourth_slice

    CustomMultiOutput.description_template = [
        "the multioutput of {}",
        "the primary multioutput part of {}",
        "the secondary multioutput part of {}",
    ]
    custom_base = 'The multioutput of the "zipcode".'
    custom_first_slice = 'The primary multioutput part of the "zipcode".'
    custom_second_slice = 'The secondary multioutput part of the "zipcode".'
    bad_slice_error = "Slice out of range of template"
    assert describe_feature(custom_feat) == custom_base
    assert describe_feature(custom_feat[0]) == custom_first_slice
    assert describe_feature(custom_feat[1]) == custom_second_slice
    with pytest.raises(IndexError, match=bad_slice_error):
        describe_feature(custom_feat[2])
Example 27
def simple_feat(es):
    return IdentityFeature(es['log']['id'])
def test_column_description(es):
    column_description = 'the name of the device used for each session'
    es['sessions'].ww.columns['device_name'].description = column_description
    identity_feat = IdentityFeature(es['sessions'].ww['device_name'])
    assert describe_feature(identity_feat) == column_description[0].upper() + column_description[1:] + '.'
Example 29
def test_metadata(es, tmpdir):
    identity_feature_descriptions = {
        "sessions: device_name": "the name of the device used for each session",
        "customers: id": "the customer's id",
    }
    agg_feat = AggregationFeature(
        IdentityFeature(es["sessions"].ww["device_name"]), "customers", NumUnique
    )
    agg_description = (
        "The number of unique elements in the name of the device used for each "
        'session of all instances of "sessions" for each customer\'s id.'
    )
    assert (
        describe_feature(agg_feat, feature_descriptions=identity_feature_descriptions)
        == agg_description
    )

    transform_feat = GroupByTransformFeature(
        IdentityFeature(es["log"].ww["value"]),
        CumMean,
        IdentityFeature(es["log"].ww["session_id"]),
    )
    transform_description = 'The running average of the "value" for each "session_id".'
    primitive_templates = {"cum_mean": "the running average of {}"}
    assert (
        describe_feature(transform_feat, primitive_templates=primitive_templates)
        == transform_description
    )

    custom_agg = AggregationFeature(
        IdentityFeature(es["log"].ww["zipcode"]), "sessions", Mode
    )
    auto_description = 'The most frequently occurring value of the "zipcode" of all instances of "log" for each "id" in "sessions".'
    custom_agg_description = "the most frequently used zipcode"
    custom_feature_description = (
        custom_agg_description[0].upper() + custom_agg_description[1:] + "."
    )
    feature_description_dict = {"sessions: MODE(log.zipcode)": custom_agg_description}
    assert describe_feature(custom_agg) == auto_description
    assert (
        describe_feature(custom_agg, feature_descriptions=feature_description_dict)
        == custom_feature_description
    )

    metadata = {
        "feature_descriptions": {
            **identity_feature_descriptions,
            **feature_description_dict,
        },
        "primitive_templates": primitive_templates,
    }
    metadata_path = os.path.join(tmpdir, "description_metadata.json")
    with open(metadata_path, "w") as f:
        json.dump(metadata, f)
    assert describe_feature(agg_feat, metadata_file=metadata_path) == agg_description
    assert (
        describe_feature(transform_feat, metadata_file=metadata_path)
        == transform_description
    )
    assert (
        describe_feature(custom_agg, metadata_file=metadata_path)
        == custom_feature_description
    )
def test_identity_description(es):
    feature = IdentityFeature(es['log'].ww['session_id'])
    description = 'The "session_id".'

    assert describe_feature(feature) == description