def test_generic_description(es):
    """Primitives without a description template fall back to the generic
    'The result of applying <NAME> ...' wording in describe_feature."""
    class NoName(TransformPrimitive):
        input_types = [ColumnSchema(semantic_tags={'category'})]
        output_type = ColumnSchema(semantic_tags={'category'})

        def generate_name(self, base_feature_names):
            # Same output as the original '%s(%s%s)' formatting.
            joined = ", ".join(base_feature_names)
            return "NO_NAME({}{})".format(joined, self.get_args_string())

    class CustomAgg(AggregationPrimitive):
        name = 'custom_aggregation'
        input_types = [ColumnSchema(semantic_tags={'category'})]
        output_type = ColumnSchema(semantic_tags={'category'})

    class CustomTrans(TransformPrimitive):
        name = 'custom_transform'
        input_types = [ColumnSchema(semantic_tags={'category'})]
        output_type = ColumnSchema(semantic_tags={'category'})

    zipcode = IdentityFeature(es['log'].ww['zipcode'])

    # (feature, expected description) pairs covering an unnamed transform,
    # a named aggregation, and a named transform primitive.
    cases = [
        (TransformFeature(zipcode, NoName),
         'The result of applying NoName to the "zipcode".'),
        (AggregationFeature(zipcode, 'customers', CustomAgg),
         'The result of applying CUSTOM_AGGREGATION to the "zipcode" of all instances of "log" for each "id" in "customers".'),
        (TransformFeature(zipcode, CustomTrans),
         'The result of applying CUSTOM_TRANSFORM to the "zipcode".'),
    ]
    for feature, expected in cases:
        assert describe_feature(feature) == expected
def test_aggregation_description_where(es):
    """Aggregations with a `where` clause mention the filter condition."""
    is_us = TransformFeature(
        IdentityFeature(es['log'].ww['countrycode']), EqualScalar('US'))
    mean_value = AggregationFeature(
        IdentityFeature(es['log'].ww['value']), 'sessions', Mean,
        where=is_us)
    expected = (
        'The average of the "value" of all instances of "log" where the '
        '"countrycode" is US for each "id" in "sessions".'
    )
    assert describe_feature(mean_value) == expected
def test_multioutput_description(es):
    """Descriptions of multi-output features: whole feature, numbered
    slices, and custom per-slice description templates."""
    zipcode = IdentityFeature(es['log'].ww['zipcode'])
    n_most_common_feature = AggregationFeature(
        zipcode, 'sessions', NMostCommon(2))

    assert describe_feature(n_most_common_feature) == (
        'The 2 most common values of the "zipcode" of all instances of '
        '"log" for each "id" in "sessions".')
    assert describe_feature(n_most_common_feature[0]) == (
        'The most common value of the "zipcode" of all instances of "log" '
        'for each "id" in "sessions".')
    assert describe_feature(n_most_common_feature[1]) == (
        'The 2nd most common value of the "zipcode" of all instances of '
        '"log" for each "id" in "sessions".')

    class CustomMultiOutput(TransformPrimitive):
        name = "custom_multioutput"
        input_types = [ColumnSchema(semantic_tags={'category'})]
        return_type = ColumnSchema(semantic_tags={'category'})

        number_output_features = 4

    custom_feat = TransformFeature(zipcode, CustomMultiOutput)

    # No description_template set: generic wording is used for the base
    # feature and the 'Nth output from applying' wording for slices.
    assert describe_feature(custom_feat) == (
        'The result of applying CUSTOM_MULTIOUTPUT to the "zipcode".')
    assert describe_feature(custom_feat[0]) == (
        'The 1st output from applying CUSTOM_MULTIOUTPUT to the "zipcode".')
    assert describe_feature(custom_feat[1]) == (
        'The 2nd output from applying CUSTOM_MULTIOUTPUT to the "zipcode".')

    # Two-entry template: second entry applies to every slice, with the
    # ordinal substituted for {nth_slice}.
    CustomMultiOutput.description_template = ['the multioutput of {}',
                                              'the {nth_slice} multioutput part of {}']
    assert describe_feature(custom_feat) == 'The multioutput of the "zipcode".'
    for index, ordinal in enumerate(['1st', '2nd', '3rd', '4th']):
        expected = 'The {} multioutput part of the "zipcode".'.format(ordinal)
        assert describe_feature(custom_feat[index]) == expected

    # One template entry per slice: slices past the template length raise.
    CustomMultiOutput.description_template = ['the multioutput of {}',
                                              'the primary multioutput part of {}',
                                              'the secondary multioutput part of {}']
    assert describe_feature(custom_feat) == 'The multioutput of the "zipcode".'
    assert describe_feature(custom_feat[0]) == (
        'The primary multioutput part of the "zipcode".')
    assert describe_feature(custom_feat[1]) == (
        'The secondary multioutput part of the "zipcode".')
    with pytest.raises(IndexError, match='Slice out of range of template'):
        describe_feature(custom_feat[2])
# Beispiel #4
# 0
def test_aggregation_description_where(es):
    """Aggregations with a `where` clause mention the filter condition.

    NOTE(review): this redefinition shadows the identically named test
    defined earlier in the file; only this version is collected by pytest.
    """
    us_filter = TransformFeature(
        IdentityFeature(es["log"].ww["countrycode"]), EqualScalar("US")
    )
    mean_feature = AggregationFeature(
        IdentityFeature(es["log"].ww["value"]), "sessions", Mean,
        where=us_filter,
    )
    expected = (
        'The average of the "value" of all instances of "log" where the '
        '"countrycode" is US for each "id" in "sessions".'
    )
    assert describe_feature(mean_feature) == expected
# Beispiel #5
# 0
def test_stacked(es):
    """Graph of an aggregation stacked on a transform: both primitive
    nodes, the groupby node, and all connecting edges must appear, with
    the transform labeled Step 1 and the aggregation Step 2."""
    year_feat = TransformFeature(es['customers']['cancel_date'], Year)
    mode_feat = AggregationFeature(year_feat, es['cohorts'], Mode)
    dot_source = graph_feature(mode_feat).source

    final_name = mode_feat.get_name()
    mid_name = year_feat.get_name()
    agg_primitive = f'0_{final_name}_mode'
    trans_primitive = f'1_{mid_name}_year'
    groupby_node = f'{final_name}_groupby_customers--cohort'

    expected_components = [
        final_name,
        mid_name,
        agg_primitive,
        trans_primitive,
        groupby_node,
        f'customers:cancel_date -> "{trans_primitive}"',
        f'"{trans_primitive}" -> customers:"{mid_name}"',
        f'customers:cohort -> "{groupby_node}"',
        f'customers:"{mid_name}" -> "{groupby_node}"',
        f'"{groupby_node}" -> "{agg_primitive}"',
        f'"{agg_primitive}" -> cohorts:"{final_name}"',
    ]
    for component in expected_components:
        assert component in dot_source

    # Escape parentheses so the primitive names are valid regex literals.
    agg_escaped = agg_primitive.replace('(', '\\(').replace(')', '\\)')
    agg_nodes = re.findall('"{}" \\[label.*'.format(agg_escaped), dot_source)
    assert len(agg_nodes) == 1
    assert 'Step 2' in agg_nodes[0]

    trans_escaped = trans_primitive.replace('(', '\\(').replace(')', '\\)')
    trans_nodes = re.findall('"{}" \\[label.*'.format(trans_escaped), dot_source)
    assert len(trans_nodes) == 1
    assert 'Step 1' in trans_nodes[0]
# Beispiel #6
# 0
def test_transform(es):
    """Graph of a plain transform feature: one primitive node, an input
    edge, an output edge, and a three-row table for the target dataframe."""
    feat = TransformFeature(es['customers']['cancel_date'], Year)
    dot_source = graph_feature(feat).source

    feat_name = feat.get_name()
    prim_node = f'0_{feat_name}_year'

    expected_components = [
        feat_name,
        '\u2605 customers (target)',
        prim_node,
        f'customers:cancel_date -> "{prim_node}"',
        f'"{prim_node}" -> customers:"{feat_name}"',
    ]
    for component in expected_components:
        assert component in dot_source

    # Exactly one HTML-like table for the customers dataframe, containing
    # a header row, the input column, and the new feature.
    tables = re.findall(r"customers \[label=<\n<TABLE.*?</TABLE>>", dot_source,
                        re.DOTALL)
    assert len(tables) == 1
    rows = re.findall(r"<TR.*?</TR>", tables[0], re.DOTALL)
    assert len(rows) == 3
    for expected_text, row in zip(['customers', 'cancel_date', feat_name], rows):
        assert expected_text in row
    def _build_transform_features(self, all_features, entity, max_depth=0):
        """Build TransformFeatures for all the variables in an entity.

        For each transform primitive, finds existing features whose
        variable types match the primitive's input types and stacks the
        primitive on every matching combination of inputs.

        Args:
            all_features (dict[:class:`.Entity`.id:dict->[str->:class:`BaseFeature`]]):
                Dict containing a dict for each entity. Each nested dict
                has features as values with their ids as keys.

            entity (Entity): Entity to calculate features for.

            max_depth (int, optional): Remaining stacking depth;
                ``None`` means no depth limit.
        """
        # Negative depth means the stacking budget is exhausted.
        if max_depth is not None and max_depth < 0:
            return

        new_max_depth = None
        if max_depth is not None:
            new_max_depth = max_depth - 1

        # Ensure raw identity features exist before stacking on them.
        self._add_identity_features(all_features, entity)

        for trans_prim in self.trans_primitives:
            # if multiple input_types, only use first one for DFS
            input_types = trans_prim.input_types
            if type(input_types[0]) == list:
                input_types = input_types[0]

            # Candidate input features of the required variable types.
            features = self._features_by_type(all_features=all_features,
                                              entity=entity,
                                              variable_type=set(input_types),
                                              max_depth=new_max_depth)

            matching_inputs = match(input_types,
                                    features,
                                    commutative=trans_prim.commutative)

            for matching_input in matching_inputs:
                # Only stack on single-output features.
                if all(bf.number_output_features == 1
                       for bf in matching_input):
                    new_f = TransformFeature(matching_input,
                                             primitive=trans_prim)
                    self._handle_new_feature(all_features=all_features,
                                             new_feature=new_f)
    def _build_transform_features(self,
                                  all_features,
                                  entity,
                                  max_depth=0,
                                  require_direct_input=False):
        """Build transform and groupby-transform features for an entity.

        Respects per-primitive options (``self.primitive_options``) for
        skipping entities and filtering inputs/groupby columns.

        Args:
            all_features (dict[:class:`.Entity`.id:dict->[str->:class:`BaseFeature`]]):
                Dict containing a dict for each entity. Each nested dict
                has features as values with their ids as keys.

            entity (Entity): Entity to calculate features for.

            max_depth (int, optional): Remaining stacking depth;
                ``None`` means no depth limit.

            require_direct_input (bool, optional): If True, only create
                features that include at least one DirectFeature input.
        """
        new_max_depth = None
        if max_depth is not None:
            new_max_depth = max_depth - 1

        for trans_prim in self.trans_primitives:
            current_options = self.primitive_options[trans_prim.name]
            # Options may blacklist this entity for this primitive.
            if ignore_entity_for_primitive(current_options, entity):
                continue
            # if multiple input_types, only use first one for DFS
            input_types = trans_prim.input_types
            if type(input_types[0]) == list:
                input_types = input_types[0]

            matching_inputs = self._get_matching_inputs(
                all_features,
                entity,
                new_max_depth,
                input_types,
                trans_prim,
                current_options,
                require_direct_input=require_direct_input)

            for matching_input in matching_inputs:
                # Only stack on single-output features.
                if all(bf.number_output_features == 1
                       for bf in matching_input):
                    new_f = TransformFeature(matching_input,
                                             primitive=trans_prim)
                    self._handle_new_feature(all_features=all_features,
                                             new_feature=new_f)

        for groupby_prim in self.groupby_trans_primitives:
            current_options = self.primitive_options[groupby_prim.name]
            if ignore_entity_for_primitive(current_options,
                                           entity,
                                           groupby=True):
                continue
            input_types = groupby_prim.input_types[:]
            # if multiple input_types, only use first one for DFS
            if type(input_types[0]) == list:
                input_types = input_types[0]
            matching_inputs = self._get_matching_inputs(
                all_features, entity, new_max_depth, input_types, groupby_prim,
                current_options)

            # get columns to use as groupbys, use IDs as default unless other groupbys specified
            if any([
                    'include_groupby_variables' in option
                    and entity.id in option['include_groupby_variables']
                    for option in current_options
            ]):
                default_type = variable_types.PandasTypes._all
            else:
                default_type = set([Id])
            groupby_matches = self._features_by_type(
                all_features=all_features,
                entity=entity,
                max_depth=new_max_depth,
                variable_type=default_type)
            groupby_matches = filter_groupby_matches_by_options(
                groupby_matches, current_options)

            # If require_direct_input, require a DirectFeature in input or as a
            # groupby, and don't create features of inputs/groupbys which are
            # all direct features with the same relationship path
            for matching_input in matching_inputs:
                if all(bf.number_output_features == 1
                       for bf in matching_input):
                    for groupby in groupby_matches:
                        if require_direct_input and (
                                _all_direct_and_same_path(matching_input +
                                                          (groupby, ))
                                or not any([
                                    isinstance(feature, DirectFeature)
                                    for feature in (matching_input +
                                                    (groupby, ))
                                ])):
                            continue
                        new_f = GroupByTransformFeature(list(matching_input),
                                                        groupby=groupby[0],
                                                        primitive=groupby_prim)
                        self._handle_new_feature(all_features=all_features,
                                                 new_feature=new_f)
def test_transform_description(es):
    """The Absolute primitive produces an 'absolute value of' description."""
    value = IdentityFeature(es['log'].ww['value'])
    absolute_value = TransformFeature(value, Absolute)
    assert describe_feature(absolute_value) == (
        'The absolute value of the "value".')
# Beispiel #10
# 0
    def _build_transform_features(self, all_features, entity, max_depth=0):
        """Build transform and groupby-transform features for an entity.

        Groupby candidates are found by temporarily appending ``Id`` to
        the primitive's input types so the matcher also picks a column to
        group by (the last element of each match).

        Args:
            all_features (dict[:class:`.Entity`.id:dict->[str->:class:`BaseFeature`]]):
                Dict containing a dict for each entity. Each nested dict
                has features as values with their ids as keys.

            entity (Entity): Entity to calculate features for.

            max_depth (int, optional): Remaining stacking depth;
                ``None`` means no depth limit.
        """
        # Negative depth means the stacking budget is exhausted.
        if max_depth is not None and max_depth < 0:
            return

        new_max_depth = None
        if max_depth is not None:
            new_max_depth = max_depth - 1

        # Ensure raw identity features exist before stacking on them.
        self._add_identity_features(all_features, entity)

        for trans_prim in self.trans_primitives:
            # if multiple input_types, only use first one for DFS
            input_types = trans_prim.input_types
            if type(input_types[0]) == list:
                input_types = input_types[0]

            features = self._features_by_type(all_features=all_features,
                                              entity=entity,
                                              max_depth=new_max_depth,
                                              variable_type=set(input_types))

            matching_inputs = match(input_types,
                                    features,
                                    commutative=trans_prim.commutative)

            for matching_input in matching_inputs:
                # Only stack on single-output features.
                if all(bf.number_output_features == 1
                       for bf in matching_input):
                    new_f = TransformFeature(matching_input,
                                             primitive=trans_prim)
                    self._handle_new_feature(all_features=all_features,
                                             new_feature=new_f)

        for groupby_prim in self.groupby_trans_primitives:
            # Normally input_types is a list of what inputs can be supplied to
            # the primitive function.  Here we temporarily add `Id` as an extra
            # item in input_types so that the matching function will also look
            # for feature columns to group by.
            input_types = groupby_prim.input_types[:]
            # if multiple input_types, only use first one for DFS
            if type(input_types[0]) == list:
                input_types = input_types[0]
            input_types.append(Id)

            features = self._features_by_type(all_features=all_features,
                                              entity=entity,
                                              max_depth=new_max_depth,
                                              variable_type=set(input_types))
            matching_inputs = match(input_types,
                                    features,
                                    commutative=groupby_prim.commutative)
            for matching_input in matching_inputs:
                if all(bf.number_output_features == 1
                       for bf in matching_input):
                    # Last element of the match is the appended Id column,
                    # used as the groupby; the rest are primitive inputs.
                    new_f = GroupByTransformFeature(list(matching_input[:-1]),
                                                    groupby=matching_input[-1],
                                                    primitive=groupby_prim)
                    self._handle_new_feature(all_features=all_features,
                                             new_feature=new_f)
    def _build_transform_features(self,
                                  all_features,
                                  dataframe,
                                  max_depth=0,
                                  require_direct_input=False):
        """Build transform and groupby-transform features for a dataframe.

        New features are collected and added at the end so transform
        primitives are not applied to features created in this same pass.

        Args:
            all_features (dict[dataframe name: dict->[str->:class:`BaseFeature`]]):
                Dict containing a dict for each dataframe. Each nested dict
                has features as values with their ids as keys.

            dataframe (DataFrame): DataFrame to calculate features for.

            max_depth (int, optional): Remaining stacking depth;
                ``None`` means no depth limit.

            require_direct_input (bool, optional): If True, only create
                features that include at least one DirectFeature input.
        """

        new_max_depth = None
        if max_depth is not None:
            new_max_depth = max_depth - 1

        # Keep track of features to add until the end to avoid applying
        # transform primitives to features that were also built by transform primitives
        features_to_add = []

        for trans_prim in self.trans_primitives:
            # Options may be keyed by the primitive object or by its name.
            current_options = self.primitive_options.get(
                trans_prim, self.primitive_options.get(trans_prim.name))
            if ignore_dataframe_for_primitive(current_options, dataframe):
                continue

            input_types = trans_prim.input_types

            matching_inputs = self._get_matching_inputs(
                all_features,
                dataframe,
                new_max_depth,
                input_types,
                trans_prim,
                current_options,
                require_direct_input=require_direct_input,
            )

            for matching_input in matching_inputs:
                # Only stack on single-output features that pass the
                # stacking rules for transform primitives.
                if all(bf.number_output_features == 1
                       for bf in matching_input) and check_transform_stacking(
                           matching_input):
                    new_f = TransformFeature(matching_input,
                                             primitive=trans_prim)
                    features_to_add.append(new_f)

        for groupby_prim in self.groupby_trans_primitives:
            current_options = self.primitive_options.get(
                groupby_prim, self.primitive_options.get(groupby_prim.name))
            if ignore_dataframe_for_primitive(current_options,
                                              dataframe,
                                              groupby=True):
                continue
            input_types = groupby_prim.input_types[:]
            matching_inputs = self._get_matching_inputs(
                all_features,
                dataframe,
                new_max_depth,
                input_types,
                groupby_prim,
                current_options,
            )

            # get columns to use as groupbys, use IDs as default unless other groupbys specified
            if any([
                    "include_groupby_columns" in option
                    and dataframe.ww.name in option["include_groupby_columns"]
                    for option in current_options
            ]):
                column_schemas = "all"
            else:
                column_schemas = [ColumnSchema(semantic_tags=["foreign_key"])]
            groupby_matches = self._features_by_type(
                all_features=all_features,
                dataframe=dataframe,
                max_depth=new_max_depth,
                column_schemas=column_schemas,
            )
            groupby_matches = filter_groupby_matches_by_options(
                groupby_matches, current_options)

            # If require_direct_input, require a DirectFeature in input or as a
            # groupby, and don't create features of inputs/groupbys which are
            # all direct features with the same relationship path
            for matching_input in matching_inputs:
                if all(bf.number_output_features == 1
                       for bf in matching_input) and check_transform_stacking(
                           matching_input):
                    for groupby in groupby_matches:
                        if require_direct_input and (
                                _all_direct_and_same_path(matching_input +
                                                          (groupby, ))
                                or not any([
                                    isinstance(feature, DirectFeature)
                                    for feature in (matching_input +
                                                    (groupby, ))
                                ])):
                            continue
                        new_f = GroupByTransformFeature(
                            list(matching_input),
                            groupby=groupby[0],
                            primitive=groupby_prim,
                        )
                        features_to_add.append(new_f)
        # Register all new features only after both passes complete.
        for new_f in features_to_add:
            self._handle_new_feature(all_features=all_features,
                                     new_feature=new_f)
    def _build_transform_features(self,
                                  all_features,
                                  entity,
                                  max_depth=0,
                                  require_direct_input=False):
        """Build transform and groupby-transform features for an entity.

        Groupby-transform features are created once per matching input per
        Id-typed column of the entity.

        Args:
            all_features (dict[:class:`.Entity`.id:dict->[str->:class:`BaseFeature`]]):
                Dict containing a dict for each entity. Each nested dict
                has features as values with their ids as keys.

            entity (Entity): Entity to calculate features for.

            max_depth (int, optional): Remaining stacking depth;
                ``None`` means no depth limit.

            require_direct_input (bool, optional): Passed through to input
                matching; requires DirectFeature inputs when True.
        """
        new_max_depth = None
        if max_depth is not None:
            new_max_depth = max_depth - 1

        for trans_prim in self.trans_primitives:
            # if multiple input_types, only use first one for DFS
            input_types = trans_prim.input_types
            if type(input_types[0]) == list:
                input_types = input_types[0]

            matching_inputs = self._get_matching_inputs(
                all_features,
                entity,
                new_max_depth,
                input_types,
                trans_prim,
                require_direct_input=require_direct_input)

            for matching_input in matching_inputs:
                # Only stack on single-output features.
                if all(bf.number_output_features == 1
                       for bf in matching_input):
                    new_f = TransformFeature(matching_input,
                                             primitive=trans_prim)
                    self._handle_new_feature(all_features=all_features,
                                             new_feature=new_f)

        for groupby_prim in self.groupby_trans_primitives:
            input_types = groupby_prim.input_types[:]
            # if multiple input_types, only use first one for DFS
            if type(input_types[0]) == list:
                input_types = input_types[0]
            matching_inputs = self._get_matching_inputs(
                all_features,
                entity,
                new_max_depth,
                input_types,
                groupby_prim,
                require_direct_input=require_direct_input)
            # get IDs to use as groupby
            id_matches = self._features_by_type(all_features=all_features,
                                                entity=entity,
                                                max_depth=new_max_depth,
                                                variable_type=set([Id]))
            for matching_input in matching_inputs:
                if all(bf.number_output_features == 1
                       for bf in matching_input):
                    # One groupby feature per (input combination, Id column).
                    for id_groupby in id_matches:
                        new_f = GroupByTransformFeature(list(matching_input),
                                                        groupby=id_groupby,
                                                        primitive=groupby_prim)
                        self._handle_new_feature(all_features=all_features,
                                                 new_feature=new_f)
def trans_feat(es):
    """Year transform feature over the customers cancel_date column."""
    cancel_date = IdentityFeature(es['customers'].ww['cancel_date'])
    return TransformFeature(cancel_date, Year)
def trans_feat(es):
    """Year transform feature built via the legacy variable-access API."""
    cancel_date = es['customers']['cancel_date']
    return TransformFeature(cancel_date, Year)
# Beispiel #15
# 0
def trans_feat(es):
    """Year transform feature over customers cancel_date (Woodwork API)."""
    base = IdentityFeature(es["customers"].ww["cancel_date"])
    return TransformFeature(base, Year)