def test_metadata(es, tmpdir):
    """Custom descriptions and primitive templates are honored whether they
    are passed directly to ``describe_feature`` or loaded from a JSON
    metadata file."""
    identity_feature_descriptions = {
        'sessions: device_name': 'the name of the device used for each session',
        'customers: id': "the customer's id",
    }
    agg_feat = AggregationFeature(
        IdentityFeature(es['sessions'].ww['device_name']), 'customers', NumUnique)
    agg_description = (
        'The number of unique elements in the name of the device used for each '
        'session of all instances of "sessions" for each customer\'s id.'
    )
    assert describe_feature(
        agg_feat,
        feature_descriptions=identity_feature_descriptions) == agg_description

    transform_feat = GroupByTransformFeature(
        IdentityFeature(es['log'].ww['value']),
        CumMean,
        IdentityFeature(es['log'].ww['session_id']))
    transform_description = 'The running average of the "value" for each "session_id".'
    primitive_templates = {"cum_mean": "the running average of {}"}
    assert describe_feature(
        transform_feat,
        primitive_templates=primitive_templates) == transform_description

    custom_agg = AggregationFeature(
        IdentityFeature(es['log'].ww['zipcode']), 'sessions', Mode)
    auto_description = (
        'The most frequently occurring value of the "zipcode" of all '
        'instances of "log" for each "id" in "sessions".'
    )
    custom_agg_description = "the most frequently used zipcode"
    # A user-supplied description is capitalized and given a trailing period.
    custom_feature_description = (
        custom_agg_description[0].upper() + custom_agg_description[1:] + '.')
    feature_description_dict = {'sessions: MODE(log.zipcode)': custom_agg_description}
    assert describe_feature(custom_agg) == auto_description
    assert describe_feature(
        custom_agg,
        feature_descriptions=feature_description_dict) == custom_feature_description

    # The same descriptions/templates can be supplied via a metadata file.
    metadata = {
        'feature_descriptions': {**identity_feature_descriptions,
                                 **feature_description_dict},
        'primitive_templates': primitive_templates,
    }
    metadata_path = os.path.join(tmpdir, 'description_metadata.json')
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f)
    assert describe_feature(agg_feat, metadata_file=metadata_path) == agg_description
    assert describe_feature(transform_feat, metadata_file=metadata_path) == transform_description
    assert describe_feature(custom_agg, metadata_file=metadata_path) == custom_feature_description
def test_direct_description(es):
    """Direct features describe as '<base> for the instance of <parent>
    associated with this instance of <child>', nesting at each hop."""
    direct_feat = DirectFeature(
        IdentityFeature(es["customers"].ww["loves_ice_cream"]), "sessions"
    )
    assert describe_feature(direct_feat) == (
        'The "loves_ice_cream" for the instance of "customers" associated '
        'with this instance of "sessions".'
    )

    deep_direct = DirectFeature(direct_feat, "log")
    assert describe_feature(deep_direct) == (
        'The "loves_ice_cream" for the instance of "customers" '
        'associated with the instance of "sessions" associated with '
        'this instance of "log".'
    )

    # An aggregation of a direct feature of an aggregation nests all three
    # phrasings together.
    percent_true_agg = AggregationFeature(
        IdentityFeature(es["log"].ww["purchased"]), "sessions", PercentTrue
    )
    agg_on_direct = AggregationFeature(
        DirectFeature(percent_true_agg, "log"), "products", Mean
    )
    assert describe_feature(agg_on_direct) == (
        "The average of the percentage of true values in "
        'the "purchased" of all instances of "log" for each "id" in "sessions" for '
        'the instance of "sessions" associated with this instance of "log" of all '
        'instances of "log" for each "id" in "products".'
    )
def test_aggregation_description(es):
    """Stacking Sum on a Mean aggregation nests the inner description
    inside the outer one."""
    mean_feat = AggregationFeature(
        IdentityFeature(es['log'].ww['value']), 'sessions', Mean)
    mean_description = (
        'The average of the "value" of all instances of "log" for each "id" in "sessions".'
    )
    assert describe_feature(mean_feat) == mean_description

    sum_of_mean = AggregationFeature(mean_feat, 'customers', Sum)
    # The inner text is reused minus its leading 'T' and trailing period.
    expected = (
        f'The sum of t{mean_description[1:-1]} of all instances of "sessions" '
        f'for each "id" in "customers".'
    )
    assert describe_feature(sum_of_mean) == expected
def test_generic_description(es):
    """Primitives with no description template fall back to the generic
    'The result of applying <NAME> ...' wording."""

    class NoName(TransformPrimitive):
        # No ``name`` attribute — the display name comes from generate_name.
        input_types = [ColumnSchema(semantic_tags={'category'})]
        output_type = ColumnSchema(semantic_tags={'category'})

        def generate_name(self, base_feature_names):
            return u"%s(%s%s)" % (
                'NO_NAME',
                u", ".join(base_feature_names),
                self.get_args_string(),
            )

    class CustomAgg(AggregationPrimitive):
        name = 'custom_aggregation'
        input_types = [ColumnSchema(semantic_tags={'category'})]
        output_type = ColumnSchema(semantic_tags={'category'})

    class CustomTrans(TransformPrimitive):
        name = 'custom_transform'
        input_types = [ColumnSchema(semantic_tags={'category'})]
        output_type = ColumnSchema(semantic_tags={'category'})

    unnamed_feat = TransformFeature(
        IdentityFeature(es['log'].ww['zipcode']), NoName)
    assert describe_feature(unnamed_feat) == (
        'The result of applying NoName to the "zipcode".'
    )

    custom_agg_feat = AggregationFeature(
        IdentityFeature(es['log'].ww['zipcode']), 'customers', CustomAgg)
    assert describe_feature(custom_agg_feat) == (
        'The result of applying CUSTOM_AGGREGATION to the "zipcode" of all '
        'instances of "log" for each "id" in "customers".'
    )

    custom_trans_feat = TransformFeature(
        IdentityFeature(es['log'].ww['zipcode']), CustomTrans)
    assert describe_feature(custom_trans_feat) == (
        'The result of applying CUSTOM_TRANSFORM to the "zipcode".'
    )
def test_stacked(es, trans_feat):
    """The graph of an aggregation stacked on a transform contains every
    node and edge, and the primitive nodes are labeled in step order."""
    stacked = AggregationFeature(trans_feat, es['cohorts'], Mode)
    graph = graph_feature(stacked).source

    feat_name = stacked.get_name()
    intermediate_name = trans_feat.get_name()
    agg_primitive = f'0_{feat_name}_mode'
    trans_primitive = f'1_{intermediate_name}_year'
    groupby_node = f'{feat_name}_groupby_customers--cohort'

    expected_components = [
        feat_name,
        intermediate_name,
        agg_primitive,
        trans_primitive,
        groupby_node,
        f'customers:cancel_date -> "{trans_primitive}"',
        f'"{trans_primitive}" -> customers:"{intermediate_name}"',
        f'customers:cohort -> "{groupby_node}"',
        f'customers:"{intermediate_name}" -> "{groupby_node}"',
        f'"{groupby_node}" -> "{agg_primitive}"',
        f'"{agg_primitive}" -> cohorts:"{feat_name}"',
    ]
    for component in expected_components:
        assert component in graph

    # Parentheses in feature names must be escaped before regex use.
    agg_primitive = agg_primitive.replace('(', '\\(').replace(')', '\\)')
    agg_node = re.findall(f'"{agg_primitive}" \\[label.*', graph)
    assert len(agg_node) == 1
    assert 'Step 2' in agg_node[0]

    trans_primitive = trans_primitive.replace('(', '\\(').replace(')', '\\)')
    trans_node = re.findall(f'"{trans_primitive}" \\[label.*', graph)
    assert len(trans_node) == 1
    assert 'Step 1' in trans_node[0]
def test_aggregation_description_use_previous(es):
    """A use_previous window renders as 'of the previous <window> of ...'."""
    mean_feat = AggregationFeature(
        IdentityFeature(es["log"].ww["value"]),
        "sessions",
        Mean,
        use_previous="5d",
    )
    expected = (
        'The average of the "value" of the previous 5 days of "log" '
        'for each "id" in "sessions".'
    )
    assert describe_feature(mean_feat) == expected
def test_aggregation_description_use_previous(es):
    """use_previous window in the description (legacy entity-indexing API).

    NOTE(review): this module defines ``test_aggregation_description_use_previous``
    twice; the later definition shadows the earlier one at import time, so
    only one is collected — consider renaming one of the pair.
    """
    mean_feat = AggregationFeature(
        es['log']['value'], es['sessions'], Mean, use_previous='5d')
    expected = (
        'The average of the "value" of the previous 5 days of "log" '
        'for each "id" in "sessions".'
    )
    assert describe_feature(mean_feat) == expected
def test_aggregation_description_where(es):
    """A where clause renders as 'where the "<column>" is <value>'."""
    is_us = TransformFeature(
        IdentityFeature(es['log'].ww['countrycode']), EqualScalar('US'))
    mean_feat = AggregationFeature(
        IdentityFeature(es['log'].ww['value']), 'sessions', Mean, where=is_us)
    expected = (
        'The average of the "value" of all instances of "log" where the '
        '"countrycode" is US for each "id" in "sessions".'
    )
    assert describe_feature(mean_feat) == expected
def test_multioutput_description(es):
    """Multi-output features get per-slice descriptions: default wording for
    NMostCommon, a generic fallback for unknown primitives, and custom
    wording when a description_template is set."""
    n_most_common_feature = AggregationFeature(
        IdentityFeature(es['log'].ww['zipcode']), 'sessions', NMostCommon(2))
    first_slice = n_most_common_feature[0]
    second_slice = n_most_common_feature[1]

    assert describe_feature(n_most_common_feature) == (
        'The 2 most common values of the "zipcode" of all instances of "log" for each "id" in "sessions".'
    )
    assert describe_feature(first_slice) == (
        'The most common value of the "zipcode" of all instances of "log" '
        'for each "id" in "sessions".'
    )
    assert describe_feature(second_slice) == (
        'The 2nd most common value of the "zipcode" of all instances of '
        '"log" for each "id" in "sessions".'
    )

    class CustomMultiOutput(TransformPrimitive):
        name = "custom_multioutput"
        input_types = [ColumnSchema(semantic_tags={'category'})]
        return_type = ColumnSchema(semantic_tags={'category'})
        number_output_features = 4

    custom_feat = TransformFeature(
        IdentityFeature(es['log'].ww['zipcode']), CustomMultiOutput)

    # No template yet: generic 'result of applying' wording with slice ordinals.
    assert describe_feature(custom_feat) == (
        'The result of applying CUSTOM_MULTIOUTPUT to the "zipcode".'
    )
    assert describe_feature(custom_feat[0]) == (
        'The 1st output from applying CUSTOM_MULTIOUTPUT to the "zipcode".'
    )
    assert describe_feature(custom_feat[1]) == (
        'The 2nd output from applying CUSTOM_MULTIOUTPUT to the "zipcode".'
    )

    # Two-entry template: the second entry applies to every slice, with
    # {nth_slice} substituted per slice.
    CustomMultiOutput.description_template = [
        'the multioutput of {}',
        'the {nth_slice} multioutput part of {}',
    ]
    assert describe_feature(custom_feat) == 'The multioutput of the "zipcode".'
    assert describe_feature(custom_feat[0]) == 'The 1st multioutput part of the "zipcode".'
    assert describe_feature(custom_feat[1]) == 'The 2nd multioutput part of the "zipcode".'
    assert describe_feature(custom_feat[2]) == 'The 3rd multioutput part of the "zipcode".'
    assert describe_feature(custom_feat[3]) == 'The 4th multioutput part of the "zipcode".'

    # Longer template names slices individually; a slice past the end of the
    # template raises IndexError.
    CustomMultiOutput.description_template = [
        'the multioutput of {}',
        'the primary multioutput part of {}',
        'the secondary multioutput part of {}',
    ]
    assert describe_feature(custom_feat) == 'The multioutput of the "zipcode".'
    assert describe_feature(custom_feat[0]) == 'The primary multioutput part of the "zipcode".'
    assert describe_feature(custom_feat[1]) == 'The secondary multioutput part of the "zipcode".'
    with pytest.raises(IndexError, match='Slice out of range of template'):
        describe_feature(custom_feat[2])
def test_direct_description(es):
    """Direct-feature descriptions at one hop, two hops, and stacked on an
    aggregation.

    NOTE(review): ``test_direct_description`` is defined twice in this
    module; the later definition shadows the earlier one at import time —
    consider renaming one of the pair.
    """
    direct_feat = DirectFeature(
        IdentityFeature(es['customers'].ww['loves_ice_cream']), 'sessions')
    assert describe_feature(direct_feat) == (
        'The "loves_ice_cream" for the instance of "customers" associated '
        'with this instance of "sessions".'
    )

    deep_direct = DirectFeature(direct_feat, 'log')
    assert describe_feature(deep_direct) == (
        'The "loves_ice_cream" for the instance of "customers" '
        'associated with the instance of "sessions" associated with '
        'this instance of "log".'
    )

    percent_true_agg = AggregationFeature(
        IdentityFeature(es['log'].ww['purchased']), 'sessions', PercentTrue)
    agg_on_direct = AggregationFeature(
        DirectFeature(percent_true_agg, 'log'), 'products', Mean)
    assert describe_feature(agg_on_direct) == (
        'The average of the percentage of true values in '
        'the "purchased" of all instances of "log" for each "id" in "sessions" for '
        'the instance of "sessions" associated with this instance of "log" of all '
        'instances of "log" for each "id" in "products".'
    )
def test_multioutput(es):
    """Graphing one slice of a multi-output aggregation still shows the full
    groupby/primitive pipeline and one table row per column."""
    multioutput = AggregationFeature(
        IdentityFeature(es["log"].ww["zipcode"]), "sessions", NMostCommon
    )
    feat = FeatureOutputSlice(multioutput, 0)
    graph = graph_feature(feat).source

    feat_name = feat.get_name()
    prim_node = f"0_{multioutput.get_name()}_n_most_common"
    groupby_node = f"{multioutput.get_name()}_groupby_log--session_id"
    sessions_table = "\u2605 sessions (target)"
    log_table = "log"

    expected_components = [
        feat_name,
        prim_node,
        groupby_node,
        sessions_table,
        log_table,
        f'log:session_id -> "{groupby_node}"',
        f'log:zipcode -> "{groupby_node}"',
        f'"{groupby_node}" -> "{prim_node}"',
        f'"{prim_node}" -> sessions:"{feat_name}"',
    ]
    for component in expected_components:
        assert component in graph

    dataframes = {
        "log": [log_table, "zipcode", "session_id"],
        "sessions": [sessions_table, feat_name],
    }
    for dataframe, expected_rows in dataframes.items():
        matches = re.findall(
            rf"{dataframe} \[label=<\n<TABLE.*?</TABLE>>", graph, re.DOTALL)
        assert len(matches) == 1
        rows = re.findall(r"<TR.*?</TR>", matches[0], re.DOTALL)
        assert len(rows) == len(expected_rows)
        # Each table row must match exactly one remaining expected label.
        for row in rows:
            matched = False
            for label in expected_rows:
                if label in row:
                    matched = True
                    expected_rows.remove(label)
                    break
            assert matched
def test_aggregation_description_where(es):
    """Where-clause phrasing in aggregation descriptions.

    NOTE(review): ``test_aggregation_description_where`` is defined twice in
    this module; the later definition shadows the earlier one at import time
    — consider renaming one of the pair.
    """
    is_us = TransformFeature(
        IdentityFeature(es["log"].ww["countrycode"]), EqualScalar("US"))
    mean_feat = AggregationFeature(
        IdentityFeature(es["log"].ww["value"]), "sessions", Mean, where=is_us)
    expected = (
        'The average of the "value" of all instances of "log" where the '
        '"countrycode" is US for each "id" in "sessions".'
    )
    assert describe_feature(mean_feat) == expected
def test_multioutput(es):
    """Graph a slice of NMostCommon (legacy entity-indexing API).

    NOTE(review): ``test_multioutput`` is defined twice in this module; the
    later definition shadows the earlier one at import time — consider
    renaming one of the pair.
    """
    multioutput = AggregationFeature(es['log']['zipcode'], es['sessions'], NMostCommon)
    feat = FeatureOutputSlice(multioutput, 0)
    graph = graph_feature(feat).source

    feat_name = feat.get_name()
    prim_node = f'0_{multioutput.get_name()}_n_most_common'
    groupby_node = f'{multioutput.get_name()}_groupby_log--session_id'
    sessions_table = '\u2605 sessions (target)'
    log_table = 'log'

    expected_components = [
        feat_name,
        prim_node,
        groupby_node,
        sessions_table,
        log_table,
        f'log:session_id -> "{groupby_node}"',
        f'log:zipcode -> "{groupby_node}"',
        f'"{groupby_node}" -> "{prim_node}"',
        f'"{prim_node}" -> sessions:"{feat_name}"',
    ]
    for component in expected_components:
        assert component in graph

    entities = {
        'log': [log_table, 'zipcode', 'session_id'],
        'sessions': [sessions_table, feat_name],
    }
    for entity, expected_rows in entities.items():
        matches = re.findall(
            rf"{entity} \[label=<\n<TABLE.*?</TABLE>>", graph, re.DOTALL)
        assert len(matches) == 1
        rows = re.findall(r"<TR.*?</TR>", matches[0], re.DOTALL)
        assert len(rows) == len(expected_rows)
        # Each table row must match exactly one remaining expected label.
        for row in rows:
            matched = False
            for label in expected_rows:
                if label in row:
                    matched = True
                    expected_rows.remove(label)
                    break
            assert matched
def test_aggregation(es):
    """The graph of a Count aggregation contains the groupby and primitive
    nodes, all edges, and one table row per referenced column."""
    feat = AggregationFeature(IdentityFeature(es['log'].ww['id']), 'sessions', Count)
    graph = graph_feature(feat).source

    feat_name = feat.get_name()
    prim_node = f'0_{feat_name}_count'
    groupby_node = f'{feat_name}_groupby_log--session_id'
    sessions_table = '\u2605 sessions (target)'
    log_table = 'log'

    expected_components = [
        feat_name,
        prim_node,
        groupby_node,
        sessions_table,
        log_table,
        f'log:session_id -> "{groupby_node}"',
        f'log:id -> "{groupby_node}"',
        f'"{groupby_node}" -> "{prim_node}"',
        f'"{prim_node}" -> sessions:"{feat_name}"',
    ]
    for component in expected_components:
        assert component in graph

    dataframes = {
        'log': [log_table, 'id', 'session_id'],
        'sessions': [sessions_table, feat_name],
    }
    for dataframe, expected_rows in dataframes.items():
        matches = re.findall(
            rf"{dataframe} \[label=<\n<TABLE.*?</TABLE>>", graph, re.DOTALL)
        assert len(matches) == 1
        rows = re.findall(r"<TR.*?</TR>", matches[0], re.DOTALL)
        assert len(rows) == len(expected_rows)
        # Each table row must match exactly one remaining expected label.
        for row in rows:
            matched = False
            for label in expected_rows:
                if label in row:
                    matched = True
                    expected_rows.remove(label)
                    break
            assert matched
def _build_agg_features(self, all_features, parent_entity, child_entity, max_depth=0):
    """Build aggregation features of ``child_entity`` grouped onto ``parent_entity``.

    For each configured aggregation primitive, finds candidate input features
    on the child entity, builds an AggregationFeature for every matching input
    combination, and additionally builds where-clause variants subject to the
    stacking limit and the allowed where primitives.

    NOTE(review): a second ``_build_agg_features`` with a different signature
    (taking ``relationship_path``) appears later in this file; if both end up
    in the same class the later definition shadows this one — confirm which
    variant is current.

    :param all_features: registry of features discovered so far (presumably
        keyed by entity — TODO confirm against _handle_new_feature).
    :param parent_entity: entity the aggregations are grouped onto.
    :param child_entity: entity whose features are aggregated.
    :param max_depth: remaining DFS depth; negative values stop recursion,
        ``None`` means unlimited.
    """
    # A negative depth budget means we've recursed past the limit: stop.
    if max_depth is not None and max_depth < 0:
        return
    new_max_depth = None
    if max_depth is not None:
        new_max_depth = max_depth - 1
    for agg_prim in self.agg_primitives:
        # if multiple input_types, only use first one for DFS
        input_types = agg_prim.input_types
        if type(input_types[0]) == list:
            input_types = input_types[0]
        features = self._features_by_type(all_features=all_features,
                                          entity=child_entity,
                                          variable_type=set(input_types),
                                          max_depth=new_max_depth)
        # remove features in relationship path
        relationship_path = self.es.find_backward_path(
            parent_entity.id, child_entity.id)
        features = [
            f for f in features
            if not self._feature_in_relationship_path(relationship_path, f)
        ]
        matching_inputs = match(input_types, features,
                                commutative=agg_prim.commutative)
        wheres = list(self.where_clauses[child_entity.id])
        for matching_input in matching_inputs:
            if not check_stacking(agg_prim, matching_input):
                continue
            new_f = AggregationFeature(matching_input,
                                       parent_entity=parent_entity,
                                       primitive=agg_prim)
            self._handle_new_feature(new_f, all_features)
            # Obey allow where: skip where-variants for primitives that
            # don't support them.
            if not agg_prim.allow_where:
                continue
            # limit the stacking of where features:
            # count up the number of where features
            # in this feature and its dependencies
            feat_wheres = []
            for f in matching_input:
                if isinstance(f, AggregationFeature) and f.where is not None:
                    feat_wheres.append(f)
                for feat in f.get_dependencies(deep=True):
                    if (isinstance(feat, AggregationFeature) and
                            feat.where is not None):
                        feat_wheres.append(feat)
            if len(feat_wheres) >= self.where_stacking_limit:
                continue
            # limits the aggregation feature by the given allowed feature types
            # (only primitives listed in where_primitives get where-variants).
            if not any([
                    issubclass(type(agg_prim), type(primitive))
                    for primitive in self.where_primitives
            ]):
                continue
            for where in wheres:
                # limits the where feats so they are different than base feats
                base_hashes = [f.hash() for f in new_f.base_features]
                if any([
                        base_feat.hash() in base_hashes
                        for base_feat in where.base_features
                ]):
                    continue
                new_f = AggregationFeature(matching_input,
                                           parent_entity=parent_entity,
                                           where=where,
                                           primitive=agg_prim)
                self._handle_new_feature(new_f, all_features)
def _build_agg_features(self, all_features, parent_entity, child_entity,
                        max_depth, relationship_path):
    """Build aggregation features of ``child_entity`` grouped onto
    ``parent_entity`` along ``relationship_path``.

    For each aggregation primitive not excluded by its primitive options,
    finds matching input features on the child entity, creates an
    AggregationFeature per matching input combination, and also creates
    where-clause variants subject to the stacking limit and the allowed
    where primitives.

    NOTE(review): this file also contains an older ``_build_agg_features``
    with a different signature (no ``relationship_path``); if both live in
    the same class the later definition shadows the earlier — confirm which
    variant is current.

    :param all_features: registry of features discovered so far (presumably
        keyed by entity — TODO confirm against _handle_new_feature).
    :param parent_entity: entity the aggregations are grouped onto.
    :param child_entity: entity whose features are aggregated.
    :param max_depth: remaining DFS depth; ``None`` means unlimited.
    :param relationship_path: path from parent to child used to build the
        features and to exclude features already on the path.
    """
    new_max_depth = None
    if max_depth is not None:
        new_max_depth = max_depth - 1
    for agg_prim in self.agg_primitives:
        current_options = self.primitive_options[agg_prim.name]
        # Skip primitives whose options exclude this child entity.
        if ignore_entity_for_primitive(current_options, child_entity):
            continue
        # if multiple input_types, only use first one for DFS
        input_types = agg_prim.input_types
        if type(input_types[0]) == list:
            input_types = input_types[0]

        def feature_filter(f):
            # Remove direct features of parent entity and features in
            # relationship path.
            return (not _direct_of_entity(f, parent_entity)) \
                and not self._feature_in_relationship_path(relationship_path, f)

        matching_inputs = self._get_matching_inputs(all_features,
                                                    child_entity,
                                                    new_max_depth,
                                                    input_types,
                                                    agg_prim,
                                                    current_options,
                                                    feature_filter=feature_filter)
        matching_inputs = filter_matches_by_options(matching_inputs,
                                                    current_options)
        wheres = list(self.where_clauses[child_entity.id])
        for matching_input in matching_inputs:
            if not check_stacking(agg_prim, matching_input):
                continue
            new_f = AggregationFeature(matching_input,
                                       parent_entity=parent_entity,
                                       relationship_path=relationship_path,
                                       primitive=agg_prim)
            self._handle_new_feature(new_f, all_features)
            # limit the stacking of where features:
            # count up the number of where features
            # in this feature and its dependencies
            feat_wheres = []
            for f in matching_input:
                if isinstance(f, AggregationFeature) and f.where is not None:
                    feat_wheres.append(f)
                for feat in f.get_dependencies(deep=True):
                    if (isinstance(feat, AggregationFeature) and
                            feat.where is not None):
                        feat_wheres.append(feat)
            if len(feat_wheres) >= self.where_stacking_limit:
                continue
            # limits the aggregation feature by the given allowed feature types
            # (only primitives listed in where_primitives get where-variants).
            if not any([
                    issubclass(type(agg_prim), type(primitive))
                    for primitive in self.where_primitives
            ]):
                continue
            for where in wheres:
                # limits the where feats so they are different than base feats
                base_names = [f.unique_name() for f in new_f.base_features]
                if any([
                        base_feat.unique_name() in base_names
                        for base_feat in where.base_features
                ]):
                    continue
                new_f = AggregationFeature(
                    matching_input,
                    parent_entity=parent_entity,
                    relationship_path=relationship_path,
                    where=where,
                    primitive=agg_prim)
                self._handle_new_feature(new_f, all_features)
def test_metadata(es, tmpdir):
    """Custom descriptions/templates via keyword arguments and via a JSON
    metadata file.

    NOTE(review): ``test_metadata`` is defined twice in this module; the
    later definition shadows the earlier one at import time — consider
    renaming one of the pair.
    """
    identity_feature_descriptions = {
        "sessions: device_name": "the name of the device used for each session",
        "customers: id": "the customer's id",
    }
    agg_feat = AggregationFeature(
        IdentityFeature(es["sessions"].ww["device_name"]), "customers", NumUnique)
    agg_description = (
        "The number of unique elements in the name of the device used for each "
        'session of all instances of "sessions" for each customer\'s id.'
    )
    assert describe_feature(
        agg_feat,
        feature_descriptions=identity_feature_descriptions) == agg_description

    transform_feat = GroupByTransformFeature(
        IdentityFeature(es["log"].ww["value"]),
        CumMean,
        IdentityFeature(es["log"].ww["session_id"]))
    transform_description = 'The running average of the "value" for each "session_id".'
    primitive_templates = {"cum_mean": "the running average of {}"}
    assert describe_feature(
        transform_feat,
        primitive_templates=primitive_templates) == transform_description

    custom_agg = AggregationFeature(
        IdentityFeature(es["log"].ww["zipcode"]), "sessions", Mode)
    auto_description = (
        'The most frequently occurring value of the "zipcode" of all '
        'instances of "log" for each "id" in "sessions".'
    )
    custom_agg_description = "the most frequently used zipcode"
    # A user-supplied description is capitalized and given a trailing period.
    custom_feature_description = (
        custom_agg_description[0].upper() + custom_agg_description[1:] + ".")
    feature_description_dict = {"sessions: MODE(log.zipcode)": custom_agg_description}
    assert describe_feature(custom_agg) == auto_description
    assert describe_feature(
        custom_agg,
        feature_descriptions=feature_description_dict) == custom_feature_description

    # The same configuration can be loaded from a metadata file.
    metadata = {
        "feature_descriptions": {**identity_feature_descriptions,
                                 **feature_description_dict},
        "primitive_templates": primitive_templates,
    }
    metadata_path = os.path.join(tmpdir, "description_metadata.json")
    with open(metadata_path, "w") as f:
        json.dump(metadata, f)
    assert describe_feature(agg_feat, metadata_file=metadata_path) == agg_description
    assert describe_feature(transform_feat, metadata_file=metadata_path) == transform_description
    assert describe_feature(custom_agg, metadata_file=metadata_path) == custom_feature_description