def _init_primitive_options(primitive_options, es):
    """Normalize user-supplied primitive options.

    Flattens tuple keys so each primitive maps to its own entry, wraps a
    single option dict in a one-element list, and rejects a primitive that
    appears under more than one key.

    Args:
        primitive_options (dict): maps a primitive name (or tuple of names)
            to an option dict, or to a list of option dicts (one per
            primitive input).
        es: entity set the options refer to; passed through to
            ``_init_option_dict``.

    Returns:
        dict: primitive name -> list of initialized option dicts.

    Raises:
        ValueError: if a per-input option list is given for an unknown
            primitive name.
        KeyError: if the same primitive appears under multiple keys.
    """
    # Flatten all tuple keys, convert value lists into sets, check for
    # conflicting keys
    flattened_options = {}
    for primitive_key, options in primitive_options.items():
        if isinstance(options, list):
            # A list of options means one option dict per primitive input,
            # so the primitive must be looked up to validate the count.
            primitive = primitives.get_aggregation_primitives().get(primitive_key) or \
                primitives.get_transform_primitives().get(primitive_key)
            # Bug fix: an unknown name previously fell through to an opaque
            # AttributeError on ``primitive.input_types``.
            if primitive is None:
                raise ValueError("Unknown primitive with name '{}'".format(primitive_key))
            assert len(primitive.input_types[0]) == len(options) if \
                isinstance(primitive.input_types[0], list) else \
                len(primitive.input_types) == len(options), \
                "Number of options does not match number of inputs for primitive %s" \
                % (primitive_key)
            options = [
                _init_option_dict(primitive_key, option, es) for option in options
            ]
        else:
            options = [_init_option_dict(primitive_key, options, es)]
        if not isinstance(primitive_key, tuple):
            primitive_key = (primitive_key, )
        for each_primitive in primitive_key:
            # if primitive is specified more than once, raise error
            if each_primitive in flattened_options:
                raise KeyError('Multiple options found for primitive %s'
                               % (each_primitive))
            # NOTE: every primitive in a tuple key shares the same options list.
            flattened_options[each_primitive] = options
    return flattened_options
def test_init_and_name(es):
    """Every non-Compare transform primitive can be instantiated, named, and computed."""
    log_entity = es['log']
    candidates = [Feature(variable) for variable in log_entity.variables]
    candidates.append(GreaterThan(Feature(es["products"]["rating"], es["log"]), 2.5))
    # Add Timedelta feature
    candidates.append(pd.Timestamp.now() - Feature(log_entity['datetime']))
    for primitive in get_transform_primitives():
        # Compare primitives are skipped here
        if issubclass(primitive, Compare):
            continue
        # use the input_types matching function from DFS
        accepted = primitive.input_types
        if type(accepted[0]) == list:
            matches = []
            for type_set in accepted:
                matches.extend(match(type_set, candidates))
        else:
            matches = match(accepted, candidates)
        if not matches:
            raise Exception("Transform Primitive %s not tested" % primitive.name)
        for combo in matches:
            feat = primitive(*combo)
            # try to get name and calculate
            feat.get_name()
            feat.head()
def test_init_and_name(es):
    """Build, name, and compute a Feature for every transform primitive."""
    log = es['log']
    rating = ft.Feature(es["products"]["rating"], es["log"])
    candidate_features = [ft.Feature(v) for v in log.variables]
    candidate_features.append(ft.Feature(rating, primitive=GreaterThanScalar(2.5)))
    # Add Timedelta feature
    # features.append(pd.Timestamp.now() - ft.Feature(log['datetime']))
    for primitive in get_transform_primitives().values():
        # skip automated testing if a few special cases
        if primitive in [NotEqual, Equal]:
            continue
        # use the input_types matching function from DFS
        accepted_types = primitive.input_types
        if type(accepted_types[0]) == list:
            matches = match(accepted_types[0], candidate_features)
        else:
            matches = match(accepted_types, candidate_features)
        if not matches:
            raise Exception("Transform Primitive %s not tested" % primitive.name)
        for inputs in matches:
            feature = ft.Feature(inputs, primitive=primitive)
            # try to get name and calculate
            feature.get_name()
            ft.calculate_feature_matrix([feature], entityset=es).head(5)
def test_init_and_name(es):
    """Construct, name, and evaluate each non-Compare transform primitive."""
    log = es['log']
    feats = [Feature(variable) for variable in log.variables]
    feats.append(GreaterThan(Feature(es["products"]["rating"], es["log"]), 2.5))
    # Add Timedelta feature
    feats.append(pd.Timestamp.now() - Feature(log['datetime']))
    for primitive in get_transform_primitives().values():
        # Compare primitives are exercised elsewhere
        if issubclass(primitive, Compare):
            continue
        # use the input_types matching function from DFS
        wanted = primitive.input_types
        if type(wanted[0]) == list:
            matched = []
            for type_set in wanted:
                matched.extend(match(type_set, feats))
        else:
            matched = match(wanted, feats)
        if not matched:
            raise Exception("Transform Primitive %s not tested" % primitive.name)
        for input_combo in matched:
            feat = primitive(*input_combo)
            # try to get name and calculate
            feat.get_name()
            feat.head()
def test_init_and_name(es):
    """Check each transform primitive has a name, then build and compute it."""
    log = es['log']
    rating = ft.Feature(ft.IdentityFeature(es["products"].ww["rating"]), "log")
    log_features = [ft.Feature(es['log'].ww[col]) for col in log.columns]
    log_features.append(ft.Feature(rating, primitive=GreaterThanScalar(2.5)))
    log_features.append(ft.Feature(rating, primitive=GreaterThanScalar(3.5)))
    # Add Timedelta feature
    # features.append(pd.Timestamp.now() - ft.Feature(log['datetime']))
    customers_features = [
        ft.Feature(es["customers"].ww[col]) for col in es["customers"].columns
    ]

    # check all transform primitives have a name
    for attribute_string in dir(ft.primitives):
        attr = getattr(ft.primitives, attribute_string)
        if isclass(attr) and issubclass(attr, TransformPrimitive) and attr != TransformPrimitive:
            assert getattr(attr, "name") is not None

    candidates = get_transform_primitives().values()
    # If Dask EntitySet use only Dask compatible primitives
    if es.dataframe_type == Library.DASK.value:
        candidates = [p for p in candidates if Library.DASK in p.compatibility]
    if es.dataframe_type == Library.KOALAS.value:
        candidates = [p for p in candidates if Library.KOALAS in p.compatibility]

    for primitive in candidates:
        # skip automated testing if a few special cases
        if primitive in [NotEqual, Equal]:
            continue
        # Age needs datetime-of-birth style columns from customers
        features_to_use = customers_features if primitive in [Age] else log_features
        # use the input_types matching function from DFS
        input_types = primitive.input_types
        if type(input_types[0]) == list:
            matching_inputs = match(input_types[0], features_to_use)
        else:
            matching_inputs = match(input_types, features_to_use)
        if not matching_inputs:
            raise Exception("Transform Primitive %s not tested" % primitive.name)
        for matched in matching_inputs:
            instance = ft.Feature(matched, primitive=primitive)
            # try to get name and calculate
            instance.get_name()
            ft.calculate_feature_matrix([instance], entityset=es)
def test_summarize_primitives():
    """Summary counts agree with the registered primitive totals."""
    summary = summarize_primitives()
    n_transform = len(get_transform_primitives())
    n_aggregation = len(get_aggregation_primitives())
    counts = summary["Count"]
    # Row order: total, aggregation, transform
    assert counts.iloc[0] == n_transform + n_aggregation
    assert counts.iloc[1] == n_aggregation
    assert counts.iloc[2] == n_transform
def test_list_primitives_order():
    """Every registered primitive appears in list_primitives with its description."""
    df = list_primitives()
    registry = dict(get_transform_primitives())
    registry.update(get_aggregation_primitives())
    listed_names = df['name'].values
    for prim_name, prim in registry.items():
        assert prim_name in listed_names
        row = df.loc[df['name'] == prim_name].iloc[0]
        expected_desc = _get_descriptions([prim])[0]
        if expected_desc:
            assert expected_desc == row['description']
    listed_types = df['type'].values
    assert 'aggregation' in listed_types
    assert 'transform' in listed_types
def check_trans_primitive(primitive):
    """Resolve and validate a transform primitive given by name or class/instance.

    Args:
        primitive (str or TransformPrimitive): primitive name
            (case-insensitive) or a primitive class/instance.

    Returns:
        TransformPrimitive: an initialized transform primitive instance.

    Raises:
        ValueError: if the name is unknown, or the resolved primitive is not
            a transform primitive.
    """
    trans_prim_dict = primitives.get_transform_primitives()

    if isinstance(primitive, str):
        if primitive.lower() not in trans_prim_dict:
            # Bug fix: the message parts were previously passed to ValueError
            # as three separate arguments instead of one concatenated string.
            raise ValueError("Unknown transform primitive {}. ".format(primitive) +
                             "Call ft.primitives.list_primitives() to get" +
                             " a list of available primitives")
        primitive = trans_prim_dict[primitive.lower()]
    primitive = handle_primitive(primitive)
    if not isinstance(primitive, TransformPrimitive):
        raise ValueError("Primitive {} in trans_primitives or "
                         "groupby_trans_primitives is not a transform "
                         "primitive".format(type(primitive)))
    return primitive
def _init_primitive_options(primitive_options, es):
    """Normalize user-supplied primitive options into a flat per-primitive dict.

    Tuple keys are flattened so each primitive maps to its own entry; a
    single option dict is wrapped in a one-element list; a list of option
    dicts (one per primitive input) is validated against the primitive's
    input count. Raises ValueError for an unknown primitive name and
    KeyError when a primitive is specified under more than one key.
    """
    # Flatten all tuple keys, convert value lists into sets, check for
    # conflicting keys
    flattened_options = {}
    for primitive_keys, options in primitive_options.items():
        # Normalize a single key to a one-element tuple so both cases share code.
        if not isinstance(primitive_keys, tuple):
            primitive_keys = (primitive_keys, )
        if isinstance(options, list):
            # A list of options means one option dict per primitive input;
            # validate the count against every primitive named in the key.
            for primitive_key in primitive_keys:
                if isinstance(primitive_key, str):
                    # Look up the primitive class by name (aggregation first,
                    # then transform).
                    primitive = primitives.get_aggregation_primitives().get(
                        primitive_key) or primitives.get_transform_primitives(
                    ).get(primitive_key)
                    if not primitive:
                        msg = "Unknown primitive with name '{}'".format(
                            primitive_key)
                        raise ValueError(msg)
                else:
                    # Key is already a primitive class/instance.
                    primitive = primitive_key
                assert (
                    len(primitive.input_types[0]) == len(options)
                    if isinstance(primitive.input_types[0], list) else len(
                        primitive.input_types) == len(options)
                ), ("Number of options does not match number of inputs for primitive %s"
                    % (primitive_key))
            options = [
                _init_option_dict(primitive_keys, option, es)
                for option in options
            ]
        else:
            # Single option dict applies to all inputs.
            options = [_init_option_dict(primitive_keys, options, es)]
        for primitive in primitive_keys:
            # Primitive classes are keyed by their registered name.
            if isinstance(primitive, type):
                primitive = primitive.name
            # if primitive is specified more than once, raise error
            if primitive in flattened_options:
                raise KeyError("Multiple options found for primitive %s" %
                               (primitive))
            # NOTE: primitives sharing a tuple key share the same options list.
            flattened_options[primitive] = options
    return flattened_options
def test_init_and_name(es):
    """Build, name, and compute a feature for each compatible transform primitive."""
    log = es['log']
    rating = ft.Feature(es["products"]["rating"], es["log"])
    log_features = [ft.Feature(v) for v in log.variables]
    log_features.append(ft.Feature(rating, primitive=GreaterThanScalar(2.5)))
    # Add Timedelta feature
    # features.append(pd.Timestamp.now() - ft.Feature(log['datetime']))
    customers_features = [ft.Feature(v) for v in es["customers"].variables]

    candidates = get_transform_primitives().values()
    # If Dask EntitySet use only Dask compatible primitives
    if isinstance(es['log'].df, dd.DataFrame):
        candidates = [p for p in candidates if Library.DASK in p.compatibility]
    if ks and isinstance(es['log'].df, ks.DataFrame):
        candidates = [p for p in candidates if Library.KOALAS in p.compatibility]

    for primitive in candidates:
        # skip automated testing if a few special cases
        if primitive in [NotEqual, Equal]:
            continue
        # Age requires the customers entity's columns
        features_to_use = customers_features if primitive in [Age] else log_features
        # use the input_types matching function from DFS
        input_types = primitive.input_types
        if type(input_types[0]) == list:
            matching_inputs = match(input_types[0], features_to_use)
        else:
            matching_inputs = match(input_types, features_to_use)
        if not matching_inputs:
            raise Exception("Transform Primitive %s not tested" % primitive.name)
        for matched in matching_inputs:
            instance = ft.Feature(matched, primitive=primitive)
            # try to get name and calculate
            instance.get_name()
            ft.calculate_feature_matrix([instance], entityset=es)
def test_list_primitives_order():
    """list_primitives rows carry name, description, compatibility, and I/O types."""
    df = list_primitives()
    registry = dict(get_transform_primitives())
    registry.update(get_aggregation_primitives())
    for prim_name, prim in registry.items():
        assert prim_name in df["name"].values
        row = df.loc[df["name"] == prim_name].iloc[0]
        description = _get_descriptions([prim])[0]
        if description:
            assert description == row["description"]
        assert row["dask_compatible"] == (Library.DASK in prim.compatibility)
        expected_inputs = ", ".join(_get_unique_input_types(prim.input_types))
        assert row["valid_inputs"] == expected_inputs
        assert row["return_type"] == getattr(prim.return_type, "__name__", None)
    prim_types = df["type"].values
    assert "aggregation" in prim_types
    assert "transform" in prim_types
def test_list_primitives_order():
    """Verify list_primitives content for every registered primitive."""
    df = list_primitives()
    registry = dict(get_transform_primitives())
    registry.update(get_aggregation_primitives())
    for prim_name, prim in registry.items():
        assert prim_name in df['name'].values
        row = df.loc[df['name'] == prim_name].iloc[0]
        desc = _get_descriptions([prim])[0]
        if desc:
            assert desc == row['description']
        assert row['dask_compatible'] == (Library.DASK in prim.compatibility)
        valid_inputs = ', '.join(_get_names_valid_inputs(prim.input_types))
        assert row['valid_inputs'] == valid_inputs
        assert row['return_type'] == getattr(prim.return_type, '__name__', None)
    kinds = df['type'].values
    assert 'aggregation' in kinds
    assert 'transform' in kinds
def test_init_and_name(es):
    """Instantiate, name, and compute every transform primitive on the log entity."""
    from featuretools import calculate_feature_matrix
    log = es['log']
    candidate_features = [Feature(v) for v in log.variables]
    candidate_features.append(
        GreaterThan(Feature(es["products"]["rating"], es["log"]), 2.5))
    # Add Timedelta feature
    candidate_features.append(pd.Timestamp.now() - Feature(log['datetime']))
    for primitive in get_transform_primitives().values():
        # use the input_types matching function from DFS
        accepted = primitive.input_types
        if type(accepted[0]) == list:
            matching_inputs = match(accepted[0], candidate_features)
        else:
            matching_inputs = match(accepted, candidate_features)
        if not matching_inputs:
            raise Exception("Transform Primitive %s not tested" % primitive.name)
        for inputs in matching_inputs:
            instance = primitive(*inputs)
            # try to get name and calculate
            instance.get_name()
            calculate_feature_matrix([instance], entityset=es).head(5)
def test_init_and_name(es):
    """Each transform primitive can be constructed and evaluated end-to-end."""
    from featuretools import calculate_feature_matrix
    log = es['log']
    feats = []
    for v in log.variables:
        feats.append(Feature(v))
    feats.append(GreaterThan(Feature(es["products"]["rating"], es["log"]), 2.5))
    # Add Timedelta feature
    feats.append(pd.Timestamp.now() - Feature(log['datetime']))
    for prim in get_transform_primitives().values():
        # use the input_types matching function from DFS
        types_needed = prim.input_types
        if type(types_needed[0]) == list:
            found = match(types_needed[0], feats)
        else:
            found = match(types_needed, feats)
        if len(found) == 0:
            raise Exception("Transform Primitive %s not tested" % prim.name)
        for combo in found:
            instance = prim(*combo)
            # try to get name and calculate
            instance.get_name()
            calculate_feature_matrix([instance], entityset=es).head(5)
import pandas as pd import pytest import featuretools as ft from featuretools.primitives import get_aggregation_primitives, get_transform_primitives from featuretools.utils.gen_utils import Library UNSUPPORTED = [ p.name for p in get_transform_primitives().values() if Library.DASK not in p.compatibility ] UNSUPPORTED += [ p.name for p in get_aggregation_primitives().values() if Library.DASK not in p.compatibility ] def test_transform(pd_es, dask_es): pytest.skip( "TODO: Dask issue with `series.eq`. Fix once Dask Issue #7957 is closed." ) primitives = ft.list_primitives() trans_list = primitives[primitives["type"] == "transform"]["name"].tolist() trans_primitives = [prim for prim in trans_list if prim not in UNSUPPORTED] agg_primitives = [] cutoff_time = pd.Timestamp("2019-01-05 04:00") assert pd_es == dask_es # Run DFS using each dataframe as a target and confirm results match for df in pd_es.dataframes:
def __init__(
    self,
    target_dataframe_name,
    entityset,
    agg_primitives=None,
    trans_primitives=None,
    where_primitives=None,
    groupby_trans_primitives=None,
    max_depth=2,
    max_features=-1,
    allowed_paths=None,
    ignore_dataframes=None,
    ignore_columns=None,
    primitive_options=None,
    seed_features=None,
    drop_contains=None,
    drop_exact=None,
    where_stacking_limit=1,
):
    """Configure feature synthesis for ``target_dataframe_name`` in ``entityset``.

    Validates the target dataframe, normalizes depth/ignore settings,
    resolves each primitive list (falling back to library defaults filtered
    by the entityset's dataframe library), rejects primitives incompatible
    with that library, and expands ``primitive_options``.

    Raises:
        KeyError: if the target dataframe is not in the entityset.
        TypeError: if ``ignore_dataframes``/``ignore_columns`` have the
            wrong structure.
        ValueError: if any selected primitive is incompatible with the
            entityset's dataframe library.
    """
    if target_dataframe_name not in entityset.dataframe_dict:
        es_name = entityset.id or "entity set"
        msg = "Provided target dataframe %s does not exist in %s" % (
            target_dataframe_name,
            es_name,
        )
        raise KeyError(msg)

    # need to change max_depth to None because DFs terminates when <0
    if max_depth == -1:
        max_depth = None

    # if just one dataframe, set max depth to 1 (transform stacking rule)
    if len(entityset.dataframe_dict) == 1 and (max_depth is None or max_depth > 1):
        warnings.warn(
            "Only one dataframe in entityset, changing max_depth to "
            "1 since deeper features cannot be created")
        max_depth = 1

    self.max_depth = max_depth
    self.max_features = max_features

    # Store allowed relationship paths as a set of tuples for O(1) lookup.
    self.allowed_paths = allowed_paths
    if self.allowed_paths:
        self.allowed_paths = set()
        for path in allowed_paths:
            self.allowed_paths.add(tuple(path))

    if ignore_dataframes is None:
        self.ignore_dataframes = set()
    else:
        if not isinstance(ignore_dataframes, list):
            raise TypeError("ignore_dataframes must be a list")
        assert (target_dataframe_name
                not in ignore_dataframes), "Can't ignore target_dataframe!"
        self.ignore_dataframes = set(ignore_dataframes)

    self.ignore_columns = defaultdict(set)
    if ignore_columns is not None:
        # check if ignore_columns is not {str: list}
        if not all(isinstance(i, str) for i in ignore_columns.keys()) or not all(
                isinstance(i, list) for i in ignore_columns.values()):
            raise TypeError("ignore_columns should be dict[str -> list]")
        # check if list values are all of type str
        elif not all(
                all(isinstance(v, str) for v in value)
                for value in ignore_columns.values()):
            raise TypeError("list values should be of type str")
        for df_name, cols in ignore_columns.items():
            self.ignore_columns[df_name] = set(cols)
    self.target_dataframe_name = target_dataframe_name
    self.es = entityset

    # Determine which dataframe library (pandas/Dask/Koalas) backs the entityset.
    for library in Library:
        if library.value == self.es.dataframe_type:
            df_library = library
            break

    aggregation_primitive_dict = primitives.get_aggregation_primitives()
    transform_primitive_dict = primitives.get_transform_primitives()

    # Each primitive list falls back to the library defaults, filtered to
    # primitives compatible with the detected dataframe library.
    if agg_primitives is None:
        agg_primitives = [
            p for p in primitives.get_default_aggregation_primitives()
            if df_library in p.compatibility
        ]
    self.agg_primitives = []
    self.agg_primitives = sorted([
        check_primitive(
            p,
            "aggregation",
            aggregation_primitive_dict,
            transform_primitive_dict,
        ) for p in agg_primitives
    ])

    if trans_primitives is None:
        trans_primitives = [
            p for p in primitives.get_default_transform_primitives()
            if df_library in p.compatibility
        ]
    self.trans_primitives = sorted([
        check_primitive(p, "transform", aggregation_primitive_dict,
                        transform_primitive_dict) for p in trans_primitives
    ])

    if where_primitives is None:
        where_primitives = [primitives.Count]
    self.where_primitives = sorted([
        check_primitive(p, "where", aggregation_primitive_dict,
                        transform_primitive_dict) for p in where_primitives
    ])

    if groupby_trans_primitives is None:
        groupby_trans_primitives = []
    self.groupby_trans_primitives = sorted([
        check_primitive(
            p,
            "groupby transform",
            aggregation_primitive_dict,
            transform_primitive_dict,
        ) for p in groupby_trans_primitives
    ])

    if primitive_options is None:
        primitive_options = {}
    all_primitives = (self.trans_primitives + self.agg_primitives +
                      self.where_primitives + self.groupby_trans_primitives)
    # Reject any explicitly requested primitive the backing library can't run.
    bad_primitives = [
        prim.name for prim in all_primitives
        if df_library not in prim.compatibility
    ]
    if bad_primitives:
        msg = "Selected primitives are incompatible with {} EntitySets: {}"
        raise ValueError(
            msg.format(df_library.value, ", ".join(bad_primitives)))

    # Expand per-primitive options; this may extend the ignore sets.
    (
        self.primitive_options,
        self.ignore_dataframes,
        self.ignore_columns,
    ) = generate_all_primitive_options(
        all_primitives,
        primitive_options,
        self.ignore_dataframes,
        self.ignore_columns,
        self.es,
    )
    # Sort seed features by unique name for deterministic ordering.
    self.seed_features = sorted(seed_features or [],
                                key=lambda f: f.unique_name())
    self.drop_exact = drop_exact or []
    self.drop_contains = drop_contains or []
    self.where_stacking_limit = where_stacking_limit
def test_trans_primitives_can_init_without_params():
    """Each registered transform primitive is constructible with default arguments."""
    for primitive_class in get_transform_primitives().values():
        primitive_class()