def list_primitives():
    """Print the full, untruncated table of available Featuretools primitives."""
    # Shared display options: show every row/column on a wide terminal.
    base_options = [
        'display.max_rows', None,
        'display.max_columns', None,
        'display.width', 1000,
    ]
    try:
        # Older pandas used -1 to mean "no column width limit".
        with pd.option_context(*base_options, 'display.max_colwidth', -1):
            print(featuretools.list_primitives())
    except ValueError:
        # Newer pandas rejects -1 and expects None instead.
        with pd.option_context(*base_options, 'display.max_colwidth', None):
            print(featuretools.list_primitives())
def list_primitives():
    """Print every available Featuretools primitive without truncating output."""

    def _print_catalog(colwidth):
        # Lift all display limits so the whole table is visible at once.
        with pd.option_context(
            "display.max_rows", None,
            "display.max_columns", None,
            "display.max_colwidth", colwidth,
            "display.width", 1000,
        ):
            print(featuretools.list_primitives())

    try:
        # Legacy pandas spelled "unlimited column width" as -1.
        _print_catalog(-1)
    except ValueError:
        # pandas >= 1.0 raises on -1 and uses None for "unlimited".
        _print_catalog(None)
def list_feature_primitives(self):
    """Return the catalog of built-in Featuretools primitives.

    Returns:
        pd.DataFrame: one row per built-in primitive, listing and
        describing each one.
    """
    catalog = ft.list_primitives()
    return catalog
def test_aggregation(pd_es, dask_es):
    """DFS with aggregation primitives should match between pandas and Dask backends."""
    catalog = ft.list_primitives()
    agg_names = catalog.loc[catalog['type'] == 'aggregation', 'name'].tolist()
    supported_aggs = [name for name in agg_names if name not in UNSUPPORTED]

    assert pd_es == dask_es

    # Run DFS using each entity as a target and confirm results match.
    for entity in pd_es.entities:
        shared_kwargs = dict(
            target_entity=entity.id,
            trans_primitives=[],
            agg_primitives=supported_aggs,
            cutoff_time=pd.Timestamp("2019-01-05 04:00"),
            max_depth=2,
        )
        fm, _ = ft.dfs(entityset=pd_es, **shared_kwargs)
        dask_fm, _ = ft.dfs(entityset=dask_es, **shared_kwargs)

        # Use the same columns and make sure both indexes are sorted the same.
        aligned = dask_fm.compute().set_index(entity.index).loc[fm.index][fm.columns]
        pd.testing.assert_frame_equal(fm, aligned, check_dtype=False)
def add_agg_primitives(self, agg):
    """Append items from ``agg`` to the aggregation primitives used in DFS.

    Aggregation primitives must be available in the Featuretools library;
    unknown names are reported and skipped.

    Args:
        agg: list of primitive name strings.
    """
    # Fetch the primitive catalog once (the original called
    # ft.list_primitives() twice, re-building the table needlessly) and
    # use a set for O(1) membership tests.
    catalog = ft.list_primitives()
    available = set(catalog.loc[catalog['type'] == 'aggregation', 'name'])
    for i in agg:
        if i in available:
            self.agg_primitives.append(i)
        else:
            print(i, "is not in the available aggregate primitives.")
    print("The aggregate primitives have been added: ", *self.agg_primitives)
def add_trans_primitives(self, trans):
    """Append items from ``trans`` to the transform primitives used in DFS.

    Transform primitives must be available in the Featuretools library;
    unknown names are reported and skipped.

    Args:
        trans: list of primitive name strings.
    """
    # Fetch the primitive catalog once (the original called
    # ft.list_primitives() twice per invocation) and use a set for
    # O(1) membership tests.
    catalog = ft.list_primitives()
    available = set(catalog.loc[catalog['type'] == 'transform', 'name'])
    for i in trans:
        if i in available:
            self.trans_primitives.append(i)
        else:
            print(i, "is not in the available transform primitives.")
    print("The transformative primitives have been added: ", *self.trans_primitives)
def _get_primitive_hyperparams():
    """Get one boolean hyperparam object for each available featuretools primitive.

    The hyperparameter will be named {primitive_type}_{primitive_name}, will have
    the primitive description, and will default to True or False depending on
    whether the primitive name is in the DEFAULT_PRIMITIVES list.

    An example of such a primitive is::

        aggregation_max = hyperparams.Hyperparameter[bool](
            description='Finds the maximum non-null value of a numeric feature.',
            default=True,
            semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
        )

    Returns:
        dict containing hyperparameter names as keys and hyperparameter objects
        as values, sorted by hyperparameter name.
    """
    primitive_hyperparams = dict()
    primitives = ft.list_primitives()
    for _, primitive in primitives.iterrows():
        primitive_name = primitive['name']
        # Guard clause: only the supported subset of primitives gets a hyperparam.
        if primitive_name not in ALL_PRIMITIVES:
            continue
        hyperparam_name = '{}_{}'.format(primitive['type'], primitive_name)
        hyperparam = hyperparams.Hyperparameter[bool](
            default=primitive_name in DEFAULT_PRIMITIVES,
            description=primitive['description'],
            semantic_types=[
                'https://metadata.datadrivendiscovery.org/types/TuningParameter'
            ])
        primitive_hyperparams[hyperparam_name] = hyperparam
    return dict(sorted(primitive_hyperparams.items()))
def test_aggregation(pd_es, dask_es):
    """DFS aggregation output should be identical on pandas and Dask entity sets."""
    catalog = ft.list_primitives()
    agg_names = catalog.loc[catalog['type'] == 'aggregation', 'name'].tolist()
    supported_aggs = [name for name in agg_names if name not in UNSUPPORTED]

    assert pd_es == dask_es

    # Run DFS using each dataframe as a target and confirm results match.
    for df in pd_es.dataframes:
        shared_kwargs = dict(
            target_dataframe_name=df.ww.name,
            trans_primitives=[],
            agg_primitives=supported_aggs,
            cutoff_time=pd.Timestamp("2019-01-05 04:00"),
            max_depth=2,
        )
        fm, _ = ft.dfs(entityset=pd_es, **shared_kwargs)
        dask_fm, _ = ft.dfs(entityset=dask_es, **shared_kwargs)

        # Categorical categories can be ordered differently; align dtypes first.
        dask_fm = dask_fm.astype(fm.dtypes)
        # Use the same columns and make sure both indexes are sorted the same.
        aligned = dask_fm.compute().set_index(df.ww.index).loc[fm.index][fm.columns]
        pd.testing.assert_frame_equal(fm, aligned)
def listprimitives():
    """Return the first 78 transform primitives from the Featuretools catalog.

    Returns:
        pd.DataFrame: up to 78 rows of the primitive catalog where
        ``type == 'transform'``.
    """
    # NOTE: the original had a second `return` (aggregation primitives)
    # after this one — unreachable dead code — plus commented-out display
    # tweaks; both removed.
    primitives = ft.list_primitives()
    return primitives[primitives['type'] == 'transform'].head(78)
def es_set(self):
    """Run DFS on the stored entity set targeting 'app' and return the feature matrix."""
    print("Generating Features...\n")
    # Preview a sample of the available aggregation primitives.
    primitives = ft.list_primitives()
    pd.options.display.max_colwidth = 100
    agg_sample = primitives[primitives['type'] == 'aggregation'].head(10)
    print("feature primitives:", agg_sample)
    self.__feature_matrix, self.__feature_defs = ft.dfs(
        entityset=self.__es,
        target_entity="app",
        verbose=True)
    return self.__feature_matrix
def test_transform(pd_es, dask_es):
    """Transform-primitive DFS should agree between pandas and Dask backends."""
    # Entire test is skipped until the upstream Dask bug is resolved.
    pytest.skip(
        "TODO: Dask issue with `series.eq`. Fix once Dask Issue #7957 is closed."
    )
    catalog = ft.list_primitives()
    transform_names = catalog[catalog["type"] == "transform"]["name"].tolist()
    trans_primitives = [name for name in transform_names if name not in UNSUPPORTED]
    agg_primitives = []
    cutoff_time = pd.Timestamp("2019-01-05 04:00")

    assert pd_es == dask_es

    # Run DFS using each dataframe as a target and confirm results match.
    for df in pd_es.dataframes:
        shared_kwargs = dict(
            target_dataframe_name=df.ww.name,
            trans_primitives=trans_primitives,
            agg_primitives=agg_primitives,
            max_depth=2,
            features_only=True,
        )
        features = ft.dfs(entityset=pd_es, **shared_kwargs)
        dask_features = ft.dfs(entityset=dask_es, **shared_kwargs)
        assert features == dask_features

        # Calculate feature matrix values to confirm output is the same between
        # dask and pandas. Not testing on all returned features due to long run times.
        fm = ft.calculate_feature_matrix(features=features[:100],
                                         entityset=pd_es,
                                         cutoff_time=cutoff_time)
        dask_fm = ft.calculate_feature_matrix(features=dask_features[:100],
                                              entityset=dask_es,
                                              cutoff_time=cutoff_time)
        # Categorical categories can be ordered differently; align dtypes first.
        dask_fm = dask_fm.astype(fm.dtypes)
        # Use the same columns and make sure both indexes are sorted the same.
        aligned = dask_fm.compute().set_index(df.ww.index).loc[fm.index][fm.columns]
        pd.testing.assert_frame_equal(fm, aligned)
def test_list_primitives_order():
    """list_primitives() should include every registered primitive and its description."""
    df = list_primitives()
    registered = get_transform_primitives()
    registered.update(get_aggregation_primitives())

    for name, primitive in registered.items():
        assert name in df['name'].values
        row = df.loc[df['name'] == name].iloc[0]
        description = _get_descriptions([primitive])[0]
        if description:
            # Only compare when the primitive actually provides a description.
            assert description == row['description']

    listed_types = df['type'].values
    assert 'aggregation' in listed_types
    assert 'transform' in listed_types
def add_where_primitives(self, where):
    """Append items from ``where`` to the where primitives used in DFS.

    Where primitives are applied to specified interesting_values to build
    conditional features and may be aggregation or transform primitives.

    Args:
        where: list of primitive name strings.
    """
    known_names = ft.list_primitives()['name'].values
    for name in where:
        if name not in known_names:
            print(name, "is not in the available primitives")
        else:
            self.where_primitives.append(name)
    print("The where primitives have been added: ", *self.where_primitives)
def test_list_primitives_order():
    """Every registered primitive appears in list_primitives() with correct metadata."""
    df = list_primitives()
    registered = get_transform_primitives()
    registered.update(get_aggregation_primitives())

    for name, primitive in registered.items():
        assert name in df["name"].values
        row = df.loc[df["name"] == name].iloc[0]
        description = _get_descriptions([primitive])[0]
        if description:
            assert description == row["description"]
        # Metadata columns must mirror the primitive's declared attributes.
        assert row["dask_compatible"] == (Library.DASK in primitive.compatibility)
        expected_inputs = ", ".join(_get_unique_input_types(primitive.input_types))
        assert row["valid_inputs"] == expected_inputs
        assert row["return_type"] == getattr(primitive.return_type, "__name__", None)

    listed_types = df["type"].values
    assert "aggregation" in listed_types
    assert "transform" in listed_types
def test_list_primitives_order():
    """Every registered primitive appears in list_primitives() with correct metadata."""
    df = list_primitives()
    registered = get_transform_primitives()
    registered.update(get_aggregation_primitives())

    for name, primitive in registered.items():
        assert name in df['name'].values
        row = df.loc[df['name'] == name].iloc[0]
        description = _get_descriptions([primitive])[0]
        if description:
            assert description == row['description']
        # Metadata columns must mirror the primitive's declared attributes.
        assert row['dask_compatible'] == (Library.DASK in primitive.compatibility)
        expected_inputs = ', '.join(_get_names_valid_inputs(primitive.input_types))
        assert row['valid_inputs'] == expected_inputs
        assert row['return_type'] == getattr(primitive.return_type, '__name__', None)

    listed_types = df['type'].values
    assert 'aggregation' in listed_types
    assert 'transform' in listed_types
def test_transform(pd_es, dask_es):
    """Transform-primitive DFS should agree between pandas and Dask entity sets."""
    catalog = ft.list_primitives()
    transform_names = catalog[catalog['type'] == 'transform']['name'].tolist()
    trans_primitives = [name for name in transform_names if name not in UNSUPPORTED]
    agg_primitives = []
    cutoff_time = pd.Timestamp("2019-01-05 04:00")

    assert pd_es == dask_es

    # Run DFS using each entity as a target and confirm results match.
    for entity in pd_es.entities:
        shared_kwargs = dict(
            target_entity=entity.id,
            trans_primitives=trans_primitives,
            agg_primitives=agg_primitives,
            max_depth=2,
            features_only=True,
        )
        features = ft.dfs(entityset=pd_es, **shared_kwargs)
        dask_features = ft.dfs(entityset=dask_es, **shared_kwargs)
        assert features == dask_features

        # Calculate feature matrix values to confirm output is the same between
        # dask and pandas. Not testing on all returned features due to long run times.
        fm = ft.calculate_feature_matrix(features=features[:100],
                                         entityset=pd_es,
                                         cutoff_time=cutoff_time)
        dask_fm = ft.calculate_feature_matrix(features=dask_features[:100],
                                              entityset=dask_es,
                                              cutoff_time=cutoff_time)
        # Use the same columns and make sure both indexes are sorted the same.
        aligned = dask_fm.compute().set_index(entity.index).loc[fm.index][fm.columns]
        pd.testing.assert_frame_equal(fm, aligned)
# python 2 from funcsigs import signature import featuretools as ft import pytest from featuretools import ( calculate_feature_matrix, dfs, list_primitives, load_features, save_features, ) from featuretools.tests.testing_utils import make_ecommerce_entityset ft.primitives._load_primitives() PRIMITIVES = list_primitives() class PrimitiveT: primitive = None @pytest.fixture(autouse=True, scope="session") def es(self): es = make_ecommerce_entityset() return es def test_name_and_desc(self): assert self.primitive.name is not None assert self.primitive.__doc__ is not None docstring = self.primitive.__doc__ short_description = docstring.splitlines()[0]
import pandas as pd
import numpy as np
import featuretools as ft
# FIX: load_iris was called but never imported (NameError at runtime).
from sklearn.datasets import load_iris

if __name__ == "__main__":
    dataset = load_iris()
    X = dataset.data
    y = dataset.target
    iris_feature_names = dataset.feature_names
    df = pd.DataFrame(X, columns=iris_feature_names)

    es = ft.EntitySet(id='single_dataframe')  # identify the entity set by id
    # Register the dataframe as an entity named "iris".
    es.entity_from_dataframe(entity_id='iris',
                             dataframe=df,
                             index='index',
                             make_index=True)
    # Pairwise add/subtract/multiply/divide columns to generate new features.
    # FIX: removed the stray double comma that made this line a SyntaxError.
    trans_primitives = ['add_numeric', 'subtract_numeric',
                        'multiply_numeric', 'divide_numeric']
    feature_matrix, feature_names = ft.dfs(
        entityset=es,
        target_entity='iris',
        max_depth=1,  # depth 1: only derive features from the original columns
        verbose=1,
        trans_primitives=trans_primitives,
    )
    ft.list_primitives()  # inspect the available primitive catalog
    # features_df = pd.DataFrame(feature_matrix, columns=feature_names)
    # print(features_df.head())
    print(feature_matrix)
class FTTimeSeriesBuilder:
    """
    Scikit-learn-style feature builder based on featuretools.

    Args:
        num_features: The (maximum) number of features to build.
        memory: How much back in time you want to go until the
            feature builder starts "forgetting" data.
        column_id: The name of the column containing the ids.
        time_stamp: The name of the column containing the time stamps.
        target: The name of the target column.
    """

    # Class-level catalog: ft.list_primitives() runs once at import time,
    # so all instances share the same primitive lists.
    all_primitives = ft.list_primitives()
    agg_primitives = all_primitives[all_primitives.type == "aggregation"].name.tolist()
    trans_primitives = all_primitives[all_primitives.type == "transform"].name.tolist()

    def __init__(
        self,
        num_features,
        horizon,
        memory,
        column_id,
        time_stamp,
        target,
        allow_lagged_targets=False,
    ):
        self.num_features = num_features
        self.horizon = horizon
        self.memory = memory
        self.column_id = column_id
        self.time_stamp = time_stamp
        self.target = target
        self.allow_lagged_targets = allow_lagged_targets
        self._runtime = None  # set by fit(); surfaced via the runtime property
        self.fitted = False
        self.max_depth = 2
        self.selected_features = []

    def _extract_features(self, data_frame):
        """Run DFS over the rolled data frame and return the extracted features."""
        # Reset and drop the old index so row positions are contiguous.
        data_frame = data_frame.reset_index()
        del data_frame["index"]
        # _roll_data_frame / _make_entity_set are module-level helpers
        # defined outside this chunk.
        rolled = _roll_data_frame(data_frame, self.column_id, self.time_stamp,
                                  self.horizon, self.memory)
        data_frame["_featuretools_index"] = np.arange(data_frame.shape[0])
        entityset = _make_entity_set(data_frame, rolled, self.time_stamp)
        df_extracted, _ = ft.dfs(
            entityset=entityset,
            agg_primitives=self.agg_primitives,
            target_dataframe_name="population",
            max_depth=self.max_depth,
            ignore_columns={
                "peripheral": [
                    self.column_id,
                    "index",
                    "join_key",
                    "_featuretools_join_key",
                    "_featuretools_index",
                ]
            },
        )
        # Zero-fill NaNs in numeric columns.
        # NOTE(review): chained assignment — may silently fail to update in
        # place on newer pandas; confirm and prefer .loc/.fillna if so.
        for col in df_extracted:
            if is_numeric_dtype(df_extracted[col]):
                df_extracted[col][df_extracted[col].isna()] = 0
        return df_extracted

    def _select_features(self, data_frame, target):
        """Keep the num_features columns most correlated with the target."""
        colnames = np.asarray(data_frame.columns)
        print("Selecting the best out of " + str(len(colnames)) + " features...")
        # Restrict to numeric, non-constant columns (zero variance breaks pearsonr).
        colnames = np.asarray([
            col for col in colnames
            if is_numeric_dtype(data_frame[col])
            and np.var(np.asarray(data_frame[col])) > 0.0
        ])
        # Absolute Pearson correlation of each candidate column with the target.
        correlations = np.asarray(
            [np.abs(pearsonr(target, data_frame[col]))[0] for col in colnames])
        correlations[np.isnan(correlations) | np.isinf(correlations)] = 0.0
        # Rank best-first and keep the top num_features.
        self.selected_features = colnames[np.argsort(
            correlations)][::-1][:self.num_features]
        return data_frame[self.selected_features]

    def fit(self, data_frame):
        """
        Fits the DFS on the data frame and returns
        the features for the training set.
        """
        print("featuretools: Trying features...")
        begin = time.time()
        target = np.asarray(data_frame[self.target])
        # Optionally hide the target column from extraction to avoid leakage.
        df_for_extraction = (data_frame if self.allow_lagged_targets else
                             _remove_target_column(data_frame, self.target))
        df_extracted = self._extract_features(df_for_extraction)
        df_selected = self._select_features(df_extracted, target)
        df_selected = _add_original_columns(data_frame, df_selected)
        end = time.time()
        _print_time_taken(begin, end)
        self.fitted = True
        self._runtime = datetime.timedelta(seconds=end - begin)
        return df_selected

    @property
    def runtime(self):
        # Time taken by the last fit(); implicitly None before fitting.
        if self.fitted:
            return self._runtime

    def transform(self, data_frame):
        """
        Extracts features from the data frame using the features
        selected during fit().
        """
        df_for_extraction = (data_frame if self.allow_lagged_targets else
                             _remove_target_column(data_frame, self.target))
        df_extracted = self._extract_features(df_for_extraction)
        df_selected = df_extracted[self.selected_features]
        df_selected = _add_original_columns(data_frame, df_selected)
        return df_selected
def test_not_duplicate_of_default(self):
    """The primitive under test must not reuse a built-in primitive's name."""
    class_name = self.primitive.__name__
    builtin_names = list_primitives()['name'].apply(convert).tolist()
    assert class_name not in builtin_names
agg_primitives=['count', 'mean' ], # specified, otherwise defaults primitives will be used max_depth=1) print(feature_matrix.columns.tolist()) print(feature_matrix.head()) print(feature_defs) print('-----------encode category feature-----------') feature_matrix_enc, feature_enc = ft.encode_features(feature_matrix, feature_defs) print(feature_matrix_enc.columns.tolist()) print(feature_matrix_enc.head()) print(feature_enc) print('-----------list primitives---------------------') print(ft.list_primitives().head()) print('----------custom primitives----------------------') from featuretools.primitives import make_agg_primitive, make_trans_primitive from featuretools.variable_types import Text, Numeric def absolute(column): return abs(column) Absolute = make_trans_primitive(function=absolute, input_types=[Numeric], return_type=Numeric)
def list_primitives():
    """Print the full, untruncated table of available Featuretools primitives.

    Works on both old and new pandas: pandas >= 1.0 rejects -1 as
    ``display.max_colwidth`` with a ValueError and expects None instead,
    so we fall back accordingly (matching the sibling implementation
    elsewhere in this codebase).
    """
    try:
        with pd.option_context('display.max_rows', None,
                               'display.max_columns', None,
                               'display.max_colwidth', -1,
                               'display.width', 1000):
            print(featuretools.list_primitives())
    except ValueError:
        # pandas >= 1.0 removed -1 as "unlimited"; None is the new spelling.
        with pd.option_context('display.max_rows', None,
                               'display.max_columns', None,
                               'display.max_colwidth', None,
                               'display.width', 1000):
            print(featuretools.list_primitives())
# Now we have to tell ft how these entities are related # Uses the parent-child metaphor. Create the relationships then add to the entity set r_c_l = ft.Relationship(es['clients']['client_id'], es['loans']['client_id']) r_l_p = ft.Relationship(es['loans']['loan_id'], es['payments']['loan_id']) es = es.add_relationship(r_c_l) es = es.add_relationship(r_l_p) # Another look at the whole lot es # Ok Now lets make some features. First some primitives # Either aggregations or transformations primitives = ft.list_primitives() pd.options.display.max_colwidth=100 primitives[primitives['type']=='aggregation'].head(20) # And primitives[primitives['type']=='transform'].head(20) # Ok lets do it! Make some features for the clients features, feature_names = ft.dfs(entityset = es, target_entity='clients', agg_primitives=['median','mean','std','max','percent_true','last','time_since_last'], trans_primitives=['years','month','divide']) # Wow! After setup (which could be done in a library call) # we have 408 features with 4 lines of code (140 new ones)
# Course Code: DLBDSMLUSL01
# Automated feature generation

#%% import libraries
import pandas as pd
import featuretools as ft

#%% Remove any limit on the number of columns to display
pd.options.display.max_columns = None

#%% Remove any limit on the number of rows to display
pd.options.display.max_rows = None

#%% Display the list of primitives
print(ft.list_primitives())

#%% create sample data
# Two customers, their orders, and one payment per order.
Customers = pd.DataFrame(
    {'C_ID': ['C1', 'C2'],
     'Name': ['Martin', 'Julia'],
     'Creation_date': ['2018-08-15', '2020-05-05']},
    columns=['C_ID', 'Name', 'Creation_date'])

Orders = pd.DataFrame(
    {'Ord_ID': ['1', '2', '3', '4', '5'],
     'C_ID': ['C1', 'C2', 'C1', 'C1', 'C2']},
    columns=['Ord_ID', 'C_ID'])

Payments = pd.DataFrame(
    {'Ord_ID': ['1', '5', '3', '4', '2'],
     'Price': [500, 200, 300, 100, 900]},
    columns=['Ord_ID', 'Price'])