def test_custom_primitive_multiple_inputs(es):
    def mean_sunday(numeric, datetime):
        '''
        Finds the mean of non-null values of a feature that occurred on Sundays
        '''
        days = pd.DatetimeIndex(datetime).weekday.values
        df = pd.DataFrame({'numeric': numeric, 'time': days})
        return df[df['time'] == 6]['numeric'].mean()

    MeanSunday = make_agg_primitive(function=mean_sunday,
                                    input_types=[Numeric, Datetime],
                                    return_type=Numeric)

    fm, features = ft.dfs(entityset=es,
                          target_entity="sessions",
                          agg_primitives=[MeanSunday],
                          trans_primitives=[])
    mean_sunday_value = pd.Series([None, None, None, 2.5, 7, None])
    iterator = zip(fm["MEAN_SUNDAY(log.value, datetime)"], mean_sunday_value)
    for x, y in iterator:
        assert ((pd.isnull(x) and pd.isnull(y)) or (x == y))

    es.add_interesting_values()
    mean_sunday_value_priority_0 = pd.Series([None, None, None, 2.5, 0, None])
    fm, features = ft.dfs(entityset=es,
                          target_entity="sessions",
                          agg_primitives=[MeanSunday],
                          trans_primitives=[],
                          where_primitives=[MeanSunday])
    where_feat = "MEAN_SUNDAY(log.value, datetime WHERE priority_level = 0)"
    for x, y in zip(fm[where_feat], mean_sunday_value_priority_0):
        assert ((pd.isnull(x) and pd.isnull(y)) or (x == y))
def test_transform_consistency():
    # Create dataframe
    df = pd.DataFrame({'a': [14, 12, 10], 'b': [False, False, True],
                       'b1': [True, True, False], 'b12': [4, 5, 6],
                       'P': [10, 15, 12]})
    es = ft.EntitySet(id='test')
    # Add dataframe to entityset
    es.entity_from_dataframe(entity_id='first', dataframe=df,
                             index='index', make_index=True)

    # Generate features
    feature_defs = ft.dfs(entityset=es, target_entity='first',
                          trans_primitives=['and', 'add', 'or'],
                          features_only=True)

    # Check for correct ordering of features
    assert feature_with_name(feature_defs, 'a')
    assert feature_with_name(feature_defs, 'b')
    assert feature_with_name(feature_defs, 'b1')
    assert feature_with_name(feature_defs, 'b12')
    assert feature_with_name(feature_defs, 'P')
    assert feature_with_name(feature_defs, 'AND(b, b1)')
    assert not feature_with_name(feature_defs, 'AND(b1, b)')  # make sure it doesn't exist the other way
    assert feature_with_name(feature_defs, 'a + P')
    assert feature_with_name(feature_defs, 'b12 + P')
    assert feature_with_name(feature_defs, 'a + b12')
    assert feature_with_name(feature_defs, 'OR(b, b1)')
    assert feature_with_name(feature_defs, 'OR(AND(b, b1), b)')
    assert feature_with_name(feature_defs, 'OR(AND(b, b1), b1)')
def dfs_run(self):
    self.__train_feature, _ = ft.dfs(
        entityset=self.__es,
        target_entity="application_train",
        agg_primitives=[Sum, Std, Max, Min, Median, Count, Skew, PercentTrue, Trend, AvgTimeBetween],
        where_primitives=[Std, Max, Min, Median, Count],
        verbose=True,
        chunk_size=150,  # increase chunk_size to trade memory for run time: more memory use, shorter run time
    )
    self.__train_feature.to_csv(os.path.join(self.__output_path, "train_agg_df.csv"), index=True)
def test_encode_unknown_features():
    # Dataframe with categorical column with "unknown" string
    df = pd.DataFrame({'category': ['unknown', 'b', 'c', 'd', 'e']})

    es = EntitySet('test')
    es.entity_from_dataframe(entity_id='a', dataframe=df, index='index',
                             make_index=True)
    features, feature_defs = dfs(entityset=es, target_entity='a')

    # Specify unknown token for replacement
    features_enc, feature_defs_enc = encode_features(features, feature_defs,
                                                     include_unknown=True)
    assert list(features_enc.columns) == ['category = unknown',
                                          'category = e', 'category = d',
                                          'category = c', 'category = b',
                                          'category is unknown']
def test_pickle_features_with_custom_primitive(es):
    NewMax = make_agg_primitive(
        lambda x: max(x),
        name="NewMax",
        input_types=[Numeric],
        return_type=Numeric,
        description="Calculate means ignoring nan values")

    features_original = ft.dfs(target_entity='sessions', entityset=es,
                               agg_primitives=["Last", "Mean", NewMax],
                               features_only=True)

    assert any([isinstance(feat.primitive, NewMax) for feat in features_original])
    pickle_features_test_helper(asizeof(es), features_original)
def test_seed_multi_output_feature_stacking(es):
    threecommon = NMostCommon(3)
    tc = ft.Feature(es['log']['product_id'],
                    parent_entity=es["sessions"],
                    primitive=threecommon)

    fm, feat = ft.dfs(entityset=es,
                      target_entity="customers",
                      seed_features=[tc],
                      agg_primitives=[NumUnique],
                      trans_primitives=[],
                      max_depth=4)

    for i in range(3):
        f = 'NUM_UNIQUE(sessions.N_MOST_COMMON(log.product_id)[%d])' % i
        assert feature_with_name(feat, f)
def fit(self, X, y=None, **kwargs):
    self.original_cols = X.columns.to_list()

    if self.selection_args is not None:
        assert y is not None, '`y` must be provided for feature selection.'
        self.selection_args['reserved_cols'] = self.original_cols
        self.selection_transformer = FeatureSelectionTransformer(task=self.task, **self.selection_args)

    # self._check_values(X)
    if self.continuous_cols is None:
        self.continuous_cols = column_number_exclude_timedelta(X)
    if self.datetime_cols is None:
        self.datetime_cols = column_all_datetime(X)

    if self.fix_input:
        _mean = X[self.continuous_cols].mean().to_dict()
        _mode = X[self.datetime_cols].mode().to_dict()
        self._imputed_input = {}
        self._merge_dict(self._imputed_input, _mean, _mode)
        self._replace_invalid_values(X, self._imputed_input)

    feature_type_dict = {}
    self._merge_dict(feature_type_dict,
                     {c: variable_types.Numeric for c in self.continuous_cols},
                     {c: variable_types.Datetime for c in self.datetime_cols})

    es = ft.EntitySet(id='es_hypernets_fit')
    es.entity_from_dataframe(entity_id='e_hypernets_ft',
                             dataframe=X,
                             variable_types=feature_type_dict,
                             make_index=True,
                             index=self.ft_index)
    feature_matrix, feature_defs = ft.dfs(entityset=es,
                                          target_entity="e_hypernets_ft",
                                          ignore_variables={"e_hypernets_ft": []},
                                          return_variable_types="all",
                                          trans_primitives=self.trans_primitives,
                                          max_depth=self.max_depth,
                                          features_only=False,
                                          max_features=-1)
    X.pop(self.ft_index)
    self.feature_defs_ = feature_defs

    if self.selection_transformer is not None:
        self.selection_transformer.fit(feature_matrix, y)
        selected_defs = []
        for fea in self.feature_defs_:
            if fea._name in self.selection_transformer.columns_:
                selected_defs.append(fea)
        self.feature_defs_ = selected_defs

    return self
def test_make_transform_multiple_output_features(es):
    def test_time(x):
        times = pd.Series(x)
        units = ["year", "month", "day", "hour", "minute", "second"]
        return [times.apply(lambda x: getattr(x, unit)) for unit in units]

    def gen_feat_names(self):
        subnames = ["Year", "Month", "Day", "Hour", "Minute", "Second"]
        return [
            "Now.%s(%s)" % (subname, self.base_features[0].get_name())
            for subname in subnames
        ]

    TestTime = make_trans_primitive(
        function=test_time,
        input_types=[Datetime],
        return_type=Numeric,
        number_output_features=6,
        cls_attributes={"get_feature_names": gen_feat_names},
    )

    join_time_split = ft.Feature(es["log"]["datetime"], primitive=TestTime)
    alt_features = [
        ft.Feature(es["log"]["datetime"], primitive=Year),
        ft.Feature(es["log"]["datetime"], primitive=Month),
        ft.Feature(es["log"]["datetime"], primitive=Day),
        ft.Feature(es["log"]["datetime"], primitive=Hour),
        ft.Feature(es["log"]["datetime"], primitive=Minute),
        ft.Feature(es["log"]["datetime"], primitive=Second)
    ]
    fm, fl = ft.dfs(entityset=es,
                    target_entity="log",
                    agg_primitives=[],
                    trans_primitives=[TestTime, Year, Month, Day, Hour, Minute, Second, Diff],
                    max_depth=5)

    subnames = join_time_split.get_feature_names()
    altnames = [f.get_name() for f in alt_features]
    for col1, col2 in zip(subnames, altnames):
        assert (fm[col1] == fm[col2]).all()

    for i in range(6):
        f = 'sessions.customers.DIFF(TEST_TIME(date_of_birth)[%d])' % i
        assert feature_with_name(fl, f)
        assert ('DIFF(TEST_TIME(datetime)[%d])' % i) in fl
def test_make_three_most_common(pd_es):
    class NMostCommoner(AggregationPrimitive):
        name = "pd_top3"
        input_types = ([ColumnSchema(semantic_tags={"category"})],)
        return_type = None
        number_output_features = 3

        def get_function(self):
            def pd_top3(x):
                counts = x.value_counts()
                counts = counts[counts > 0]
                array = np.array(counts[:3].index)
                if len(array) < 3:
                    filler = np.full(3 - len(array), np.nan)
                    array = np.append(array, filler)
                return array

            return pd_top3

    fm, features = ft.dfs(
        entityset=pd_es,
        target_dataframe_name="customers",
        instance_ids=[0, 1, 2],
        agg_primitives=[NMostCommoner],
        trans_primitives=[],
    )

    df = fm[["PD_TOP3(log.product_id)[%s]" % i for i in range(3)]]
    assert set(df.iloc[0].values[:2]) == set(
        ["coke zero", "toothpaste"]
    )  # coke zero and toothpaste have same number of occurrences
    assert df.iloc[0].values[2] in [
        "car",
        "brown bag",
    ]  # so just check that the top two match
    assert (
        df.iloc[1]
        .reset_index(drop=True)
        .equals(pd.Series(["coke zero", "Haribo sugar-free gummy bears", np.nan]))
    )
    assert (
        df.iloc[2]
        .reset_index(drop=True)
        .equals(pd.Series(["taco clock", np.nan, np.nan]))
    )
def dfs(self, X=None, target_entity=None, entityset=None, entities=None,
        relationships=None):
    if not entities and not entityset:
        target_entity = 'X'
    else:
        target_entity = target_entity or self.target_entity

    if entityset is None:
        entityset = self._get_entityset(X, target_entity, entities, relationships)

    if self.training_window is not None:
        entityset.add_last_time_indexes()

    cutoff_time = None
    if self.time_index:
        cutoff_time = X[[self.index, self.time_index]]
        cutoff_time = cutoff_time.rename(columns={self.time_index: 'time'})

    self.features = ft.dfs(
        cutoff_time=cutoff_time,
        max_depth=self.max_depth,
        entityset=entityset,
        target_entity=target_entity,
        features_only=True,
        agg_primitives=self.agg_primitives,
        trans_primitives=self.trans_primitives,
        max_features=self.max_features,
        training_window=self.training_window,
        n_jobs=self.n_jobs,
        verbose=self.verbose,
    )

    if self.encode or self.remove_low_information:
        X = ft.calculate_feature_matrix(
            self.features,
            entityset=entityset,
            cutoff_time=cutoff_time,
            training_window=self.training_window,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
        )

        if self.encode:
            X, self.features = ft.encode_features(X, self.features)

        if self.remove_low_information:
            X, self.features = remove_low_information_features(X, self.features)
def _feature_summary_data(self):
    raceuma_df = self.base_df[[
        "RACE_KEY", "UMABAN", "激走指数", "馬スタート指数", "馬出遅率", "IDM", "騎手指数", "テン指数",
        "ペース指数", "上がり指数", "位置指数", "テンF指数", "中間F指数", "終いF指数",
        "コーナー順位3_1", "コーナー順位4_1", "前3F先頭差_1", "後3F先頭差_1", "レース脚質_1",
        "テン指数結果_1", "上がり指数結果_1", "ペース指数結果_1", "レースP指数結果_1", "追込率_1",
        "コーナー順位3_2", "コーナー順位4_2", "前3F先頭差_2", "後3F先頭差_2", "レース脚質_2",
        "テン指数結果_2", "上がり指数結果_2", "ペース指数結果_2", "レースP指数結果_2", "追込率_2",
        "コーナー順位3_3", "コーナー順位4_3", "前3F先頭差_3", "後3F先頭差_3", "レース脚質_3",
        "テン指数結果_3", "上がり指数結果_3", "ペース指数結果_3", "レースP指数結果_3", "追込率_3",
        "コーナー順位3_4", "コーナー順位4_4", "前3F先頭差_4", "後3F先頭差_4", "レース脚質_4",
        "テン指数結果_4", "上がり指数結果_4", "ペース指数結果_4", "レースP指数結果_4", "追込率_4",
        "コーナー順位3_5", "コーナー順位4_5", "前3F先頭差_5", "後3F先頭差_5", "レース脚質_5",
        "テン指数結果_5", "上がり指数結果_5", "ペース指数結果_5", "レースP指数結果_5", "追込率_5"
    ]]
    raceuma_df.loc[:, "RACE_UMA_KEY"] = raceuma_df["RACE_KEY"].astype(str).str.cat(
        raceuma_df["UMABAN"].astype(str))
    raceuma_df.drop("UMABAN", axis=1, inplace=True)

    es = ft.EntitySet(id="race")
    es.entity_from_dataframe(entity_id='race',
                             dataframe=self.ld.race_df[["RACE_KEY", "target_date"]],
                             index="RACE_KEY")
    es.entity_from_dataframe(entity_id='raceuma',
                             dataframe=raceuma_df,
                             index="RACE_UMA_KEY")
    relationship = ft.Relationship(es['race']["RACE_KEY"], es['raceuma']["RACE_KEY"])
    es = es.add_relationship(relationship)
    print(es)

    # aggregation primitives
    aggregation_list = ['mean', 'skew']
    transform_list = []

    # run dfs
    print("run dfs")
    feature_matrix, features_dfs = ft.dfs(entityset=es,
                                          target_entity='race',
                                          agg_primitives=aggregation_list,
                                          trans_primitives=transform_list,
                                          max_depth=2)
    feature_summary_df = pd.merge(feature_matrix, self.ld.race_df,
                                  on=["RACE_KEY", "target_date"])
    print("_create_feature: feature_summary_df", feature_summary_df.shape)
    return feature_summary_df
def generate_features(data, var_types,
                      trans_primitives=["multiply", 'divide', "diff"],
                      N_FEATURES=1000, index_col_name="id"):
    data = data.copy()
    print("-" * 15)
    start_columns = data.columns
    data = data.reset_index()
    data[index_col_name] = data[index_col_name].astype(np.int64)
    N_FEATURES += data.shape[1]

    es = ft.EntitySet(id='players')
    main_entity_id = 'train_players'

    # Entities with a unique index
    es = es.entity_from_dataframe(entity_id=main_entity_id,
                                  dataframe=data,  # dataframe object
                                  index=index_col_name,  # unique index
                                  variable_types=var_types)
    print(es)

    # DFS with specified primitives
    print("Start dfs")
    features, feature_names = ft.dfs(
        entityset=es,
        target_entity=main_entity_id,
        trans_primitives=trans_primitives,
        agg_primitives=[],
        max_depth=1,
        features_only=False,
        verbose=True,
        chunk_size=0.5,
        max_features=N_FEATURES,  # comment it later, computational burden reduction
        n_jobs=-1,
    )
    return features.drop(start_columns, axis=1)
def test_remove_single_value_features():
    same_vals_df = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "all_numeric": [88, 88, 88, 88],
        "with_nan": [1, 1, None, 1],
        "all_nulls": [None, None, None, None],
        "all_categorical": ["a", "a", "a", "a"],
        "all_bools": [True, True, True, True],
        "diff_vals": ["hi", "bye", "bye", "hi"],
    })
    es = ft.EntitySet("data", {"single_vals": (same_vals_df, "id")})
    es["single_vals"].ww.set_types(
        logical_types={
            "all_nulls": "categorical",
            "all_categorical": "categorical",
            "diff_vals": "categorical",
        })
    fm, features = ft.dfs(
        entityset=es,
        target_dataframe_name="single_vals",
        trans_primitives=["is_null"],
        max_depth=2,
    )
    no_params, no_params_features = ft.selection.remove_single_value_features(fm, features)
    no_params_cols = set(no_params.columns)
    assert len(no_params_features) == 2
    assert "IS_NULL(with_nan)" in no_params_cols
    assert "diff_vals" in no_params_cols

    nan_as_value, nan_as_value_features = ft.selection.remove_single_value_features(
        fm, features, count_nan_as_value=True)
    nan_cols = set(nan_as_value.columns)
    assert len(nan_as_value_features) == 3
    assert "IS_NULL(with_nan)" in nan_cols
    assert "diff_vals" in nan_cols
    assert "with_nan" in nan_cols

    without_features_param = ft.selection.remove_single_value_features(fm)
    assert len(no_params.columns) == len(without_features_param.columns)
    for i in range(len(no_params.columns)):
        assert no_params.columns[i] == without_features_param.columns[i]
        assert no_params_features[i].get_name() == without_features_param.columns[i]
def get_train_data(project, train_file, prediction_key, prediction_target,
                   variable_types={}, drop_columns=None):
    # Read the training data
    print("==========Reading the training file {}".format(train_file))
    train_data = pd.read_csv(train_file)
    train_data.head(5)

    print("==========Preparing training labels for target {}".format(prediction_target))
    train_labels = train_data[prediction_target].values
    train_data = train_data.drop(prediction_target, axis=1)

    if drop_columns is not None:
        print("==========dropping columns {}".format(drop_columns))
        train_data = train_data.drop(drop_columns, axis=1)

    print("==========Generating the feature with featuretools")
    es = ft.EntitySet(project)
    entities = get_ft_entities(es=es,
                               project=project,
                               prediction_key=prediction_key,
                               data=train_data,
                               variable_types=variable_types)
    print("==========entities are:")
    print(entities)

    feature_matrix, feature_defs = ft.dfs(entityset=entities, target_entity=project)
    feature_matrix_enc, features_enc = ft.encode_features(feature_matrix, feature_defs)
    print("==========columns are:")
    print(feature_matrix_enc.columns)

    print("==========saving features to {}".format(project))
    ft.save_features(feature_defs, "data/{}/ft_features".format(project))

    return feature_matrix_enc, train_labels
def _featuretools_agg(self, methods=['count', 'max', 'mean']):
    es = ft.EntitySet(id='index')
    es.entity_from_dataframe(entity_id='data',
                             dataframe=self.data,
                             index='index')
    for col in self.cols:
        es.normalize_entity(base_entity_id='data',
                            new_entity_id=col,
                            index=col)
    features, _ = ft.dfs(entityset=es,
                         target_entity='data',
                         agg_primitives=methods,
                         max_depth=2,
                         verbose=1,
                         n_jobs=-1)
    return features
def test_cfm_approximate_correct_ordering():
    trips = {
        'trip_id': [i for i in range(1000)],
        'flight_time': [datetime(1998, 4, 2) for i in range(350)] + [datetime(1997, 4, 3) for i in range(650)],
        'flight_id': [randint(1, 25) for i in range(1000)],
        'trip_duration': [randint(1, 999) for i in range(1000)]
    }
    df = pd.DataFrame.from_dict(trips)
    es = EntitySet('flights')
    es.entity_from_dataframe("trips",
                             dataframe=df,
                             index="trip_id",
                             time_index='flight_time')
    es.normalize_entity(base_entity_id="trips",
                        new_entity_id="flights",
                        index="flight_id",
                        make_time_index=True)
    features = dfs(entityset=es, target_entity='trips', features_only=True)
    flight_features = [feature for feature in features
                       if isinstance(feature, DirectFeature) and
                       isinstance(feature.base_features[0], AggregationPrimitive)]
    property_feature = IdentityFeature(es['trips']['trip_id'])
    # direct_agg_feat = DirectFeature(Sum(es['trips']['trip_duration'],
    #                                     es['flights']),
    #                                 es['trips'])
    cutoff_time = pd.DataFrame.from_dict({'instance_id': df['trip_id'],
                                          'time': df['flight_time']})
    time_feature = IdentityFeature(es['trips']['flight_time'])

    feature_matrix = calculate_feature_matrix(flight_features + [property_feature, time_feature],
                                              cutoff_time_in_index=True,
                                              cutoff_time=cutoff_time)
    feature_matrix.index.names = ['instance', 'time']
    assert(np.all(feature_matrix.reset_index('time').reset_index()[['instance', 'time']].values ==
                  feature_matrix[['trip_id', 'flight_time']].values))

    feature_matrix_2 = calculate_feature_matrix(flight_features + [property_feature, time_feature],
                                                cutoff_time=cutoff_time,
                                                cutoff_time_in_index=True,
                                                approximate=Timedelta(2, 'd'))
    feature_matrix_2.index.names = ['instance', 'time']
    assert(np.all(feature_matrix_2.reset_index('time').reset_index()[['instance', 'time']].values ==
                  feature_matrix_2[['trip_id', 'flight_time']].values))

    # the exact and approximate matrices should agree value-for-value
    for column in feature_matrix:
        for x, y in zip(feature_matrix[column], feature_matrix_2[column]):
            assert ((pd.isnull(x) and pd.isnull(y)) or (x == y))
def make_features(df1, fea_col):
    '''
    dataframe to make feature columns
    '''
    df_fea = df1[fea_col]

    es = ft.EntitySet(id='sales')
    es.entity_from_dataframe(entity_id='bigmart', dataframe=df_fea, index='index')

    # primitives[primitives['type'] == 'transform'].head(100)
    # primitives[primitives['type'] == 'aggregation'].head(10)
    feature_matrix, feature_names = ft.dfs(entityset=es,
                                           target_entity='bigmart',
                                           max_depth=2,
                                           agg_primitives=['mean', 'max', 'std'],
                                           trans_primitives=['less_than'],
                                           verbose=1,
                                           n_jobs=1)
    return feature_matrix, feature_names
def build_card_one_hot():
    """
    Reads in the raw data from train.csv and creates one-hot encodings for the
    feature and date fields.
    :return: Data frame with one-hot encoding
    """
    logger = logging.getLogger(__name__)

    logger.info("Reading in data.")
    df = pd.read_csv('data/raw/train.csv')
    df['first_active_month'] = pd.to_datetime(df['first_active_month'] + "-01")

    logger.info("Creating entity set")
    es_train = ft.EntitySet()
    es_train = es_train.entity_from_dataframe(entity_id='transactions',
                                              dataframe=df,
                                              index='card_id',
                                              time_index="first_active_month",
                                              variable_types=CARD_TYPES)
    feature_matrix, feature_defs = ft.dfs(entityset=es_train,
                                          target_entity="transactions")

    logger.info("Creating one-hot training data")
    train_feature_matrix_enc, features_enc = ft.encode_features(feature_matrix, feature_defs)
    ft.save_features(features_enc, "feature_definitions")
    saved_features = ft.load_features('feature_definitions')

    logger.info("Creating one-hot test data")
    df = pd.read_csv('data/raw/test.csv')
    df['first_active_month'] = pd.to_datetime(df['first_active_month'] + "-01")
    df['target'] = 0
    es_test = ft.EntitySet()
    es_test = es_test.entity_from_dataframe(entity_id='transactions',
                                            dataframe=df,
                                            index='card_id',
                                            time_index="first_active_month",
                                            variable_types=CARD_TYPES)
    test_feature_matrix_enc = ft.calculate_feature_matrix(saved_features, es_test)
    test_feature_matrix_enc.drop(columns='target', inplace=True)

    return train_feature_matrix_enc, test_feature_matrix_enc
def test_encode_features_topn(pd_es):
    topn = Feature(Feature(pd_es['log'].ww['product_id']),
                   parent_dataframe_name='customers',
                   primitive=NMostCommon(n=3))
    features, feature_defs = dfs(entityset=pd_es,
                                 instance_ids=[0, 1, 2],
                                 target_dataframe_name="customers",
                                 agg_primitives=[NMostCommon(n=3)])
    features_enc, feature_defs_enc = encode_features(features,
                                                     feature_defs,
                                                     include_unknown=True)
    assert topn.unique_name() in [feat.unique_name() for feat in feature_defs_enc]
    for name in topn.get_feature_names():
        assert name in features_enc.columns
        assert features_enc.columns.tolist().count(name) == 1
def valid_dfs(
    es,
    aggregations,
    transforms,
    feature_substrings,
    target_dataframe_name="log",
    multi_output=False,
    max_depth=3,
    max_features=-1,
    instance_ids=[0, 1, 2, 3],
):
    if not isinstance(feature_substrings, list):
        feature_substrings = [feature_substrings]

    features = dfs(
        entityset=es,
        target_dataframe_name=target_dataframe_name,
        agg_primitives=aggregations,
        trans_primitives=transforms,
        max_features=max_features,
        max_depth=max_depth,
        features_only=True,
    )
    applicable_features = []
    for feat in features:
        for x in feature_substrings:
            if x in feat.get_name():
                applicable_features.append(feat)
    if len(applicable_features) == 0:
        raise ValueError(
            "No feature names with %s, verify the name attribute "
            "is defined and/or generate_name() is defined to "
            "return %s" % (feature_substrings, feature_substrings)
        )
    df = ft.calculate_feature_matrix(entityset=es,
                                     features=applicable_features,
                                     instance_ids=instance_ids)

    ft.encode_features(df, applicable_features)

    # TODO: check the multi_output shape by checking
    # feature.number_output_features for each feature
    # and comparing it with the matrix shape
    if not multi_output:
        assert len(applicable_features) == df.shape[1]
    return
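# A possible sketch of the shape check described in the TODO above (the helper
# name `_check_output_shape` is hypothetical and not part of the original code):
# sum the number of columns each feature contributes via its
# `number_output_features` attribute and compare against the matrix width.
def _check_output_shape(applicable_features, df):
    expected_columns = sum(f.number_output_features for f in applicable_features)
    assert expected_columns == df.shape[1]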
def _create_feature(self):
    """ Generate features from the merged data """
    print("_create_feature")
    raceuma_df = self.base_df[["RACE_KEY", "UMABAN", "脚質", "距離適性", "父馬産駒連対平均距離", "母父馬産駒連対平均距離", "IDM",
                               "テン指数", "ペース指数", "上がり指数", "位置指数", "IDM結果_1", "テン指数結果_1", "上がり指数結果_1",
                               "ペース指数結果_1", "レースP指数結果_1", "先行率_1", "追込率_1",
                               "fa_1_1", "fa_2_1", "fa_3_1", "fa_4_1", "fa_5_1"]]
    raceuma_df.loc[:, "RACE_UMA_KEY"] = raceuma_df["RACE_KEY"] + raceuma_df["UMABAN"]
    raceuma_df.drop("UMABAN", axis=1, inplace=True)

    # https://qiita.com/daigomiyoshi/items/d6799cc70b2c1d901fb5
    es = ft.EntitySet(id="race")
    es.entity_from_dataframe(entity_id='race',
                             dataframe=self.ld.race_df.drop("NENGAPPI", axis=1),
                             index="RACE_KEY")
    es.entity_from_dataframe(entity_id='raceuma', dataframe=raceuma_df, index="RACE_UMA_KEY")
    relationship = ft.Relationship(es['race']["RACE_KEY"], es['raceuma']["RACE_KEY"])
    es = es.add_relationship(relationship)
    print(es)

    # aggregation primitives
    aggregation_list = ['min', 'max', 'mean', 'skew', 'percent_true']
    transform_list = []

    # run dfs
    print("run dfs")
    feature_matrix, features_dfs = ft.dfs(entityset=es,
                                          target_entity='race',
                                          agg_primitives=aggregation_list,
                                          trans_primitives=transform_list,
                                          max_depth=2)
    print("_create_feature: feature_matrix", feature_matrix.shape)

    # Rows for the predicted first favorite (基準人気順位 == 1)
    ninki_df = self.base_df.query("基準人気順位==1")[["RACE_KEY", "脚質", "距離適性", "上昇度", "激走指数", "蹄コード", "見習い区分",
                                                 "枠番", "総合印", "IDM印", "情報印", "騎手印", "厩舎印", "調教印", "激走印", "展開記号",
                                                 "輸送区分", "騎手期待単勝率", "騎手期待3着内率", "激走タイプ", "休養理由分類コード",
                                                 "芝ダ障害フラグ", "距離フラグ", "クラスフラグ", "転厩フラグ", "去勢フラグ", "乗替フラグ",
                                                 "放牧先ランク", "厩舎ランク", "調教量評価", "仕上指数変化", "調教評価", "IDM", "騎手指数",
                                                 "情報指数", "総合指数", "人気指数", "調教指数", "厩舎指数", "テン指数", "ペース指数",
                                                 "上がり指数", "位置指数", "追切指数", "仕上指数", "IDM結果_1", "IDM結果_2"
                                                 ]].add_prefix("人気_").rename(columns={"人気_RACE_KEY": "RACE_KEY"})
    # Rows for the horse predicted to lead the race (展開記号 == '1')
    nige_df = self.base_df.query("展開記号=='1'")[["RACE_KEY", "脚質", "距離適性", "上昇度", "激走指数", "蹄コード", "見習い区分",
                                               "枠番", "総合印", "IDM印", "基準人気順位", "輸送区分", "激走タイプ", "休養理由分類コード",
                                               "芝ダ障害フラグ", "距離フラグ", "クラスフラグ", "転厩フラグ", "去勢フラグ", "乗替フラグ",
                                               "IDM", "騎手指数", "テン指数", "ペース指数", "上がり指数", "位置指数", "追切指数", "仕上指数",
                                               "斤量_1", "テン指数結果_1", "上がり指数結果_1", "ペース指数結果_1", "レースP指数結果_1",
                                               "斤量_2", "テン指数結果_2", "上がり指数結果_2", "ペース指数結果_2", "レースP指数結果_2",
                                               "先行率_1", "先行率_2"]].add_prefix("逃げ_").rename(columns={"逃げ_RACE_KEY": "RACE_KEY"})
    # Rows for the horse predicted to have the fastest closing leg (展開記号 == '2')
    agari_df = self.base_df.query("展開記号=='2'")[["RACE_KEY", "脚質", "距離適性", "上昇度", "激走指数", "蹄コード", "見習い区分",
                                               "枠番", "総合印", "IDM印", "基準人気順位", "輸送区分", "激走タイプ", "休養理由分類コード",
                                               "芝ダ障害フラグ", "距離フラグ", "クラスフラグ", "転厩フラグ", "去勢フラグ", "乗替フラグ",
                                               "IDM", "騎手指数", "テン指数", "ペース指数", "上がり指数", "位置指数", "追切指数", "仕上指数",
                                               "斤量_1", "テン指数結果_1", "上がり指数結果_1", "ペース指数結果_1", "レースP指数結果_1",
                                               "斤量_2", "テン指数結果_2", "上がり指数結果_2", "ペース指数結果_2", "レースP指数結果_2",
                                               "先行率_1", "先行率_2"]].add_prefix("上り_").rename(columns={"上り_RACE_KEY": "RACE_KEY"})

    self.base_df = pd.merge(feature_matrix, nige_df, on="RACE_KEY", how="left")
    self.base_df = pd.merge(self.base_df, agari_df, on="RACE_KEY", how="left")
    self.base_df = pd.merge(self.base_df, ninki_df, on="RACE_KEY")
    self.base_df = pd.merge(self.base_df, self.ld.race_df[["RACE_KEY", "NENGAPPI"]], on="RACE_KEY")
def test_time_since_primitive_matches_all_datetime_types(es):
    if ks and any(isinstance(e.df, ks.DataFrame) for e in es.entities):
        pytest.xfail('TimeSince transform primitive is incompatible with Koalas')
    fm, fl = ft.dfs(target_entity="customers",
                    entityset=es,
                    trans_primitives=[TimeSince],
                    agg_primitives=[],
                    max_depth=1)

    customers_datetime_vars = [
        id for id, t in es['customers'].variable_types.items()
        if issubclass(t, Datetime)
    ]
    expected_names = [f"TIME_SINCE({v})" for v in customers_datetime_vars]
    for name in expected_names:
        assert name in fm.columns
def test_transform_subset(X_y_binary, X_y_multi, X_y_regression):
    datasets = locals()
    for dataset in datasets.values():
        X, y = dataset
        X_pd = pd.DataFrame(X)
        X_pd.columns = X_pd.columns.astype(str)
        X_fit = X_pd.iloc[:len(X) // 3]
        X_transform = X_pd.iloc[len(X) // 3:]

        es = ft.EntitySet()
        es = es.entity_from_dataframe(entity_id="X",
                                      dataframe=X_transform,
                                      index='index',
                                      make_index=True)
        feature_matrix, features = ft.dfs(entityset=es, target_entity="X")

        feature = DFSTransformer()
        feature.fit(X_fit)
        X_t = feature.transform(X_transform)
        assert_frame_equal(feature_matrix, X_t.to_dataframe())
def dfs_run(self):
    self.__feature_dataframe, _ = ft.dfs(
        entityset=self.__es,
        target_entity="application_train",
        agg_primitives=[ft.primitives.aggregation_primitives.Sum,
                        ft.primitives.aggregation_primitives.Std,
                        ft.primitives.aggregation_primitives.Max,
                        ft.primitives.aggregation_primitives.Min,
                        ft.primitives.aggregation_primitives.Mean,
                        ft.primitives.aggregation_primitives.Count,
                        ft.primitives.aggregation_primitives.NUnique,
                        ft.primitives.aggregation_primitives.Mode],
        trans_primitives=[],
        verbose=True,
        chunk_size=110  # increase chunk_size to trade memory for run time: more memory use, shorter run time
    )
    self.__feature_dataframe.to_csv(os.path.join(self.__output_path, self.__output_file_name), index=False)
def test_pickle_features(es):
    features_original = ft.dfs(target_entity='sessions', entityset=es, features_only=True)

    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')

    ft.save_features(features_original, filepath)
    features_deserialized = ft.load_features(filepath)
    for feat_1, feat_2 in zip(features_original, features_deserialized):
        assert feat_1.unique_name() == feat_2.unique_name()
        assert feat_1.entityset == feat_2.entityset

    # file is smaller than entityset in memory
    assert os.path.getsize(filepath) < asizeof(es)

    os.remove(filepath)
def __init__(self, data):
    es = ft.EntitySet("transactions")
    es = es.entity_from_dataframe(entity_id='entities_transactions',
                                  dataframe=data,
                                  index='index_col')
    es.normalize_entity(base_entity_id='entities_transactions',
                        new_entity_id='origin',
                        index='type')
    fm, features = ft.dfs(entityset=es, target_entity='entities_transactions')
    self.feature_matrix = fm
    self.features = features
    return
def fit(self, X, y=None):
    """Fits the DFSTransformer Transformer component.

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.array): The input data to transform, of shape [n_samples, n_features]
        y (ww.DataColumn, pd.Series, np.ndarray, optional): The target training data of length [n_samples]

    Returns:
        self
    """
    X = _convert_to_woodwork_structure(X)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    X.columns = X.columns.astype(str)
    es = self._make_entity_set(X)
    self.features = dfs(entityset=es, target_entity='X', features_only=True)
    return self
def engineer_features_uk_retail(entities, relationships, label_times, training_window):
    trans_primitives = [Minute, Hour, Day, Week, Month, Weekday, Weekend]
    es = ft.EntitySet("entityset", entities=entities, relationships=relationships)
    es.add_last_time_indexes()
    feature_matrix, features = ft.dfs(entityset=es,
                                      target_entity="customers",
                                      trans_primitives=trans_primitives,
                                      agg_primitives=[Mean, Max, Std],
                                      cutoff_time=label_times[["CustomerID", "cutoff_time"]],
                                      training_window=training_window)
    feature_matrix.drop("Country", axis=1, inplace=True)
    feature_matrix = feature_matrix.sort_index()
    return feature_matrix
def test_time_since_primitive_matches_all_datetime_types(es):
    if es.dataframe_type == Library.KOALAS.value:
        pytest.xfail('TimeSince transform primitive is incompatible with Koalas')
    fm, fl = ft.dfs(target_dataframe_name="customers",
                    entityset=es,
                    trans_primitives=[TimeSince],
                    agg_primitives=[],
                    max_depth=1)

    customers_datetime_cols = [
        id for id, t in es['customers'].ww.logical_types.items()
        if isinstance(t, Datetime)
    ]
    expected_names = [f"TIME_SINCE({v})" for v in customers_datetime_cols]
    for name in expected_names:
        assert name in fm.columns
def test_make_transform_multiple_output_features(es):
    def test_f(x):
        times = pd.Series(x)
        units = ["year", "month", "day", "hour", "minute", "second"]
        return [times.apply(lambda x: getattr(x, unit)) for unit in units]

    def gen_feat_names(self):
        subnames = ["Year", "Month", "Day", "Hour", "Minute", "Second"]
        return [
            "Now.%s(%s)" % (subname, self.base_features[0].get_name())
            for subname in subnames
        ]

    TestTime = make_trans_primitive(
        function=test_f,
        input_types=[Datetime],
        return_type=Numeric,
        number_output_features=6,
        cls_attributes={"get_feature_names": gen_feat_names},
    )

    join_time_split = ft.Feature(es["log"]["datetime"], primitive=TestTime)
    alt_features = [
        ft.Feature(es["log"]["datetime"], primitive=Year),
        ft.Feature(es["log"]["datetime"], primitive=Month),
        ft.Feature(es["log"]["datetime"], primitive=Day),
        ft.Feature(es["log"]["datetime"], primitive=Hour),
        ft.Feature(es["log"]["datetime"], primitive=Minute),
        ft.Feature(es["log"]["datetime"], primitive=Second)
    ]
    fm, fl = ft.dfs(
        entityset=es,
        target_entity="log",
        trans_primitives=[TestTime, Year, Month, Day, Hour, Minute, Second])

    subnames = join_time_split.get_feature_names()
    altnames = [f.get_name() for f in alt_features]
    for col1, col2 in zip(subnames, altnames):
        assert (fm[col1] == fm[col2]).all()

    # check no feature stacked on new primitive
    for feature in fl:
        for base_feature in feature.base_features:
            assert base_feature.hash() != join_time_split.hash()
def test_remove_single_value_features():
    same_vals_df = pd.DataFrame({
        'id': [0, 1, 2, 3],
        'all_numeric': [88, 88, 88, 88],
        'with_nan': [1, 1, None, 1],
        'all_nulls': [None, None, None, None],
        'all_categorical': ['a', 'a', 'a', 'a'],
        'all_bools': [True, True, True, True],
        'diff_vals': ['hi', 'bye', 'bye', 'hi']
    })
    es = ft.EntitySet("data", {'single_vals': (same_vals_df, 'id')})
    es['single_vals'].ww.set_types(
        logical_types={
            'all_nulls': 'categorical',
            'all_categorical': 'categorical',
            'diff_vals': 'categorical'
        })
    fm, features = ft.dfs(entityset=es,
                          target_dataframe_name="single_vals",
                          trans_primitives=['is_null'],
                          max_depth=2)
    no_params, no_params_features = ft.selection.remove_single_value_features(fm, features)
    no_params_cols = set(no_params.columns)
    assert len(no_params_features) == 2
    assert 'IS_NULL(with_nan)' in no_params_cols
    assert 'diff_vals' in no_params_cols

    nan_as_value, nan_as_value_features = ft.selection.remove_single_value_features(
        fm, features, count_nan_as_value=True)
    nan_cols = set(nan_as_value.columns)
    assert len(nan_as_value_features) == 3
    assert 'IS_NULL(with_nan)' in nan_cols
    assert 'diff_vals' in nan_cols
    assert 'with_nan' in nan_cols

    without_features_param = ft.selection.remove_single_value_features(fm)
    assert len(no_params.columns) == len(without_features_param.columns)
    for i in range(len(no_params.columns)):
        assert no_params.columns[i] == without_features_param.columns[i]
        assert no_params_features[i].get_name() == without_features_param.columns[i]
def dfs_run(es, output_path):
    """
    AvgTimeBetween: average time gap between successive events, equivalent to Mean(Diff(time_index))
    Trend: slope of the linear trend
    PercentTrue: share of True values in a Boolean feature
    where_primitives: only applied to columns that have interesting_values
    """
    train_feature, _ = ft.dfs(
        entityset=es,
        target_entity="application_train",
        agg_primitives=[Sum, Std, Max, Min, Median, Count, PercentTrue, Trend, AvgTimeBetween, Skew],
        where_primitives=[Std, Max, Min, Median, Count, Skew],
        verbose=True,
        chunk_size=70,  # increase chunk_size to trade memory for run time: more memory use, shorter run time
    )
    train_feature.to_csv(os.path.join(output_path, "train_pre_agg_0-5.csv"), index=True)
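# The docstring above notes that where_primitives only act on columns that have
# interesting values. A minimal, self-contained sketch of that mechanic on a toy
# entityset (the dataframes and column names below are hypothetical, not part of
# this project):
import pandas as pd
import featuretools as ft

def where_primitives_sketch():
    log_df = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "session_id": [1, 1, 2, 2],
        "priority": ["high", "low", "high", "high"],
        "value": [10, 20, 30, 40],
    })
    sessions_df = pd.DataFrame({"session_id": [1, 2]})

    es = ft.EntitySet(id="toy")
    es.entity_from_dataframe(entity_id="log", dataframe=log_df, index="id")
    es.entity_from_dataframe(entity_id="sessions", dataframe=sessions_df, index="session_id")
    es.add_relationship(ft.Relationship(es["sessions"]["session_id"], es["log"]["session_id"]))

    # Mark low-cardinality categorical values as "interesting" so DFS can build
    # conditional features such as SUM(log.value WHERE priority = high).
    es.add_interesting_values()

    fm, _ = ft.dfs(entityset=es, target_entity="sessions",
                   agg_primitives=["sum"], where_primitives=["sum"],
                   trans_primitives=[])
    return fm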
def test_deserialize_features_s3(es, url, profile_name):
    agg_primitives = [Sum, Std, Max, Skew, Min, Mean, Count, PercentTrue, NumUnique, Mode]
    trans_primitives = [Day, Year, Month, Weekday, Haversine, NumWords, NumCharacters]
    features_original = sorted(ft.dfs(target_entity='sessions',
                                      entityset=es,
                                      features_only=True,
                                      agg_primitives=agg_primitives,
                                      trans_primitives=trans_primitives),
                               key=lambda x: x.unique_name())
    features_deserialized = sorted(ft.load_features(url, profile_name=profile_name),
                                   key=lambda x: x.unique_name())
    assert_features(features_original, features_deserialized)
def get_feats(self, df):
    self.es.entity_from_dataframe('df', df.copy(), index='ID')

    # Split each categorical column into its own table, treating the category values as IDs.
    # This is only single-key aggregation.
    print("Splitting each categorical column into a new table keyed by its values")
    for v in self.es['df'].variables:
        if v.dtype == 'categorical':
            self.es.normalize_entity('df', f'df_{v.name}', v.name)
    # TODO: multi-key aggregation (if not supported, first combine the keys into a single key),
    # or take subset combinations of the important categories;
    # crossing multiple categories gives finer-grained group statistics.

    self.es.plot()
    df_feats, _ = ft.dfs(entityset=self.es,
                         target_entity='df',
                         verbose=1,
                         max_depth=3,
                         n_jobs=3)
    return df_feats
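# A possible sketch of the multi-key idea from the TODO above: combine two
# categorical columns into a single composite key before normalize_entity, so
# DFS aggregates per (cat_a, cat_b) group. The dataframe and column names here
# (cat_a, cat_b, value) are hypothetical, not from this project.
import pandas as pd
import featuretools as ft

def composite_key_sketch():
    df = pd.DataFrame({
        "ID": [0, 1, 2, 3],
        "cat_a": ["x", "x", "y", "y"],
        "cat_b": ["p", "q", "p", "q"],
        "value": [1.0, 2.0, 3.0, 4.0],
    })
    # Build a single composite key from the two categorical columns.
    df["cat_a_b"] = df["cat_a"].astype(str) + "_" + df["cat_b"].astype(str)

    es = ft.EntitySet(id="toy")
    es.entity_from_dataframe(entity_id="df", dataframe=df, index="ID")
    # Normalizing on the composite key yields per-(cat_a, cat_b) aggregations.
    es.normalize_entity("df", "df_cat_a_b", "cat_a_b")

    feats, _ = ft.dfs(entityset=es, target_entity="df",
                      agg_primitives=["mean"], trans_primitives=[], max_depth=2)
    return feats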