def clean_features(or_df: pd.DataFrame, features_def, threshold=None, count_nan=False, **kwds):
    """Clean features by dropping uninformative columns.

    If you want to plot features, use AutoSelect instead. Applies, in
    order: remove_highly_null_features, remove_single_value_features and
    remove_highly_correlated_features, keeping ``features_def`` in sync.

    Args:
        or_df: input DataFrame.
        features_def: featuretools feature definitions matching ``or_df``.
        threshold: optional dict with keys ``'remove_null'`` and
            ``'remove_corr'`` (each defaults to 0.95). Defaults to ``{}``.
        count_nan: whether NaN counts as a value for the single-value filter.
        **kwds: forwarded to remove_highly_correlated_features.

    Returns:
        Tuple of (cleaned DataFrame, remaining feature definitions).
        (The original ``-> pd.DataFrame`` annotation was wrong: a 2-tuple
        is returned, so the annotation has been removed.)
    """
    # Fix: `threshold=dict()` was a shared mutable default; build per call.
    if threshold is None:
        threshold = {}
    or_df, features_def = remove_highly_null_features(
        or_df,
        features=features_def,
        pct_null_threshold=threshold.get('remove_null', 0.95))
    or_df, features_def = remove_single_value_features(
        or_df, features=features_def, count_nan_as_value=count_nan)
    or_df, features_def = remove_highly_correlated_features(
        or_df,
        features=features_def,
        pct_corr_threshold=threshold.get('remove_corr', 0.95),
        **kwds)
    return or_df, features_def
def run_dfs(self, max_depth=1, features_only=True, ignore_variables=None,
            reduce_mem=False, reduce_feats=True, trans_primitives=None,
            agg_primitives=None, chunk_size=None, n_jobs=1, **kwargs):
    """Deep Feature Synthesis.

    agg_primitives (list[str or AggregationPrimitive], optional):
        List of Aggregation Feature types to apply.
        Default: ["sum", "std", "max", "skew", "min", "mean",
        "count", "percent_true", "num_unique", "mode"]
        DateTime: ['time_since_last', 'time_since_first', 'trend']
    trans_primitives (list[str or TransformPrimitive], optional):
        List of Transform Feature functions to apply.
        Default: ["day", "year", "month", "weekday", "haversine",
        "num_words", "num_characters"]
    groupby_trans_primitives (list[str or TransformPrimitive], optional):
        list of Transform primitives to make GroupByTransformFeatures with

    Returns feature definitions when ``features_only`` is True, otherwise
    the synthesized feature matrix (optionally pruned / memory-reduced).
    """
    if ignore_variables is None:
        # ignore_variables = [self.target_entity_id, self.index]
        # ignore_variables = ["__id"]  # ignoring a single-value id loses some count features
        ignore_variables = []
    if trans_primitives is None:
        trans_primitives = [
            "year", "month", "day", "hour", "minute", "week", "weekday",
            "is_weekend",
            'time_since_previous',  # diff
            # https://stackoverflow.com/questions/60324672/how-is-time-since-previous-computed-in-featuretools
            Quarter(),
        ]
    _ = ft.dfs(
        entityset=self.es,
        # target entity must have a unique id: either the base es or the
        # unique-id es produced by normalize_entity
        target_entity=self.target_entity_id,
        features_only=features_only,
        max_depth=max_depth,
        ignore_variables={self.entity_id: ignore_variables},
        chunk_size=chunk_size,
        n_jobs=n_jobs,
        verbose=1,
        agg_primitives=agg_primitives,
        trans_primitives=trans_primitives,
        **kwargs)
    if features_only:
        return _
    df_ = _[0].add_prefix(f'{self.entity_id}_').reset_index()
    if reduce_feats:
        cprint("remove_low_information_features")
        df_ = remove_low_information_features(df_)
        cprint("remove_single_value_features")
        df_ = remove_single_value_features(df_, count_nan_as_value=True)
        cprint("remove_duplicate_features")
        dups = duplicate_columns(df_)
        # Fix: positional `axis` argument (`.drop(dups, 1)`) was removed in
        # pandas 2.0; `columns=` is the supported equivalent.
        df_ = df_.drop(columns=dups)
    if reduce_mem:
        df_ = reduce_mem_usage(df_)
    return df_
def _reduce_feats(self, df):
    """Drop low-information, single-value and duplicate columns from *df*."""
    df = remove_low_information_features(df)
    df = remove_single_value_features(df, count_nan_as_value=True)
    # Fix: `df.drop(cols, 1, inplace=True)` used the positional `axis`
    # argument removed in pandas 2.0; `columns=` yields the same frame.
    df = df.drop(columns=duplicate_columns(df))
    return df
def prosperity(countries=None):
    """Predict prosperity scores and per-year dense ranks for the test set.

    Loads a previously trained evalml pipeline, scores the hosted test
    data, ranks predictions within each year, and filters to *countries*.

    Args:
        countries: country names to keep in the result. Defaults to
            ['Chad', 'Togo', 'Zimbabwe', 'Ivory Coast', 'Georgia'].

    Returns:
        (result, metric): predictions with 'prosperity' and
        'rank_prosperity' columns, and the stored metrics table.
    """
    # Fix: the default list was a shared mutable default; build per call.
    if countries is None:
        countries = ['Chad', 'Togo', 'Zimbabwe', 'Ivory Coast', 'Georgia']
    url = 'https://raw.githubusercontent.com/Andrewl7127/UCSD-DataHacks-2021/main/Data/'
    df = pd.read_csv(url + 'merged.csv')
    df = df.drop(['Unnamed: 0'], axis=1)
    metrics = [
        'educ', 'soci', 'heal', 'pers', 'busi', 'econ', 'safe', 'gove', 'envi'
    ]
    ranks = ['rank_' + metric for metric in metrics]
    drop = metrics + ranks + ['year', 'prosperity_score']
    y = df['prosperity_score']
    df = df.drop(drop, axis=1)
    df = remove_low_information_features(df)
    df = remove_highly_null_features(df)
    df = remove_single_value_features(df)
    df = remove_highly_correlated_features(df)
    X = df
    problem_type = 'regression'
    objective = 'auto'
    automl = evalml.automl.AutoMLSearch(problem_type=problem_type,
                                        objective=objective)
    # Training path kept for reference; the saved pipeline is loaded below.
    #automl.search(X,y)
    #best_pipeline = automl.best_pipeline
    #best_pipeline.fit(X,y)
    #best_pipeline.save('prosperity_best_pipeline')
    best_pipeline = automl.load('prosperity_best_pipeline')
    test = pd.read_csv(url + 'test.csv', index_col=0)
    drop = ['year']
    df = test.copy()
    df = df.drop(drop, axis=1)
    df = remove_low_information_features(df)
    df = remove_highly_null_features(df)
    df = remove_single_value_features(df)
    df = remove_highly_correlated_features(df)
    X = df
    predictions = best_pipeline.predict(X)
    result = pd.DataFrame()
    result['prosperity'] = predictions
    df = pd.read_csv(url + 'test.csv')
    temp = df[['country', 'year']]
    # Row-aligned join on positional indexes; the helper key columns that
    # merge() materializes are dropped right after.
    result = pd.merge(left=temp,
                      right=result,
                      how="left",
                      on=[temp.index, result.index])
    result = result.drop(['key_0', 'key_1'], axis=1)
    result['rank_prosperity'] = result.groupby("year")["prosperity"].rank(
        "dense", ascending=False)
    result['rank_prosperity'] = result['rank_prosperity'].astype('int')
    result = result[result['country'].isin(countries)]
    metric = pd.read_csv(
        'https://raw.githubusercontent.com/Andrewl7127/UCSD-DataHacks-2021/main/Metrics/prosperity_metrics.csv'
    )
    return result, metric
def pillar(name='busi', countries=None):
    """Predict a single prosperity pillar and its per-year dense ranks.

    Loads the saved evalml pipeline for *name*, scores the hosted test
    data, ranks predictions within each year, and filters to *countries*.

    Args:
        name: pillar identifier (e.g. 'busi', 'educ', ...) used to pick
            the train/test CSVs, saved pipeline, and metrics file.
        countries: country names to keep in the result. Defaults to
            ['Chad'].

    Returns:
        (result, metric): predictions with `name` and 'rank_<name>'
        columns, and the stored metrics table.
    """
    # Fix: the default list was a shared mutable default; build per call.
    if countries is None:
        countries = ['Chad']
    url = 'https://raw.githubusercontent.com/Andrewl7127/UCSD-DataHacks-2021/main/Data/'
    df = pd.read_csv(url + name + '_train.csv')
    df = df.drop(['Unnamed: 0'], axis=1)
    # Drop any year-derived columns before modeling.
    for i in df.columns:
        if i.find('year') > -1:
            df = df.drop([i], axis=1)
    y = df[name]
    df = df.drop(['rank_' + name, name], axis=1)
    df = remove_low_information_features(df)
    df = remove_highly_null_features(df)
    df = remove_single_value_features(df)
    df = remove_highly_correlated_features(df)
    X = df
    problem_type = 'regression'
    objective = 'auto'
    automl = evalml.automl.AutoMLSearch(problem_type=problem_type,
                                        objective=objective)
    best_pipeline = automl.load(name + '_best_pipeline')
    df = pd.read_csv(url + name + '_test.csv')
    df = df.drop(['Unnamed: 0'], axis=1)
    for i in df.columns:
        if i.find('year') > -1:
            df = df.drop([i], axis=1)
    df = remove_low_information_features(df)
    df = remove_highly_null_features(df)
    df = remove_single_value_features(df)
    df = remove_highly_correlated_features(df)
    predictions = best_pipeline.predict(df)
    result = pd.DataFrame()
    result[name] = predictions
    df = pd.read_csv(url + name + '_test.csv')
    temp = df[['country', 'year']]
    # Row-aligned join on positional indexes; the helper key columns that
    # merge() materializes are dropped right after.
    result = pd.merge(left=temp,
                      right=result,
                      how="left",
                      on=[temp.index, result.index])
    result = result.drop(['key_0', 'key_1'], axis=1)
    result['rank_' + name] = result.groupby("year")[name].rank(
        "dense", ascending=False)
    result['rank_' + name] = result['rank_' + name].astype('int')
    result = result[result['country'].isin(countries)]
    metric = pd.read_csv(
        'https://raw.githubusercontent.com/Andrewl7127/UCSD-DataHacks-2021/main/Metrics/'
        + name + '_metrics.csv')
    return result, metric
def check_single_value(or_df, count_nan=False):
    """Return the column names that remove_single_value_features would drop.

    Args:
        or_df: DataFrame to inspect.
        count_nan: whether NaN counts as a distinct value.

    Returns:
        List (set-ordered) of dropped column names.
    """
    reduced = remove_single_value_features(or_df, count_nan_as_value=count_nan)
    kept = set(reduced.columns.tolist())
    return list(set(or_df.columns.tolist()) - kept)