class FeatureUnion_df(TransformerMixin, BaseEstimator):
    """FeatureUnion wrapper whose ``transform`` returns a pandas DataFrame.

    The column order follows the concatenation done by ``FeatureUnion``.

    Parameters
    ----------
    transformer_list : list of (name, Pipeline) tuples
        Pipelines to run side by side. The last step of every pipeline must
        expose ``get_features_name()``; its result labels the output columns.
    n_jobs, transformer_weights, verbose
        Forwarded unchanged to the wrapped ``FeatureUnion``.
    """

    def __init__(self, transformer_list, n_jobs=None,
                 transformer_weights=None, verbose=False):
        # Kept as plain attributes: these are necessary to work inside of
        # GridSearch or similar.
        self.transformer_list = transformer_list
        self.n_jobs = n_jobs
        self.transformer_weights = transformer_weights
        self.verbose = verbose
        # Options are passed by keyword: scikit-learn declares every
        # FeatureUnion argument after ``transformer_list`` as keyword-only,
        # so the positional form raises TypeError on current releases.
        # NOTE(review): parameters changed later through set_params() on this
        # wrapper are presumably not mirrored onto self.feat_un -- confirm.
        self.feat_un = FeatureUnion(
            self.transformer_list,
            n_jobs=self.n_jobs,
            transformer_weights=self.transformer_weights,
            verbose=self.verbose,
        )

    def fit(self, X, y=None):
        """Fit the wrapped FeatureUnion on ``X`` (and ``y``) and return self."""
        self.feat_un.fit(X, y)
        return self

    def transform(self, X, y=None):
        """Transform ``X`` and return the result as a DataFrame.

        The frame reuses ``X.index``; column labels are collected, in
        ``transformer_list`` order, from the last step of each pipeline.
        """
        values = self.feat_un.transform(X)
        columns = []
        for entry in self.transformer_list:
            # entry is (name, pipeline, ...); the final pipeline step knows
            # the names of the features it emits.
            last_step = entry[1].steps[-1][1]
            columns.extend(last_step.get_features_name())
        return pd.DataFrame(values, index=X.index, columns=columns)

    def get_params(self, deep=True):
        """Return the wrapped FeatureUnion's parameters.

        Delegating is necessary to behave well in GridSearch.
        """
        return self.feat_un.get_params(deep=deep)
class DataCleaner(object):
    """Impute, scale and one-hot encode a DataFrame as described by a metadata CSV.

    The metadata file must contain a ``name`` column (feature name) and a
    ``datatype`` column whose values ``'numeric'``, ``'bool'`` and
    ``'categorical'`` decide which sub-pipeline handles each feature.

    Parameters
    ----------
    path2meta : path or buffer accepted by ``pandas.read_csv``
        Location of the metadata CSV.
    verbose : int, default 0
        Forwarded to every ``Imputer``.
    """

    def __init__(self, path2meta, verbose=0):
        meta_data_df = pd.read_csv(path2meta)

        self.numeric_feat_list = self._features_of_type(meta_data_df, 'numeric')
        self.bool_feat_list = self._features_of_type(meta_data_df, 'bool')
        self.categorical_feat_list = self._features_of_type(
            meta_data_df, 'categorical')

        numeric_pipe = Pipeline([
            ('selector', ItemSelector(key_list=self.numeric_feat_list)),
            ('Imputer', Imputer(strategy='mean', verbose=verbose)),
            ('scalar', StandardScaler()),
        ])
        bool_pipe = Pipeline([
            ('selector', ItemSelector(key_list=self.bool_feat_list)),
            ('Imputer', Imputer(strategy='most_frequent', verbose=verbose)),
            ('scalar', StandardScaler()),
        ])
        cat_pipe = Pipeline([
            ('selector', ItemSelector(key_list=self.categorical_feat_list)),
            ('Imputer', Imputer(strategy='most_frequent', verbose=verbose)),
            ('encoder', OneHotEncoder()),
            ('scalar', StandardScaler()),
        ])
        # The union order (numeric, bool, categorical) is what the column
        # name lookup below relies on.
        self.CleaningPipeline = FeatureUnion(transformer_list=[
            ('numeric_pipe', numeric_pipe),
            ('bool_pipe', bool_pipe),
            ('cat_pipe', cat_pipe),
        ])

    @staticmethod
    def _features_of_type(meta_data_df, datatype):
        """Return the ``name`` values of all metadata rows with this ``datatype``."""
        return list(meta_data_df[meta_data_df['datatype'] == datatype]['name'])

    def get_output_col_names(self):
        """Get the column names in order of the features post cleaning.

        Raises
        ------
        ValueError
            If the one-hot encoder has not been fitted yet.
        """
        ohe = self.CleaningPipeline.get_params()['cat_pipe__encoder']
        # NOTE(review): ``is_fitted`` and ``get_new_column_names`` are expected
        # on the OneHotEncoder in scope here; its definition is not visible
        # in this chunk -- confirm.
        if not ohe.is_fitted:
            raise ValueError(
                "Cleaner must be fitted before new column names can be retrieved"
            )
        new_cat_feat_list = ohe.get_new_column_names(self.categorical_feat_list)
        return self.numeric_feat_list + self.bool_feat_list + new_cat_feat_list

    def get_output_col_neams(self):
        """Misspelled legacy name, kept so existing callers keep working.

        Delegates to ``get_output_col_names``.
        """
        return self.get_output_col_names()

    def get_clean_dataframe(self, input_df):
        """Fit the cleaning pipeline on ``input_df`` and return the cleaned DataFrame.

        NOTE(review): the result gets a fresh default index rather than
        ``input_df.index`` -- confirm whether callers rely on that.
        """
        cleaned = self.CleaningPipeline.fit_transform(input_df)
        return pd.DataFrame(cleaned, columns=self.get_output_col_names())
class FeatureUnionDataFrame(TransformerMixin):
    """FeatureUnion wrapper whose ``transform`` returns a pandas DataFrame.

    All constructor arguments are forwarded to ``FeatureUnion``; the output
    columns are labelled with the union's feature names.
    """

    def __init__(self, *args, **kwargs):
        self.fu = FeatureUnion(*args, **kwargs)

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped FeatureUnion and return self."""
        self.fu.fit(X, y, **kwargs)
        return self

    def transform(self, X, y=None, **fit_params):
        """Transform ``X`` and return the result as a DataFrame.

        ``y`` and ``fit_params`` are accepted for signature compatibility
        and ignored.
        """
        values = self.fu.transform(X)
        return pd.DataFrame(values, columns=self._feature_names())

    def _feature_names(self):
        """Return the union's feature names on old and new scikit-learn alike."""
        # Prefer the original accessor so existing output labels do not
        # change; fall back to the newer one where the old was removed.
        getter = getattr(self.fu, "get_feature_names", None)
        if getter is None:
            getter = self.fu.get_feature_names_out
        return getter()

    def get_feature_names(self):
        """Return the feature names produced by the wrapped FeatureUnion."""
        return self._feature_names()

    def set_params(self, **kwargs):
        """Set parameters on the wrapped FeatureUnion and return self.

        Returning ``self`` follows the scikit-learn estimator contract, so
        ``est = est.set_params(...)`` and chained calls keep working.
        """
        self.fu.set_params(**kwargs)
        return self

    def get_params(self, deep=False):
        """Return the wrapped FeatureUnion's parameters."""
        return self.fu.get_params(deep)
class FeatureUnioner(DfTransformer):
    """
    Joins all dataframes coming from multiple transformers into one dataframe

    Parameters
    ----------
    transformer_list : list of (name, Pipeline) tuples
        Pipelines to run side by side. The last step of every pipeline must
        expose ``get_feature_names()``; its result labels the output columns.
    n_jobs : int, default -1
        Forwarded to the wrapped ``FeatureUnion``.
    """

    def __init__(self, transformer_list, n_jobs=-1):
        self.name = "FeatureUnioner"
        super().log_start(self.name)
        self.transformer_list = transformer_list
        self.n_jobs = n_jobs
        # n_jobs is passed by keyword: scikit-learn declares it keyword-only,
        # so the positional form raises TypeError on current releases.
        self.feature_union = FeatureUnion(self.transformer_list,
                                          n_jobs=self.n_jobs)
        self.columns = []

    def fit(self, X, y=None):
        """Fit the wrapped FeatureUnion on ``X`` and ``y`` and return self."""
        # y is forwarded so transformers that need the target can be fitted.
        self.feature_union.fit(X, y)
        return self

    def transform(self, X, y=None):
        """Transform ``X`` and return a DataFrame indexed like ``X``."""
        values = self.feature_union.transform(X)
        self.concat_df_columns()
        # The index comes from the input frame, not from the transformed
        # values.
        X_transform = pd.DataFrame(values, index=X.index, columns=self.columns)
        super().log_end(self.name)
        return X_transform

    def concat_df_columns(self):
        """Rebuild ``self.columns`` from the last step of every pipeline."""
        # Start from an empty list every time so repeated transform() calls
        # do not keep appending the same names.
        names = []
        for transformer in self.transformer_list:
            names.extend(transformer[1].steps[-1][1].get_feature_names())
        self.columns = names

    def get_params(self, deep=True):
        """ used for gridsearch """
        return self.feature_union.get_params(deep=deep)