def fit(self, df, y=None, **fit_params):
    logger.log('*fit* Filter columns ({}) {}'.format(
        self.__class__.__name__, df.shape), new_level=True)
    if df.empty:
        self.cols_to_keep = []
    else:
        self.cols_to_keep = self.get_columns_to_keep(df, y, **fit_params)
    logger.end_log_level()
    return self
def transform(self, df):
    logger.log('Segment df {}'.format(df.shape), new_level=True)
    logger.log('Get Segments')
    df_segments = self.__segment(df)
    logger.log('Apply n={} Segments to df.shape = {}'.format(
        df_segments.shape[0], df.shape))
    out_df = apply_segments(df, df_segments)
    logger.end_log_level()
    return out_df
def get_etl_info_df(self, components):
    """Each component must have been extracted with save_steps=True."""
    all_etl_info = []
    for comp in components:
        logger.log(comp, new_level=True)
        etl_info = self.get_etl_info(comp)
        all_etl_info.append(etl_info)
        logger.end_log_level()
    return pd.DataFrame(all_etl_info)
def transform(self, X):
    logger.log('Load data from component: {}'.format(self.component.upper()),
               new_level=True)
    # Accept a DataFrame/Series, a (Multi)Index, or a plain list of ids.
    if isinstance(X, (pd.DataFrame, pd.Series)):
        X = X.index
    if isinstance(X, pd.Index):
        ids = X.get_level_values(column_names.ID).unique().tolist()
    else:
        ids = X
    df_component = self.etl_manager.open_df(self.component, ids=ids)
    logger.end_log_level()
    return df_component
def smart_join(hdf5_fname, paths, joined_path, ids, chunksize=5000,
               need_deconstruct=True, hdf5_fname_for_join=None, overwrite=True):
    logger.log('Smart join: n={}, {}'.format(len(ids), paths), new_level=True)
    if hdf5_fname_for_join is None:
        hdf5_fname_for_join = hdf5_fname

    store = pd.HDFStore(hdf5_fname_for_join)
    if joined_path in store:
        if overwrite:
            del store[joined_path]
        else:
            store.close()
            logger.end_log_level()
            return hdf5_fname_for_join
    store.close()  # done with the existence check; appends below reopen the file

    # Sort ids; this should speed up the where clauses and selects.
    ids = sorted(ids)

    # Chunked join: select one slice of ids at a time from every path,
    # outer-join the slices side by side, and append to the output key.
    logger.log('JOINING dataframes', new_level=True)
    for ix_start in range(0, len(ids), chunksize):
        ix_end = min(ix_start + chunksize, len(ids))
        id_slice = ids[ix_start:ix_end]
        where = '{id_col} in {id_list}'.format(id_col=column_names.ID,
                                               id_list=id_slice)
        logger.log('Slice & Join: {} --> {}, n={}'.format(
            id_slice[0], id_slice[-1], len(id_slice)), new_level=True)
        df_slice = None
        for path in paths:
            try:
                logger.log(path)
                if need_deconstruct:
                    slice_to_add = read_and_reconstruct(hdf5_fname, path, where=where)
                else:
                    slice_to_add = pd.read_hdf(hdf5_fname, path, where=where)
            except KeyError as err:
                logger.log(end_prev=True, start=False)
                print err
                continue
            if df_slice is None:
                df_slice = slice_to_add
            else:
                df_slice = df_slice.join(slice_to_add, how='outer')
            del slice_to_add
        logger.end_log_level()

        logger.log('Append slice')
        if need_deconstruct:
            deconstruct_and_write(df_slice, hdf5_fname_for_join, joined_path,
                                  append=True)
        else:
            df_slice.to_hdf(hdf5_fname_for_join, joined_path, append=True,
                            format='t')
        del df_slice
    logger.end_log_level()
    logger.end_log_level()
    return hdf5_fname_for_join
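# A minimal sketch of the chunked select-and-join pattern smart_join uses,
# in plain pandas. Everything here is hypothetical illustration (the store
# name 'demo.h5', the keys 'labs'/'vitals', and the 'id' index are not part
# of this module); it assumes the frames were written with format='table' so
# that where-clauses work.
def _smart_join_sketch():
    import pandas as pd
    labs = pd.DataFrame({'wbc': [4.2, 9.1]}, index=pd.Index([1, 2], name='id'))
    vitals = pd.DataFrame({'hr': [88.0, 72.0]}, index=pd.Index([2, 3], name='id'))
    labs.to_hdf('demo.h5', 'labs', format='table')
    vitals.to_hdf('demo.h5', 'vitals', format='table')
    # One chunk of ids: select only matching rows from each key, outer-join
    # the slices side by side, then append the joined slice to the output key.
    id_slice = [1, 2, 3]
    where = 'id in {}'.format(id_slice)
    df_slice = pd.read_hdf('demo.h5', 'labs', where=where).join(
        pd.read_hdf('demo.h5', 'vitals', where=where), how='outer')
    df_slice.to_hdf('demo.h5', 'joined', append=True, format='t')
    return df_slice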
def do_union(self, X, is_fit, y=None, **fit_params):
    logger.log('Begin union for {} transformers'.format(len(self.featurizers)),
               new_level=True)
    df_features = None
    for f in self.featurizers:
        logger.log(f[0], new_level=True)
        if is_fit:
            # Forward y and any fit params to the featurizer.
            df_ft = f[1].fit_transform(X, y, **fit_params)
        else:
            df_ft = f[1].transform(X)
        if self.add_name_level:
            df_ft = utils.add_same_val_index_level(
                df_ft, level_val=f[0], level_name=FEATURE_LEVEL, axis=1)
        if df_features is None:
            df_features = df_ft
        else:
            df_features = df_features.join(df_ft, how='outer')
        del df_ft
        logger.end_log_level()
    logger.end_log_level()
    return df_features
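# For illustration only: the standard-pandas way to prepend a constant top
# level to a frame's columns (which is presumably what
# utils.add_same_val_index_level does with the featurizer name here -- an
# assumption, not that util's actual code) is pd.concat with keys on axis=1.
def _name_level_sketch():
    import pandas as pd
    df_ft = pd.DataFrame({'mean': [1.0], 'max': [2.0]})
    # Columns become ('vitals', 'mean') and ('vitals', 'max').
    return pd.concat([df_ft], axis=1, keys=['vitals'])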
def transform(self, df):
    if df.empty:
        return df
    logger.log('Nominal to OneHot', new_level=True)
    df = df.copy()  # work on a copy so the inplace drops don't mutate the caller's frame
    nominal_cols = df.columns.get_level_values(
        'variable_type') == variable_type.NOMINAL
    for col_name in df.loc[:, nominal_cols]:
        column = df[col_name]
        df.drop(col_name, axis=1, inplace=True)
        df_dummies = pd.get_dummies(column)
        if df_dummies.empty:
            continue
        # Suffix the category text onto the last column level, one indicator
        # column per category.
        dummy_col_names = [
            col_name[:-1] + ('{}_{}'.format(col_name[-1], text), )
            for text in df_dummies.columns
        ]
        df_dummies.columns = pd.MultiIndex.from_tuples(
            dummy_col_names, names=df.columns.names)
        df = df.join(df_dummies, how='outer')
    logger.end_log_level()
    return df
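# The expansion above in miniature (all level names and values hypothetical):
# one nominal column under a three-level MultiIndex becomes one indicator
# column per category, with the category text suffixed onto the last level.
def _onehot_sketch():
    import pandas as pd
    cols = pd.MultiIndex.from_tuples(
        [('labs', 'urine color', 'nominal')],
        names=['component', 'description', 'variable_type'])
    df = pd.DataFrame([['amber'], ['clear']], columns=cols)
    col_name = df.columns[0]
    df_dummies = pd.get_dummies(df[col_name])
    df_dummies.columns = pd.MultiIndex.from_tuples(
        [col_name[:-1] + ('{}_{}'.format(col_name[-1], text), )
         for text in df_dummies.columns],
        names=df.columns.names)
    # Columns are now ('labs', 'urine color', 'nominal_amber') and
    # ('labs', 'urine color', 'nominal_clear').
    return df_dummies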
def make_feature_set(self, ids, fit, y=None, **fit_params):
    logger.log('Make Feature Set. id_count={}, #features={}, components={}'.format(
        len(ids), len(self.featurizers), self.components), new_level=True)
    if fit:
        self.comp_preprocessors = [(c, self.preprocessor_pipeline(c))
                                   for c in self.components]
    adjusted_featurizers = [(ft_name, self.adjust_featurizer(ft))
                            for ft_name, ft in self.featurizers]
    pipeline_steps = [
        ('pre_processors', FeatureUnionDF(self.comp_preprocessors,
                                          add_name_level=False)),
        ('feature_union', FeatureUnionDF(adjusted_featurizers)),
        ('post_processor', self.post_processor),
    ]
    if self.should_fillna:
        pipeline_steps.append(('fillna', LocAndFillNaN(self.featurizers)))
    ft_union_pipeline = Pipeline(pipeline_steps)
    if fit:
        df = ft_union_pipeline.fit_transform(ids, y, **fit_params)
    else:
        df = ft_union_pipeline.transform(ids)
    logger.end_log_level()
    return df
def fit(self, df, y=None, **fit_params):
    logger.log('FIT Combine like columns {}'.format(df.shape), new_level=True)
    self.columns_to_combine = {}
    groupby_cols = list(df.columns.names)
    groupby_cols.remove(column_names.DESCRIPTION)
    grouped = df.groupby(level=groupby_cols, axis=1)
    for index, group in grouped:
        logger.log(index)
        # Skip nominal groups (position 2 of the group key is the
        # variable_type level).
        if index[2] == variable_type.NOMINAL:
            continue
        # Order the like columns by non-null count, densest first, so that
        # transform() prioritizes values from the densest column.
        ordered_cols = group[group.count().sort_values(
            ascending=False).index.tolist()].columns.tolist()
        self.columns_to_combine[index] = ordered_cols
    logger.end_log_level()
    return self
def transform(self, df):
    logger.log('Drop OOB data | {}'.format(df.shape), new_level=True)
    df = df.copy()
    idx = pd.IndexSlice
    df = df.sort_index(axis=1).sort_index()
    for component in df.columns.get_level_values('component').unique().tolist():
        component_defs = self.data_dict.defs_for_component(component)
        for units in df[component].columns.get_level_values(
                column_names.UNITS).unique().tolist():
            df_slice = df.loc[:, idx[component, :, :, units, :]]
            logger.log('{}, {}, {}'.format(component, units,
                                           df_slice.count().sum()))
            matching_defs = component_defs[component_defs.units == units]
            if matching_defs.empty:
                continue
            def_row = matching_defs.iloc[0]
            lower = def_row['lower']
            upper = def_row['upper']
            df.loc[:, idx[component, :, :, units, :]] = remove_oob_values(
                df_slice, lower, upper)
    df.dropna(how='all', inplace=True, axis=1)
    logger.end_log_level()
    return df
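# pd.IndexSlice selection as used above, in isolation (toy levels and values):
# pick every column whose first level is 'hr' and whose third level is 'bpm',
# regardless of the middle level. Partial slicing on a column MultiIndex
# requires lexsorted columns, hence the sort_index.
def _indexslice_sketch():
    import pandas as pd
    idx = pd.IndexSlice
    cols = pd.MultiIndex.from_tuples(
        [('hr', 'a', 'bpm'), ('hr', 'b', 'bpm'), ('wbc', 'a', 'K/uL')],
        names=['component', 'description', 'units'])
    df = pd.DataFrame([[60.0, 70.0, 5.0]], columns=cols).sort_index(axis=1)
    return df.loc[:, idx['hr', :, 'bpm']]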
def transform(self, df):
    logger.log('TRANSFORM Combine like columns {}'.format(df.shape),
               new_level=True)
    for index, columns in self.columns_to_combine.iteritems():
        logger.log(index)
        df_list = []
        for col_name in columns:
            if col_name not in df.columns:
                df[col_name] = pd.np.nan
            col = df[col_name].dropna()
            col.name = index + (ALL, )
            df_list.append(col)
        df_combined = pd.concat(df_list).to_frame()
        # Drop duplicate index entries, keeping the first occurrence. Because
        # fit() sorted the densest column to the front, its values win any
        # conflict: a simple rule that is right most of the time.
        duplicates_to_drop = df_combined.index.duplicated(keep='first')
        df_combined = df_combined.loc[~duplicates_to_drop]
        # Drop the source columns and join the combined column back in.
        df.drop(columns, axis=1, inplace=True)
        df = df.join(df_combined, how='outer')
    df.sort_index(inplace=True)
    df.sort_index(inplace=True, axis=1)
    logger.end_log_level()
    return df
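# The deduplication rule above in isolation: pd.concat stacks the per-column
# Series, so an index entry present in more than one source column appears
# twice; keep='first' keeps the value from whichever column was sorted to the
# front. (Toy data below.)
def _dedup_sketch():
    import pandas as pd
    dense = pd.Series([7.1], index=['2100-01-01'])
    sparse = pd.Series([6.9], index=['2100-01-01'])
    combined = pd.concat([dense, sparse]).to_frame()
    deduped = combined.loc[~combined.index.duplicated(keep='first')]
    return deduped  # keeps 7.1, the value from the first (denser) column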
def dask_open_and_join(hdf5_fname, path, components, ids=ALL, chunksize=500000):
    logger.log('DASK OPEN & JOIN n={} components: {}'.format(
        len(components), components), new_level=True)
    df_all = None
    for i, component in enumerate(components, 1):
        logger.log('{}: {}/{}'.format(component.upper(), i, len(components)),
                   new_level=True)
        df_comp = open_df(hdf5_fname, '{}/{}'.format(path, component))
        df_comp.sort_index(inplace=True)
        df_comp.sort_index(inplace=True, axis=1)
        if ids != ALL:
            df_comp = df_comp[df_comp.index.get_level_values(
                column_names.ID).isin(ids)]

        logger.log('Convert to dask - {}'.format(df_comp.shape))
        df_dask = dd.from_pandas(df_comp.reset_index(), chunksize=chunksize)
        del df_comp

        logger.log('Join to big DF')
        if df_all is None:
            df_all = df_dask
        else:
            df_all = df_all.merge(df_dask, how='outer', on=['id', 'datetime'])
        del df_dask
        logger.end_log_level()

    logger.log('Dask DF back to pandas')
    df_pd = df_all.compute()
    del df_all
    df_pd.set_index(['id', 'datetime'], inplace=True)

    logger.log('SORT Joined DF')
    df_pd.sort_index(inplace=True)
    df_pd.sort_index(inplace=True, axis=1)
    logger.end_log_level()
    return df_pd
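# The pandas -> dask -> pandas round trip used above, on toy frames. The
# column names mirror the real merge keys ('id', 'datetime'); the data is
# hypothetical.
def _dask_join_sketch():
    import pandas as pd
    import dask.dataframe as dd
    dt = pd.to_datetime('2100-01-01')
    left = pd.DataFrame({'id': [1, 2], 'datetime': [dt, dt], 'hr': [80.0, 90.0]})
    right = pd.DataFrame({'id': [2, 3], 'datetime': [dt, dt], 'wbc': [5.0, 7.5]})
    merged = dd.from_pandas(left, chunksize=2).merge(
        dd.from_pandas(right, chunksize=2), how='outer', on=['id', 'datetime'])
    # Nothing runs until .compute(); the merge graph executes chunk-wise.
    return merged.compute().set_index(['id', 'datetime']).sort_index()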
def etl(self, components, save_steps=False, overwrite=False):
    if not overwrite:
        components = self.get_unloaded_components(components)
    if len(components) == 0:
        return None
    all_etl_info = []
    logger.log('BEGIN ETL for {} components: {}'.format(
        len(components), components), new_level=True)
    for i, component in enumerate(components, 1):
        logger.log('{}: {}/{}'.format(component.upper(), i, len(components)),
                   new_level=True)

        logger.log('Extract...', new_level=True)
        df_extracted = self.extract(component)
        logger.end_log_level()

        logger.log('Transform...', new_level=True)
        df_transformed = self.transform(df_extracted, component)
        logger.end_log_level()

        logger.log('Clean...', new_level=True)
        df = self.cleaners.fit_transform(df_transformed.copy())
        logger.end_log_level()

        logger.log('Save DataFrames...', new_level=True)
        if save_steps:
            logger.log('Save EXTRACTED DF: {}'.format(df_extracted.shape))
            df_extracted.to_hdf(self.hdf5_fname,
                                '{}/{}'.format(component, 'extracted'))
            logger.log('Save TRANSFORMED DF: {}'.format(df_transformed.shape))
            df_transformed.to_hdf(self.hdf5_fname,
                                  '{}/{}'.format(component, 'transformed'))
        logger.log('Save FINAL DF: {}'.format(df.shape))
        utils.deconstruct_and_write(df, self.hdf5_fname, path=component)
        logger.end_log_level()

        etl_info = self.get_etl_info(component, df_extracted, df_transformed, df)
        all_etl_info.append(etl_info)
        del df_extracted, df_transformed, df
        logger.end_log_level()
    logger.end_log_level()
    return pd.DataFrame(all_etl_info)
def ETL(extractor, components, data_dict, same_dt_aggregator, hdf5_fname=None,
        joined_path=None, hadm_ids=ALL, use_base_df=True, to_pandas=False,
        chunksize=500000):
    logger.log('***ETL***', new_level=True)
    logger.log('SETUP', new_level=True)
    category_map = mimic_category_map(data_dict)
    ureg = units.MedicalUreg()
    transformer = transform_pipeline()
    standard_clean_pipeline = Pipeline([
        ('aggregate_same_datetime', same_dt_aggregator),
        ('split_dtype', transformers.split_dtype()),
        ('standardize_columns', transformers.column_standardizer(data_dict, ureg)),
        ('standardize_categories',
         transformers.standardize_categories(data_dict, category_map)),
        ('split_bad_categories', transformers.split_bad_categories(data_dict)),
        # ('one_hotter', transformers.nominal_to_onehot()),
        ('drop_oob_values', transformers.oob_value_remover(data_dict)),
    ])

    should_save = (hdf5_fname is not None)

    df_base = None
    if should_save and use_base_df:
        try:
            df_base = utils.open_df(hdf5_fname, joined_path)
        except Exception:
            pass

    if df_base is not None:
        existing_components = df_base.columns.get_level_values(
            column_names.COMPONENT).unique().tolist()
        existing_ids = set(
            df_base.index.get_level_values(column_names.ID).tolist())
        requested_ids = hadm_ids if hadm_ids != ALL else get_all_hadm_ids()
        new_ids = [ID for ID in requested_ids if ID not in existing_ids]

        # Case 1: new ids for the existing columns. Don't try to be smart
        # with ALL unless the number of IDs is small.
        if len(new_ids) > 0:
            df_addition = ETL(extractor, existing_components, data_dict,
                              same_dt_aggregator, hadm_ids=new_ids,
                              to_pandas=True)
            if df_addition is not None:
                df_base = pd.concat([df_base, df_addition])

        # Only NEW components still need to be loaded.
        components = [comp for comp in components
                      if comp not in existing_components]

        logger.log('Base DF to Dask')
        df_base = dd.from_pandas(df_base.reset_index(), chunksize=chunksize)

    df_all = df_base
    logger.log('BEGIN ETL for {} admissions and {} components: {}'.format(
        hadm_ids if hadm_ids == ALL else len(hadm_ids),
        len(components), components), new_level=True, end_level=True)

    for i, component in enumerate(components, 1):
        logger.log('{}: {}/{}'.format(component.upper(), i, len(components)),
                   new_level=True)
        """
        @@@@@@@@@@@@@@@
        ----EXTRACT----
        @@@@@@@@@@@@@@@
        """
        logger.log("Extracting...", new_level=True)
        df_extracted = extractor.extract_component(component, hadm_ids)

        if df_extracted.empty:
            print 'EMPTY Dataframe EXTRACTED for {}, n={} ids'.format(
                component, len(hadm_ids))
            logger.end_log_level()
            continue

        if should_save:
            logger.log('Save EXTRACTED DF = {}'.format(df_extracted.shape))
            utils.save_df(df_extracted, hdf5_fname,
                          'extracted/{}'.format(component))
        logger.end_log_level()

        """
        @@@@@@@@@@@@@@@@@
        ----TRANSFORM----
        @@@@@@@@@@@@@@@@@
        """
        logger.log("Transforming... {}".format(df_extracted.shape),
                   new_level=True)
        transformer.set_params(add_level__level_val=component)
        df_transformed = transformer.transform(df_extracted)
        print 'Data Loss (Extract > Transformed):', utils.data_loss(
            df_extracted.set_index(column_names.ID).value.to_frame(),
            df_transformed)

        if df_transformed.empty:
            print 'EMPTY Dataframe TRANSFORMED for {}, n={} ids'.format(
                component, len(hadm_ids))
            logger.end_log_level()
            continue

        if should_save:
            logger.log('Save TRANSFORMED DF = {}'.format(df_transformed.shape))
            utils.save_df(df_transformed, hdf5_fname,
                          'transformed/{}'.format(component))
        logger.end_log_level()

        """
        @@@@@@@@@@@@@@@
        -----CLEAN-----
        @@@@@@@@@@@@@@@
        """
        logger.log("Cleaning... {}".format(df_transformed.shape),
                   new_level=True)
        df = standard_clean_pipeline.transform(df_transformed)
        print 'Data Loss (Extract > Cleaned):', utils.data_loss(
            df_extracted.set_index(column_names.ID).value.to_frame(), df)

        if df.empty:
            print 'EMPTY Dataframe CLEANED for {}, n={} ids'.format(
                component, len(hadm_ids))
            logger.end_log_level()
            continue

        if should_save:
            logger.log('Save CLEANED DF = {}'.format(df.shape))
            utils.save_df(df, hdf5_fname, 'cleaned/{}'.format(component))
        logger.end_log_level()

        del df_extracted, df_transformed

        logger.log('Filter & sort - {}'.format(df.shape))
        df.sort_index(inplace=True)
        df.sort_index(inplace=True, axis=1)

        logger.log('Convert to dask - {}'.format(df.shape))
        df_dask = dd.from_pandas(df.reset_index(), chunksize=chunksize)
        del df

        logger.log('Join to big DF')
        if df_all is None:
            df_all = df_dask
        else:
            df_all = df_all.merge(df_dask, how='outer', on=['id', 'datetime'])
        del df_dask
        logger.end_log_level()
    logger.end_log_level()

    if df_all is None or not to_pandas:
        logger.end_log_level()
        return df_all

    logger.log('Dask DF back to pandas')
    df_pd = df_all.compute()
    del df_all
    df_pd.set_index(['id', 'datetime'], inplace=True)

    logger.log('SORT Joined DF')
    df_pd.sort_index(inplace=True)
    df_pd.sort_index(inplace=True, axis=1)

    if should_save:
        logger.log('SAVE Big DF')
        utils.save_df(df_pd, hdf5_fname, joined_path)

    logger.end_log_level()
    return df_pd
def transform(self, df):
    logger.log('Clean UOM', new_level=True)
    df = clean_uom(df, self.component, self.data_dict)
    logger.end_log_level()
    return df