Example #1
    def fit(self, df, y=None, **fit_params):
        logger.log('*fit* Filter columns ({}) {}'.format(
            self.__class__.__name__, df.shape),
                   new_level=True)
        if df.empty:
            self.cols_to_keep = []
        else:
            self.cols_to_keep = self.get_columns_to_keep(df, y, **fit_params)
        logger.end_log_level()
        return self
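
All of these examples rely on a project-specific `logger` module that nests output via `new_level=True` and `logger.end_log_level()`. That module is not shown here; a minimal stand-in with the same call signatures, useful only for running the snippets in isolation, might look like the following hypothetical sketch.

    # Hypothetical stand-in for the project logger used throughout these
    # examples. Only the calls that actually appear in the snippets are
    # mimicked: log(msg, new_level=True), log(end_prev=True, start=False),
    # and end_log_level(). The real implementation may differ.
    class IndentLogger(object):
        def __init__(self):
            self.depth = 0

        def log(self, msg='', new_level=False, end_prev=False, start=True):
            # end_prev/start only control whether a message is emitted here
            if start and msg != '':
                print('  ' * self.depth + str(msg))
            if new_level:
                self.depth += 1

        def end_log_level(self):
            self.depth = max(0, self.depth - 1)

    logger = IndentLogger()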
Example #2
    def transform(self, df):
        logger.log('Segment df {}'.format(df.shape), new_level=True)

        logger.log('Get Segments')
        df_segments = self.__segment(df)
        logger.log('Apply n={} Segments to df.shape = {}'.format(
            df_segments.shape[0], df.shape))
        out_df = apply_segments(df, df_segments)
        logger.end_log_level()

        return out_df
Example #3
    def get_etl_info_df(self, components):
        """
        Each component must have been extracted with save_steps = True
        """
        all_etl_info = []
        for comp in components:
            logger.log(comp,new_level=True)
            etl_info = self.get_etl_info(comp)
            all_etl_info.append(etl_info)
            logger.end_log_level()

        return pd.DataFrame(all_etl_info)
Example #4
    def transform(self, X):
        logger.log('Load data from component: {}'.format(self.component.upper()),new_level=True)
        if isinstance(X,pd.DataFrame) or isinstance(X,pd.Series):
            X = X.index
        if isinstance(X, pd.Index):
            ids=X.get_level_values(column_names.ID).unique().tolist()
        else: ids=X

        df_component = self.etl_manager.open_df(self.component,ids=ids)

        logger.end_log_level()

        return df_component
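
The input handling above reduces whatever is passed in (DataFrame, Series, Index, or a plain id list) to a unique list of ids. A toy version of that reduction, with the level name 'id' standing in for column_names.ID:

    # Toy illustration of normalizing the transform input to a list of ids.
    # The level name 'id' stands in for column_names.ID.
    import pandas as pd

    index = pd.MultiIndex.from_tuples([(1, 0), (1, 1), (2, 0)],
                                      names=['id', 'datetime'])
    X = pd.DataFrame({'value': [7, 8, 9]}, index=index)

    if isinstance(X, (pd.DataFrame, pd.Series)):
        X = X.index
    if isinstance(X, pd.Index):
        ids = X.get_level_values('id').unique().tolist()
    else:
        ids = X
    print(ids)  # [1, 2]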
Example #5
def smart_join(hdf5_fname, paths, joined_path, ids,
               chunksize=5000,
               need_deconstruct=True,
               hdf5_fname_for_join=None,
               overwrite=True):

    logger.log('Smart join: n={}, {}'.format(len(ids),paths),new_level=True)

    if hdf5_fname_for_join is None: hdf5_fname_for_join=hdf5_fname

    store = pd.HDFStore(hdf5_fname_for_join)
    if (joined_path in store):
        if overwrite: del store[joined_path]
        else :
            store.close()
            logger.end_log_level()
            return hdf5_fname_for_join
    #sort ids, should speed up where clauses and selects
    ids = sorted(ids)

    #do chunked join
    logger.log('JOINING dataframes',new_level=True)
    for ix_start in range(0,len(ids),chunksize):
        ix_end = min(ix_start + chunksize,len(ids))
        id_slice = ids[ix_start:ix_end]

        where = '{id_col} in {id_list}'.format(id_col=column_names.ID,id_list=id_slice)

        logger.log('Slice & Join: {} --> {}, n={}'.format(id_slice[0], id_slice[-1],len(id_slice)),new_level=True)
        df_slice = None
        # for path in df_dict.keys():
        for path in paths:
            try:
                logger.log(path)
                if need_deconstruct: slice_to_add = read_and_reconstruct(hdf5_fname,path,where=where)
                else: slice_to_add = pd.read_hdf(hdf5_fname,path,where=where)
            except KeyError as err:
                logger.log(end_prev=True,start=False)
                print err
                continue

            if df_slice is None: df_slice = slice_to_add
            else:
                df_slice = df_slice.join(slice_to_add,how='outer')
                del slice_to_add

        logger.end_log_level()
        logger.log('Append slice')

        if need_deconstruct: deconstruct_and_write(df_slice,hdf5_fname_for_join,joined_path,append=True)
        else: df_slice.to_hdf(hdf5_fname_for_join,joined_path,append=True,format='t')

        del df_slice

    logger.end_log_level()
    logger.end_log_level()

    return hdf5_fname_for_join
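
smart_join depends on project helpers (read_and_reconstruct, deconstruct_and_write, column_names), but the chunked reads it performs use the standard HDFStore where-clause syntax. A self-contained illustration of that part, with a made-up file name, key, and id column:

    # Toy demonstration of the chunked where-clause pattern used by smart_join.
    # 'demo.h5', the key 'events', and the column 'id' are illustrative only;
    # the table format requires PyTables.
    import pandas as pd

    df = pd.DataFrame({'id': [1, 1, 2, 3, 3, 4], 'value': range(6)})
    df.to_hdf('demo.h5', key='events', format='table', data_columns=['id'])

    ids = sorted([4, 1, 3])
    chunksize = 2
    for ix_start in range(0, len(ids), chunksize):
        id_slice = ids[ix_start:ix_start + chunksize]
        where = '{id_col} in {id_list}'.format(id_col='id', id_list=id_slice)
        chunk = pd.read_hdf('demo.h5', key='events', where=where)
        print(chunk.shape)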
Example #6
    def do_union(self,X, is_fit, y=None, **fit_params):

        logger.log('Begin union for {} transformers'.format(len(self.featurizers)),new_level=True)
        df_features = None

        for f in self.featurizers:
            logger.log(f[0],new_level=True)
            
            if is_fit: df_ft = f[1].fit_transform(X)
            else: df_ft = f[1].transform(X)
            if self.add_name_level:
                df_ft = utils.add_same_val_index_level(df_ft,level_val=f[0],level_name=FEATURE_LEVEL,axis=1)
            if df_features is None: df_features = df_ft
            else: df_features = df_features.join(df_ft,how='outer')
            del df_ft

            logger.end_log_level()
        logger.end_log_level()
        return df_features
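
The accumulating outer join that do_union performs over featurizer outputs can be seen on two toy frames:

    # Toy version of the accumulating outer join used in do_union: each
    # featurizer's output is joined onto the running feature frame.
    import pandas as pd

    feature_frames = [
        pd.DataFrame({'f1': [1, 2]}, index=['a', 'b']),
        pd.DataFrame({'f2': [3, 4]}, index=['b', 'c']),
    ]

    df_features = None
    for df_ft in feature_frames:
        if df_features is None:
            df_features = df_ft
        else:
            df_features = df_features.join(df_ft, how='outer')
    print(df_features)  # rows a, b, c with NaN where a featurizer had no value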
Example #7
    def transform(self, df):
        if df.empty: return df

        logger.log('Nominal to OneHot', new_level=True)
        nominal_cols = df.columns.get_level_values(
            'variable_type') == variable_type.NOMINAL

        for col_name in df.loc[:, nominal_cols]:
            column = df[col_name]
            df.drop(col_name, axis=1, inplace=True)
            df_dummies = pd.get_dummies(column)
            if df_dummies.empty: continue
            dummy_col_names = [
                col_name[:-1] + ('{}_{}'.format(col_name[-1], text), )
                for text in df_dummies.columns
            ]
            df_dummies.columns = pd.MultiIndex.from_tuples(
                dummy_col_names, names=df.columns.names)
            df = df.join(df_dummies, how='outer')
        logger.end_log_level()
        return df
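
The dummy-column renaming above assumes the frame has MultiIndex columns whose last level is the description being expanded. A toy run of the same steps, with invented level names and values:

    # Toy illustration of one-hot encoding a single column of a frame with
    # MultiIndex columns, mirroring the transform above. The level names
    # ('component', 'description') and the values are invented for the demo.
    import pandas as pd

    columns = pd.MultiIndex.from_tuples([('vitals', 'rhythm')],
                                        names=['component', 'description'])
    df = pd.DataFrame([['sinus'], ['afib'], ['sinus']], columns=columns)

    col_name = ('vitals', 'rhythm')
    column = df[col_name]
    df = df.drop(col_name, axis=1)

    df_dummies = pd.get_dummies(column)
    dummy_col_names = [col_name[:-1] + ('{}_{}'.format(col_name[-1], text),)
                       for text in df_dummies.columns]
    df_dummies.columns = pd.MultiIndex.from_tuples(dummy_col_names,
                                                   names=columns.names)
    df = df.join(df_dummies, how='outer')
    print(df.columns.tolist())  # [('vitals', 'rhythm_afib'), ('vitals', 'rhythm_sinus')]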
Example #8
    def make_feature_set(self, ids, fit, y=None, **fit_params):
        logger.log("Make Feature Set. id_count={}, #features={}, components=".format(len(ids),len(self.featurizers),self.components),new_level=True)
        if fit:
            self.comp_preprocessors = [(c,self.preprocessor_pipeline(c)) for c in self.components]

        adjusted_featurizers = [(ft_name,self.adjust_featurizer(ft)) for ft_name,ft in self.featurizers]

        pipeline_steps = [
            ('pre_processors',FeatureUnionDF(self.comp_preprocessors, add_name_level=False)),
            ('feature_union',FeatureUnionDF(adjusted_featurizers)),
            ('post_processor',self.post_processor),
        ]

        if self.should_fillna:
            pipeline_steps.append(('fillna',LocAndFillNaN(self.featurizers)))

        ft_union_pipeline = Pipeline(pipeline_steps)
        if fit: df = ft_union_pipeline.fit_transform(ids, y, **fit_params)
        else: df = ft_union_pipeline.transform(ids)

        logger.end_log_level()
        return df
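
make_feature_set builds a Pipeline from a list of (name, transformer) steps and then branches on the fit flag; whether the project uses scikit-learn's Pipeline or a DataFrame-aware variant, the branching convention is the same. A compact sketch with generic stand-in steps:

    # Sketch of building a pipeline from a step list and branching on a fit
    # flag, as make_feature_set does. The steps are generic stand-ins, not
    # the project's preprocessors or featurizers.
    import numpy as np
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.decomposition import PCA

    pipeline_steps = [('scale', StandardScaler()), ('pca', PCA(n_components=2))]
    pipe = Pipeline(pipeline_steps)

    X = np.random.rand(10, 4)
    fit = True
    Xt = pipe.fit_transform(X) if fit else pipe.transform(X)
    print(Xt.shape)  # (10, 2)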
Example #9
    def fit(self, df, y=None, **fit_params):
        logger.log('FIT Combine like columns {}'.format(df.shape),
                   new_level=True)

        self.columns_to_combine = {}
        groupby_cols = list(df.columns.names)
        groupby_cols.remove(column_names.DESCRIPTION)
        grouped = df.groupby(level=groupby_cols, axis=1)

        for index, group in grouped:
            logger.log(index)
            if index[2] == variable_type.NOMINAL: continue

            ordered_cols = group[group.count().sort_values(
                ascending=False).index.tolist()].columns.tolist()
            self.columns_to_combine[index] = ordered_cols

        logger.end_log_level()
        return self
Example #10
    def transform(self, df):
        logger.log('Drop OOB data | {}'.format(df.shape), new_level=True)
        df = df.copy()
        idx = pd.IndexSlice
        df = df.sort_index(axis=1).sort_index()
        for component in df.columns.get_level_values(
                'component').unique().tolist():
            component_defs = self.data_dict.defs_for_component(component)
            for units in df[component].columns.get_level_values(
                    column_names.UNITS).unique().tolist():
                df_slice = df.loc[:, idx[component, :, :, units, :]]
                logger.log('{}, {}, {}'.format(component, units,
                                               df_slice.count().sum()))
                matching_defs = component_defs[component_defs.units == units]
                if matching_defs.empty: continue
                def_row = matching_defs.iloc[0]
                lower = def_row['lower']
                upper = def_row['upper']
                df.loc[:, idx[component, :, :, units, :]] = remove_oob_values(
                    df_slice, lower, upper)
        df.dropna(how='all', inplace=True, axis=1)
        logger.end_log_level()
        return df
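
remove_oob_values is project code that is not shown in these examples; one plausible implementation of the idea (masking values outside [lower, upper] to NaN so the dropna afterwards can discard empty columns) is:

    # Hypothetical sketch of remove_oob_values: mask numeric values that fall
    # outside [lower, upper] to NaN. The project's actual helper may differ.
    def remove_oob_values(df_slice, lower, upper):
        return df_slice.where((df_slice >= lower) & (df_slice <= upper))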
Example #11
    def transform(self, df):
        logger.log('TRANSFORM Combine like columns {}'.format(df.shape),
                   new_level=True)

        for index, columns in self.columns_to_combine.iteritems():
            logger.log(index)
            df_list = []
            for col_name in columns:
                if col_name not in df.columns:
                    df[col_name] = pd.np.nan
                col = df[col_name].dropna()
                col.name = index + (ALL, )
                df_list.append(col)

            df_combined = pd.concat(df_list).to_frame()

            # Drop duplicate index entries, keeping the first occurrence.
            # Because the columns were ordered by non-null count (most
            # complete first) during fit, values from the most complete
            # column take precedence when timestamps collide.
            duplicates_to_drop = df_combined.index.duplicated(keep='first')
            df_combined = df_combined.loc[~duplicates_to_drop]

            #drop the combined columns
            df.drop(columns, axis=1, inplace=True)

            #join the combined column back to the DF
            df = df.join(df_combined, how='outer')

        df.sort_index(inplace=True)
        df.sort_index(inplace=True, axis=1)

        logger.end_log_level()

        return df
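
The keep='first' duplicate drop above only behaves as described because the columns were ordered by non-null count during fit; a toy demonstration:

    # Toy illustration of the "concat then drop duplicated index" pattern:
    # because the most complete column is concatenated first, its values win
    # wherever two columns share an index entry.
    import numpy as np
    import pandas as pd

    a = pd.Series([1.0, np.nan, 3.0], index=[0, 1, 2], name='a')   # 2 non-null
    b = pd.Series([10.0, 20.0, 30.0], index=[0, 1, 2], name='b')   # 3 non-null

    ordered = sorted([a, b], key=lambda s: s.count(), reverse=True)  # b first
    combined = pd.concat([s.dropna() for s in ordered])
    combined = combined[~combined.index.duplicated(keep='first')]
    print(combined.sort_index().tolist())  # [10.0, 20.0, 30.0] -- all from b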
Example #12
def dask_open_and_join(hdf5_fname,path,components,ids=ALL,chunksize=500000):

    df_all=None
    logger.log('DASK OPEN & JOIN n={} components: {}'.format(len(components),components),new_level=True)
    for component in components:
        logger.log('{}: {}/{}'.format(component.upper(),components.index(component)+1,len(components)),new_level=True)

        df_comp = open_df(hdf5_fname,'{}/{}'.format(path,component))
        df_comp.sort_index(inplace=True)
        df_comp.sort_index(inplace=True, axis=1)

        if not ids == ALL:
            df_comp = df_comp[df_comp.index.get_level_values(column_names.ID).isin(ids)]

        logger.log('Convert to dask - {}'.format(df_comp.shape))
        df_dask = dd.from_pandas(df_comp.reset_index(), chunksize=chunksize)
        del df_comp

        logger.log('Join to big DF')

        if df_all is None: df_all = df_dask
        else :
            df_all = df_all.merge(df_dask,how='outer', on=['id','datetime'])
            del df_dask
        logger.end_log_level()

    logger.log('Dask DF back to pandas')
    df_pd = df_all.compute()
    del df_all
    df_pd.set_index(['id','datetime'], inplace=True)

    logger.log('SORT Joined DF')
    df_pd.sort_index(inplace=True)
    df_pd.sort_index(inplace=True, axis=1)
    logger.end_log_level()
    return df_pd
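
The pandas-to-dask round trip in dask_open_and_join (from_pandas, chunked outer merge on the id/datetime columns, compute back to pandas) can be exercised on toy frames:

    # Toy round trip of the pandas -> dask -> outer merge -> pandas pattern
    # used by dask_open_and_join; the frames and values are made up, but the
    # 'id'/'datetime' join keys mirror the code above.
    import pandas as pd
    import dask.dataframe as dd

    left = pd.DataFrame({'id': [1, 1, 2], 'datetime': [0, 1, 0], 'hr': [80, 82, 77]})
    right = pd.DataFrame({'id': [1, 2, 2], 'datetime': [0, 0, 1], 'sbp': [120, 118, 121]})

    d_left = dd.from_pandas(left, chunksize=2)
    d_right = dd.from_pandas(right, chunksize=2)

    joined = d_left.merge(d_right, how='outer', on=['id', 'datetime'])
    df_pd = joined.compute().set_index(['id', 'datetime']).sort_index()
    print(df_pd)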
Example #13
    def etl(self, components, save_steps=False, overwrite=False):
        if not overwrite:
            components = self.get_unloaded_components(components)
        if len(components) == 0: return None
        all_etl_info = []

        logger.log('BEGIN ETL for {} components: {}'.format(len(components),components),new_level=True)
        for component in components:
            logger.log('{}: {}/{}'.format(component.upper(),components.index(component)+1,len(components)),new_level=True)

            logger.log('Extract...',new_level=True)
            df_extracted = self.extract(component)
            logger.end_log_level()

            logger.log('Transform...',new_level=True)
            df_transformed = self.transform(df_extracted,component)
            logger.end_log_level()

            logger.log('Clean...',new_level=True)
            df = self.cleaners.fit_transform(df_transformed.copy())
            logger.end_log_level()

            logger.log('Save DataFrames...',new_level=True)
            if save_steps:
                logger.log('Save EXTRACTED DF: {}'.format(df_extracted.shape))
                df_extracted.to_hdf(self.hdf5_fname,'{}/{}'.format(component,'extracted'))

                logger.log('Save TRANSFORMED DF: {}'.format(df_transformed.shape))
                df_transformed.to_hdf(self.hdf5_fname,'{}/{}'.format(component,'transformed'))

            logger.log('Save FINAL DF: {}'.format(df.shape))
            utils.deconstruct_and_write(df,self.hdf5_fname,path=component)
            logger.end_log_level()

            etl_info = self.get_etl_info(component,df_extracted,df_transformed,df)
            all_etl_info.append(etl_info)

            del df_extracted,df_transformed,df

            logger.end_log_level()

        logger.end_log_level()
        return pd.DataFrame(all_etl_info)
Example #14
def ETL(extractor,
        components,
        data_dict,
        same_dt_aggregator,
        hdf5_fname=None,
        joined_path=None,
        hadm_ids=ALL,
        use_base_df=True,
        to_pandas=False,
        chunksize=500000):

    logger.log('***ETL***', new_level=True)
    logger.log('SETUP', new_level=True)

    category_map = mimic_category_map(data_dict)
    ureg = units.MedicalUreg()

    transformer = transform_pipeline()

    standard_clean_pipeline = Pipeline([
        ('aggregate_same_datetime', same_dt_aggregator),
        ('split_dtype', transformers.split_dtype()),
        ('standardize_columns',
         transformers.column_standardizer(data_dict, ureg)),
        ('standardize_categories',
         transformers.standardize_categories(data_dict, category_map)),
        ('split_bad_categories', transformers.split_bad_categories(data_dict)),
        # ('one_hotter',transformers.nominal_to_onehot()),
        ('drop_oob_values', transformers.oob_value_remover(data_dict))
    ])

    should_save = (hdf5_fname is not None)

    df_base = None

    if should_save & use_base_df:
        try:
            df_base = utils.open_df(hdf5_fname, joined_path)
        except:
            pass

    if df_base is not None:

        existing_components = df_base.columns.get_level_values(
            column_names.COMPONENT).unique().tolist()
        existing_ids = set(
            df_base.index.get_level_values(column_names.ID).tolist())
        requested_ids = hadm_ids if hadm_ids != ALL else get_all_hadm_ids()

        new_ids = [ID for ID in requested_ids if ID not in existing_ids]

        #case 1: new ids in existing columns, don't try to be smart with ALL unless not a lot of IDs
        if len(new_ids) > 0:
            df_addition = ETL(extractor,
                              existing_components,
                              data_dict,
                              same_dt_aggregator,
                              hadm_ids=new_ids,
                              to_pandas=True)
            if df_addition is not None:
                df_base = pd.concat([df_base, df_addition])
            #now we only need to load NEW components
            components = [
                comp for comp in components if comp not in existing_components
            ]

        logger.log('Base DF to Dask')
        df_base = dd.from_pandas(df_base.reset_index(), chunksize=chunksize)

    df_all = df_base

    logger.log('BEGIN ETL for {} admissions and {} components: {}'.format(
        hadm_ids if hadm_ids == ALL else len(hadm_ids), len(components),
        components),
               new_level=True,
               end_level=True)
    for component in components:
        logger.log('{}: {}/{}'.format(component.upper(),
                                      components.index(component) + 1,
                                      len(components)),
                   new_level=True)
        """
        @@@@@@@@@@@@@@@
        ----EXTRACT----
        @@@@@@@@@@@@@@@
        """

        logger.log("Extracting...", new_level=True)
        df_extracted = extractor.extract_component(component, hadm_ids)

        if df_extracted.empty:
            print 'EMPTY Dataframe EXTRACTED for {}, n={} ids'.format(
                component, len(hadm_ids))
            logger.end_log_level()
            continue

        if should_save:
            logger.log('Save EXTRACTED DF = {}'.format(df_extracted.shape))
            utils.save_df(df_extracted, hdf5_fname,
                          'extracted/{}'.format(component))
        logger.end_log_level()
        """
        @@@@@@@@@@@@@@@@@
        ----TRANSFORM----
        @@@@@@@@@@@@@@@@@
        """

        logger.log("Transforming... {}".format(df_extracted.shape),
                   new_level=True)
        transformer.set_params(add_level__level_val=component)
        df_transformed = transformer.transform(df_extracted)

        print 'Data Loss (Extract > Transformed):', utils.data_loss(
            df_extracted.set_index(column_names.ID).value.to_frame(),
            df_transformed)

        if df_transformed.empty:
            print 'EMPTY Dataframe TRANSFORMED for {}, n={} ids'.format(
                component, len(hadm_ids))
            logger.end_log_level()
            continue

        if should_save:
            logger.log('Save TRANSFORMED DF = {}'.format(df_transformed.shape))
            utils.save_df(df_transformed, hdf5_fname,
                          'transformed/{}'.format(component))
        logger.end_log_level()
        """
        @@@@@@@@@@@@@@@
        -----CLEAN-----
        @@@@@@@@@@@@@@@
        """

        logger.log("Cleaning... {}".format(df_transformed.shape),
                   new_level=True)
        df = standard_clean_pipeline.transform(df_transformed)

        print 'Data Loss (Extract > Cleaned):', utils.data_loss(
            df_extracted.set_index(column_names.ID).value.to_frame(), df)

        if df.empty:
            print 'EMPTY Dataframe CLEANED for {}, n={} ids'.format(
                component, len(hadm_ids))
            logger.end_log_level()
            continue

        if should_save:
            logger.log('Save CLEANED DF = {}'.format(df.shape))
            utils.save_df(df, hdf5_fname, 'cleaned/{}'.format(component))
        logger.end_log_level()

        del df_extracted, df_transformed

        logger.log('Filter & sort - {}'.format(df.shape))

        df.sort_index(inplace=True)
        df.sort_index(inplace=True, axis=1)

        logger.log('Convert to dask - {}'.format(df.shape))
        df_dask = dd.from_pandas(df.reset_index(), chunksize=chunksize)
        del df

        logger.log('Join to big DF')

        if df_all is None: df_all = df_dask
        else:
            df_all = df_all.merge(df_dask, how='outer', on=['id', 'datetime'])
            del df_dask

        logger.end_log_level()
    logger.end_log_level()

    if df_all is None or not to_pandas:
        logger.end_log_level()
        return df_all

    logger.log('Dask DF back to pandas')
    df_pd = df_all.compute()
    del df_all
    df_pd.set_index(['id', 'datetime'], inplace=True)

    logger.log('SORT Joined DF')
    df_pd.sort_index(inplace=True)
    df_pd.sort_index(inplace=True, axis=1)

    if should_save:
        logger.log('SAVE Big DF')
        utils.save_df(df_pd, hdf5_fname, joined_path)
    logger.end_log_level()

    return df_pd
Example #15
    def transform(self, df):
        logger.log('Clean UOM', new_level=True)
        df = clean_uom(df, self.component, self.data_dict)
        logger.end_log_level()
        return df