Example #1
0
    def _select_stores(self):
        """
        Open the HDF5 files backing this handler and return them by name.

        The directory ``<base_path>/<kind>`` is created if needed, then the
        files to open are chosen from ``self.months``:

        * ``None``: every existing ``*.h5`` file in that directory.
        * a sequence whose first element is a ``list``: one file per chunk,
          named ``make_chunk_name(chunk) + '.h5'``.
        * any other non-empty sequence: one file per month, named
          ``self.pre + month + '.h5'``.

        Returns
        -------
        dict
            Maps each file's base name (extension stripped) to the
            ``pd.HDFStore`` opened on it.  Empty when ``self.months`` is an
            empty sequence or when no ``.h5`` files exist yet.
        """
        pre = self.pre
        months = self.months

        with_kind = os.path.join(self.base_path, self.kind)
        # makedirs(exist_ok=True) also creates missing parents and does not
        # fail if another process creates the directory first.
        os.makedirs(with_kind, exist_ok=True)

        if months is None:
            paths = [os.path.join(with_kind, f)
                     for f in os.listdir(with_kind) if f.endswith('.h5')]
        elif len(months) == 0:
            # Nothing requested; avoid indexing months[0] below.
            paths = []
        elif isinstance(months[0], list):
            # Chunked layout: each element of months is itself a list.
            paths = [os.path.join(with_kind, make_chunk_name(chunk)) + '.h5'
                     for chunk in months]
        else:
            paths = [os.path.join(with_kind, pre + month + '.h5')
                     for month in months]

        stores = {}
        try:
            for path in paths:
                key, _ = os.path.splitext(os.path.basename(path))
                stores[key] = pd.HDFStore(path)
        except Exception:
            # Do not leak the handles that were opened before the failure.
            for opened in stores.values():
                opened.close()
            raise
        return stores
Example #2
0
    def _select_stores(self):
        """
        Open the HDF5 files backing this handler and return them by name.

        The directory ``<base_path>/<kind>`` is created if needed, then the
        files to open are chosen from ``self.months``:

        * ``None``: every existing ``*.h5`` file in that directory.
        * a sequence whose first element is a ``list``: one file per chunk,
          named ``make_chunk_name(chunk) + '.h5'``.
        * any other non-empty sequence: one file per month, named
          ``self.pre + month + '.h5'``.

        Returns
        -------
        dict
            Maps each file's base name (extension stripped) to the
            ``pd.HDFStore`` opened on it.  Empty when ``self.months`` is an
            empty sequence or when no ``.h5`` files exist yet.
        """
        pre = self.pre
        months = self.months

        with_kind = os.path.join(self.base_path, self.kind)
        # makedirs(exist_ok=True) also creates missing parents and does not
        # fail if another process creates the directory first.
        os.makedirs(with_kind, exist_ok=True)

        if months is None:
            paths = [os.path.join(with_kind, f)
                     for f in os.listdir(with_kind) if f.endswith('.h5')]
        elif len(months) == 0:
            # Nothing requested; avoid indexing months[0] below.
            paths = []
        elif isinstance(months[0], list):
            # Chunked layout: each element of months is itself a list.
            paths = [os.path.join(with_kind, make_chunk_name(chunk)) + '.h5'
                     for chunk in months]
        else:
            paths = [os.path.join(with_kind, pre + month + '.h5')
                     for month in months]

        stores = {}
        try:
            for path in paths:
                key, _ = os.path.splitext(os.path.basename(path))
                stores[key] = pd.HDFStore(path)
        except Exception:
            # Do not leak the handles that were opened before the failure.
            for opened in stores.values():
                opened.close()
            raise
        return stores
Example #3
0
def make_to_long(panel_h, settings, start=None, stop=None):
    """
    Write the panel out in long form, chunked by quarter.

    For each quarter-sized chunk of months the long-form frame is read with
    ``read_to_long``, its weight and earnings columns are rescaled, a
    ``real_hr_earns`` column is added, and the result is written to the
    ``'long'`` store.  Rows with a non-null real wage, positive hours and
    positive earnings are additionally written to the ``'earn'`` store.
    Finally every ``'earn'`` chunk is combined, passed through
    ``quarterize`` and written under the key ``'cleaned'``.

    Parameters
    ----------
    panel_h : handler exposing ``stores``
        The keys of ``panel_h.stores`` decide which months are available;
        the handler itself is passed on to ``read_to_long``.
    settings : ignored
        Kept for interface compatibility.  The value passed in is replaced
        by the JSON contents of ``../panel_construction/settings.txt``.
    start, stop : optional
        First and last month, handed to ``date_parser``.  Each defaults to
        the first / last sorted key of ``panel_h.stores``.

    Returns
    -------
    None
        All results are written to HDF5 files.
    """
    from contextlib import ExitStack

    # NOTE: this deliberately keeps the original behaviour of overwriting
    # the ``settings`` argument with the on-disk settings file.
    with open('../panel_construction/settings.txt', 'rt') as f:
        settings = json.load(f)

    # ExitStack guarantees every store opened below is closed, even when a
    # chunk fails half-way through.
    with ExitStack() as stack:
        analyzed = pd.HDFStore(settings['analyzed_path'])
        stack.callback(analyzed.close)

        # need compensation for real wage; read the table once for both
        # columns.
        bls = analyzed.select('bls_productivity_compensation')
        comp = bls['compensation']
        prod = bls['productivity']

        keys = sorted(panel_h.stores.keys())
        known_keys = set(keys)

        m0 = date_parser(start or keys[0])
        mn = date_parser(stop or keys[-1])

        # Store keys carry an 'm' prefix; the month labels do not.
        months = [
            x.strftime('%Y_%m') for x in arrow.Arrow.range('month', m0, mn)
            if x.strftime('m%Y_%m') in known_keys
        ]

        # Getting some memory pressure: break into chunks, write each one
        # out, then read the processed chunks back.  Chunking by quarter.
        month_chunks = [x for x in chunk_quarters(months, 3) if len(x) > 0]

        base = str(pathlib.Path(str(settings['base_path'])))
        out_store = HDFHandler(base,
                               kind='long',
                               months=month_chunks,
                               frequency='Q')
        stack.callback(out_store.close)
        earn_store = HDFHandler(base,
                                kind='earn',
                                months=month_chunks,
                                frequency='Q')
        stack.callback(earn_store.close)

        for chunk in month_chunks:
            df = read_to_long(panel_h, chunk)
            name = make_chunk_name(chunk)

            # add in real hourly wage
            c = comp.reindex(df.index, level='stamp').ffill() / 100

            # adjust weight decimals
            df.loc[:, 'og_weight'] = df['og_weight'] / 10000

            # CPS reports earnings in cents
            df.loc[:, 'earnings'] = df['earnings'] / 100

            df['real_hr_earns'] = (df['earnings'] / df['hours']) / c
            # division by zero hours yields inf
            df['real_hr_earns'] = df['real_hr_earns'].replace(np.inf, np.nan)

            df = replace_categorical(df, kind='flow', inverse=True)

            # pd.get_store is gone from current pandas; HDFStore is itself a
            # context manager and closes the handle on exit.
            long_path = out_store.stores[name].filename
            with pd.HDFStore(long_path) as store:
                df.to_hdf(store, name, format='table', append=False)

            # Also write out just earnings (nan issues so can't select
            # later); the real hourly wage has to exist first.
            earn = df[~pd.isnull(df.real_hr_earns)]
            earn = earn[(earn.hours > 0) & (earn.earnings > 0)]

            earn_path = earn_store.stores[name].filename
            with pd.HDFStore(earn_path) as store:
                earn.to_hdf(store,
                            name,
                            format='table',
                            append=False,
                            data_columns=True)
            print("Finished " + str(chunk))

        # finally, chunk by quarter and write out.
        df = earn_store.select_all().drop(['occupation', 'actual_hours'],
                                          axis=1)
        df = df.dropna(how='any', subset=['edu', 'age', 'flow', 'expr'])

        df = quarterize(df)

        df['productivity'] = prod.reindex(df.index, level='qmonth')
        df['real_hr_earns'] = df.real_hr_earns.replace(np.inf, np.nan)
        # Wage-index construction (add_demo_dummies / construct_wage_index)
        # is currently disabled.

        # NOTE: machine-specific output location.
        cln_path = '/Volumes/HDD/Users/tom/DataStorage/CPS/analyzed/clean.h5'

        with pd.HDFStore(cln_path) as store:
            df.to_hdf(store, 'cleaned', format='f', append=False)
Example #4
0
def make_to_long(panel_h, settings, start=None, stop=None):
    """
    Write the panel out in long form, chunked by quarter.

    For each quarter-sized chunk of months the long-form frame is read with
    ``read_to_long``, its weight and earnings columns are rescaled, a
    ``real_hr_earns`` column is added, and the result is written to the
    ``'long'`` store.  Rows with a non-null real wage, positive hours and
    positive earnings are additionally written to the ``'earn'`` store.
    Finally every ``'earn'`` chunk is combined, passed through
    ``quarterize`` and written under the key ``'cleaned'``.

    Parameters
    ----------
    panel_h : handler exposing ``stores``
        The keys of ``panel_h.stores`` decide which months are available;
        the handler itself is passed on to ``read_to_long``.
    settings : ignored
        Kept for interface compatibility.  The value passed in is replaced
        by the JSON contents of ``../panel_construction/settings.txt``.
    start, stop : optional
        First and last month, handed to ``date_parser``.  Each defaults to
        the first / last sorted key of ``panel_h.stores``.

    Returns
    -------
    None
        All results are written to HDF5 files.
    """
    from contextlib import ExitStack

    # NOTE: this deliberately keeps the original behaviour of overwriting
    # the ``settings`` argument with the on-disk settings file.
    with open('../panel_construction/settings.txt', 'rt') as f:
        settings = json.load(f)

    # ExitStack guarantees every store opened below is closed, even when a
    # chunk fails half-way through.
    with ExitStack() as stack:
        analyzed = pd.HDFStore(settings['analyzed_path'])
        stack.callback(analyzed.close)

        # need compensation for real wage; read the table once for both
        # columns.
        bls = analyzed.select('bls_productivity_compensation')
        comp = bls['compensation']
        prod = bls['productivity']

        keys = sorted(panel_h.stores.keys())
        known_keys = set(keys)

        m0 = date_parser(start or keys[0])
        mn = date_parser(stop or keys[-1])

        # Store keys carry an 'm' prefix; the month labels do not.
        months = [x.strftime('%Y_%m')
                  for x in arrow.Arrow.range('month', m0, mn)
                  if x.strftime('m%Y_%m') in known_keys]

        # Getting some memory pressure: break into chunks, write each one
        # out, then read the processed chunks back.  Chunking by quarter.
        month_chunks = [x for x in chunk_quarters(months, 3) if len(x) > 0]

        base = str(pathlib.Path(str(settings['base_path'])))
        out_store = HDFHandler(base, kind='long', months=month_chunks,
                               frequency='Q')
        stack.callback(out_store.close)
        earn_store = HDFHandler(base, kind='earn', months=month_chunks,
                                frequency='Q')
        stack.callback(earn_store.close)

        for chunk in month_chunks:
            df = read_to_long(panel_h, chunk)
            name = make_chunk_name(chunk)

            # add in real hourly wage
            c = comp.reindex(df.index, level='stamp').ffill() / 100

            # adjust weight decimals
            df.loc[:, 'og_weight'] = df['og_weight'] / 10000

            # CPS reports earnings in cents
            df.loc[:, 'earnings'] = df['earnings'] / 100

            df['real_hr_earns'] = (df['earnings'] / df['hours']) / c
            # division by zero hours yields inf
            df['real_hr_earns'] = df['real_hr_earns'].replace(np.inf, np.nan)

            df = replace_categorical(df, kind='flow', inverse=True)

            # pd.get_store is gone from current pandas; HDFStore is itself a
            # context manager and closes the handle on exit.
            long_path = out_store.stores[name].filename
            with pd.HDFStore(long_path) as store:
                df.to_hdf(store, name, format='table', append=False)

            # Also write out just earnings (nan issues so can't select
            # later); the real hourly wage has to exist first.
            earn = df[~pd.isnull(df.real_hr_earns)]
            earn = earn[(earn.hours > 0) & (earn.earnings > 0)]

            earn_path = earn_store.stores[name].filename
            with pd.HDFStore(earn_path) as store:
                earn.to_hdf(store, name, format='table', append=False,
                            data_columns=True)
            print("Finished " + str(chunk))

        # finally, chunk by quarter and write out.
        df = earn_store.select_all().drop(['occupation', 'actual_hours'],
                                          axis=1)
        df = df.dropna(how='any', subset=['edu', 'age', 'flow', 'expr'])

        df = quarterize(df)

        df['productivity'] = prod.reindex(df.index, level='qmonth')
        df['real_hr_earns'] = df.real_hr_earns.replace(np.inf, np.nan)
        # Wage-index construction (add_demo_dummies / construct_wage_index)
        # is currently disabled.

        # NOTE: machine-specific output location.
        cln_path = '/Volumes/HDD/Users/tom/DataStorage/CPS/analyzed/clean.h5'

        with pd.HDFStore(cln_path) as store:
            df.to_hdf(store, 'cleaned', format='f', append=False)