def _select_stores(self):
    """
    Open one ``pd.HDFStore`` per ``.h5`` file belonging to this handler.

    The files live in ``<self.base_path>/<self.kind>``; that directory (and
    any missing parent) is created if it does not exist yet.

    Which files are opened depends on ``self.months``:

    * ``None`` -- every ``*.h5`` file currently in the directory.
    * a sequence whose first element is a ``list`` -- one file per chunk,
      named ``make_chunk_name(chunk) + '.h5'``.
    * any other non-empty sequence -- one file per month, named
      ``self.pre + month + '.h5'``.
    * an empty sequence -- no files.

    Returns
    -------
    dict
        Maps each file's base name without the ``.h5`` extension to its
        open ``pd.HDFStore``.
    """
    pre = self.pre
    months = self.months

    with_kind = os.path.join(self.base_path, self.kind)
    # makedirs builds base_path and the kind directory in one call and does
    # not fail if another process created them in the meantime.
    os.makedirs(with_kind, exist_ok=True)

    if months is None:
        base_names = [os.path.join(with_kind, f)
                      for f in os.listdir(with_kind) if f.endswith('.h5')]
    elif len(months) == 0:
        # Nothing requested; avoid indexing months[0] below.
        base_names = []
    elif isinstance(months[0], list):
        base_names = [os.path.join(with_kind, make_chunk_name(chunk)) + '.h5'
                      for chunk in months]
    else:
        base_names = [os.path.join(with_kind, pre + month + '.h5')
                      for month in months]

    stores = {}
    try:
        for name in base_names:
            k, _ = os.path.splitext(os.path.basename(name))
            stores[k] = pd.HDFStore(name)
    except Exception:
        # Do not leak the handles opened before the failure.
        for opened in stores.values():
            opened.close()
        raise
    return stores
def make_to_long(panel_h, settings, start=None, stop=None, clean_path=None):
    """
    Build the quarterly 'long' and 'earn' HDF5 stores and the cleaned file.

    The months present in ``panel_h.stores`` between ``start`` and ``stop``
    are grouped with ``chunk_quarters(months, 3)``. Each chunk is read with
    ``read_to_long``, given a ``real_hr_earns`` column, and written to the
    'long' store; the rows with usable earnings are also written to the
    'earn' store. Finally everything in the 'earn' store is combined,
    passed through ``quarterize``, given a ``productivity`` column and
    written under the key ``'cleaned'`` in ``clean_path``.

    NOTE: this file contains another ``def make_to_long`` further down;
    being defined later, that one overrides this one at import time.

    Parameters
    ----------
    panel_h : object
        Must expose ``stores`` (a mapping whose keys are compared against
        ``'m%Y_%m'`` strings) and be accepted by ``read_to_long``.
    settings : dict or None
        Needs the keys ``'analyzed_path'`` and ``'base_path'``. When empty
        or ``None`` it is loaded from
        ``'../panel_construction/settings.txt'``.
    start, stop : optional
        Passed to ``date_parser``; default to the first / last sorted key
        of ``panel_h.stores``.
    clean_path : str, optional
        Destination HDF5 file for the cleaned frame. Defaults to the
        original hard-coded location.
    """
    if not settings:
        # Fall back to the project settings file when nothing usable was
        # passed in (previously the argument was always overwritten).
        with open('../panel_construction/settings.txt', 'rt') as f:
            settings = json.load(f)
    if clean_path is None:
        clean_path = '/Volumes/HDD/Users/tom/DataStorage/CPS/analyzed/clean.h5'

    analyzed = pd.HDFStore(settings['analyzed_path'])
    out_store = None
    earn_store = None
    try:
        # Need compensation for the real wage; read the table only once.
        bls = analyzed.select('bls_productivity_compensation')
        comp = bls['compensation']
        prod = bls['productivity']

        keys = sorted(panel_h.stores.keys())
        m0 = date_parser(start or keys[0])
        mn = date_parser(stop or keys[-1])
        available = set(keys)
        months = [x.strftime('%Y_%m')
                  for x in arrow.Arrow.range('month', m0, mn)
                  if x.strftime('m%Y_%m') in available]

        # Memory pressure: break into quarterly chunks, write each one out,
        # then read the processed chunks back at the end.
        month_chunks = chunk_quarters(months, 3)
        month_chunks = [x for x in month_chunks if len(x) > 0]

        base = str(pathlib.Path(str(settings['base_path'])))
        out_store = HDFHandler(base, kind='long', months=month_chunks,
                               frequency='Q')
        earn_store = HDFHandler(base, kind='earn', months=month_chunks,
                                frequency='Q')

        for chunk in month_chunks:
            df = read_to_long(panel_h, chunk)
            name = make_chunk_name(chunk)
            long_file = out_store.stores[name].filename

            # add in real hourly wage
            c = comp.reindex(df.index, level='stamp').ffill() / 100
            # adjust weight decimals
            df.loc[:, 'og_weight'] = df['og_weight'] / 10000
            # CPS reports earnings in cents
            df.loc[:, 'earnings'] = df['earnings'] / 100
            df['real_hr_earns'] = (df['earnings'] / df['hours']) / c
            # division by zero hours yields inf; treat it as missing
            df['real_hr_earns'] = df['real_hr_earns'].replace(np.inf, np.nan)

            df = replace_categorical(df, kind='flow', inverse=True)
            # pd.get_store was removed from pandas; HDFStore is itself a
            # context manager and closes the handle on exit.
            with pd.HDFStore(long_file) as store:
                df.to_hdf(store, name, format='table', append=False)

            # Also write out just the earnings rows (NaN issues prevent
            # selecting them later); real_hr_earns must exist first.
            earn = df[pd.notnull(df.real_hr_earns)]
            earn = earn[(earn.hours > 0) & (earn.earnings > 0)]
            earn_file = earn_store.stores[name].filename
            with pd.HDFStore(earn_file) as store:
                earn.to_hdf(store, name, format='table', append=False,
                            data_columns=True)
            print("Finished " + str(chunk))

        # Finally, combine the earnings chunks by quarter and write out.
        df = earn_store.select_all().drop(['occupation', 'actual_hours'],
                                          axis=1)
        df = df.dropna(how='any', subset=['edu', 'age', 'flow', 'expr'])
        df = quarterize(df)
        df['productivity'] = prod.reindex(df.index, level='qmonth')
        df['real_hr_earns'] = df.real_hr_earns.replace(np.inf, np.nan)

        with pd.HDFStore(clean_path) as store:
            df.to_hdf(store, 'cleaned', format='f', append=False)
    finally:
        # Close every handle that was opened, even if a step above failed.
        for handle in (out_store, analyzed, earn_store):
            if handle is not None:
                handle.close()
def make_to_long(panel_h, settings, start=None, stop=None, clean_path=None):
    """
    Build the quarterly 'long' and 'earn' HDF5 stores and the cleaned file.

    The months present in ``panel_h.stores`` between ``start`` and ``stop``
    are grouped with ``chunk_quarters(months, 3)``. Each chunk is read with
    ``read_to_long``, given a ``real_hr_earns`` column, and written to the
    'long' store; the rows with usable earnings are also written to the
    'earn' store. Finally everything in the 'earn' store is combined,
    passed through ``quarterize``, given a ``productivity`` column and
    written under the key ``'cleaned'`` in ``clean_path``.

    NOTE: an earlier ``def make_to_long`` exists in this file; this later
    definition is the one that remains bound after import.

    Parameters
    ----------
    panel_h : object
        Must expose ``stores`` (a mapping whose keys are compared against
        ``'m%Y_%m'`` strings) and be accepted by ``read_to_long``.
    settings : dict or None
        Needs the keys ``'analyzed_path'`` and ``'base_path'``. When empty
        or ``None`` it is loaded from
        ``'../panel_construction/settings.txt'``.
    start, stop : optional
        Passed to ``date_parser``; default to the first / last sorted key
        of ``panel_h.stores``.
    clean_path : str, optional
        Destination HDF5 file for the cleaned frame. Defaults to the
        original hard-coded location.
    """
    if not settings:
        # Fall back to the project settings file when nothing usable was
        # passed in (previously the argument was always overwritten).
        with open('../panel_construction/settings.txt', 'rt') as f:
            settings = json.load(f)
    if clean_path is None:
        clean_path = '/Volumes/HDD/Users/tom/DataStorage/CPS/analyzed/clean.h5'

    analyzed = pd.HDFStore(settings['analyzed_path'])
    out_store = None
    earn_store = None
    try:
        # Need compensation for the real wage; read the table only once.
        bls = analyzed.select('bls_productivity_compensation')
        comp = bls['compensation']
        prod = bls['productivity']

        keys = sorted(panel_h.stores.keys())
        m0 = date_parser(start or keys[0])
        mn = date_parser(stop or keys[-1])
        available = set(keys)
        months = [x.strftime('%Y_%m')
                  for x in arrow.Arrow.range('month', m0, mn)
                  if x.strftime('m%Y_%m') in available]

        # Memory pressure: break into quarterly chunks, write each one out,
        # then read the processed chunks back at the end.
        month_chunks = chunk_quarters(months, 3)
        month_chunks = [x for x in month_chunks if len(x) > 0]

        base = str(pathlib.Path(str(settings['base_path'])))
        out_store = HDFHandler(base, kind='long', months=month_chunks,
                               frequency='Q')
        earn_store = HDFHandler(base, kind='earn', months=month_chunks,
                                frequency='Q')

        for chunk in month_chunks:
            df = read_to_long(panel_h, chunk)
            name = make_chunk_name(chunk)
            long_file = out_store.stores[name].filename

            # add in real hourly wage
            c = comp.reindex(df.index, level='stamp').ffill() / 100
            # adjust weight decimals
            df.loc[:, 'og_weight'] = df['og_weight'] / 10000
            # CPS reports earnings in cents
            df.loc[:, 'earnings'] = df['earnings'] / 100
            df['real_hr_earns'] = (df['earnings'] / df['hours']) / c
            # division by zero hours yields inf; treat it as missing
            df['real_hr_earns'] = df['real_hr_earns'].replace(np.inf, np.nan)

            df = replace_categorical(df, kind='flow', inverse=True)
            # pd.get_store was removed from pandas; HDFStore is itself a
            # context manager and closes the handle on exit.
            with pd.HDFStore(long_file) as store:
                df.to_hdf(store, name, format='table', append=False)

            # Also write out just the earnings rows (NaN issues prevent
            # selecting them later); real_hr_earns must exist first.
            earn = df[pd.notnull(df.real_hr_earns)]
            earn = earn[(earn.hours > 0) & (earn.earnings > 0)]
            earn_file = earn_store.stores[name].filename
            with pd.HDFStore(earn_file) as store:
                earn.to_hdf(store, name, format='table', append=False,
                            data_columns=True)
            print("Finished " + str(chunk))

        # Finally, combine the earnings chunks by quarter and write out.
        df = earn_store.select_all().drop(['occupation', 'actual_hours'],
                                          axis=1)
        df = df.dropna(how='any', subset=['edu', 'age', 'flow', 'expr'])
        df = quarterize(df)
        df['productivity'] = prod.reindex(df.index, level='qmonth')
        df['real_hr_earns'] = df.real_hr_earns.replace(np.inf, np.nan)

        with pd.HDFStore(clean_path) as store:
            df.to_hdf(store, 'cleaned', format='f', append=False)
    finally:
        # Close every handle that was opened, even if a step above failed.
        for handle in (out_store, analyzed, earn_store):
            if handle is not None:
                handle.close()