def parse():
    """Parse CPS data dictionaries and monthly data files into HDFStores.

    Reads paths from ``settings.json``, runs every data-dictionary file
    through ``par.DDParser`` and writes the result, then reads each
    compressed monthly file, subsets it to the columns listed in
    ``data.json``, and appends it to the monthly HDFStore.  Months whose
    required columns are already cached in the store are skipped.

    Raises
    ------
    ValueError
        If ``data.json`` requests column IDs that are missing from the
        month's data dictionary.
    """
    settings_file = str(_HERE_ / 'settings.json')
    settings = par.read_settings(settings_file)

    dd_path = Path(settings['dd_path'])
    dds = [x for x in dd_path.iterdir() if x.suffix in ('.ddf', '.asc')]
    monthly_path = Path(settings['monthly_path'])
    months = [x for x in monthly_path.iterdir() if x.suffix in ('.Z', '.zip')]

    settings['raise_warnings'] = False

    logger.info("Reading Data file")
    with (_HERE_ / 'data.json').open() as f:
        data = json.load(f)

    id_cols = ['HRHHID', 'HRHHID2', 'PULINENO']

    # Bundled fallback dictionary shipped alongside this module.
    dds = dds + [_HERE_ / Path('cpsm2014-01.ddf')]

    for dd in dds:
        parser = par.DDParser(dd, settings)
        df = parser.run()
        parser.write(df)
        # fix: use the module-level ``logger`` (was root ``logging``),
        # consistent with the other log calls in this function
        logger.info("Added {} to {}".format(dd, parser.store_path))

    for month in months:
        dd_name = par._month_to_dd(str(month))
        store_path = settings['monthly_store']

        dd = pd.read_hdf(settings['dd_store'], key=dd_name)
        cols = data['columns_by_dd'][dd_name]
        sub_dd = dd[dd.id.isin(cols)]

        if len(cols) != len(sub_dd):
            missing = set(cols) - set(sub_dd.id.values)
            raise ValueError("IDs {} are not in the Data "
                             "Dictionary".format(missing))

        # fix: ``pd.get_store`` is deprecated and removed in modern pandas;
        # ``pd.HDFStore`` is the supported context-manager equivalent.
        with pd.HDFStore(store_path) as store:
            try:
                cached_cols = store.select(month.stem).columns
                newcols = set(cols) - set(cached_cols) - set(id_cols)
                if len(newcols) == 0:
                    logger.info("Using cached {}".format(month.stem))
                    continue
            except KeyError:
                # month not in the store yet -- fall through and parse it
                pass

        # Assuming no new rows
        df = par.read_monthly(str(month), sub_dd)
        df = par.fixup_by_dd(df, dd_name)
        # do special stuff like compute HRHHID2, bin things, etc.
        # TODO: special stuff
        df = df.set_index(id_cols)
        par.write_monthly(df, store_path, month.stem)
        # fix: ``logger`` instead of root ``logging`` (see above)
        logger.info("Added {} to {}".format(month, settings['monthly_store']))
def test_read_monthly(self):
    """read_monthly parses the fixture file into the expected DataFrame.

    fix: deleted the commented-out stale ``expected`` frame (string-typed
    HRHHID values) that had been superseded by the integer version below.
    """
    result = p.read_monthly(self.infile, self.dd)
    expected = pd.DataFrame(
        [[0, 11, 1999],
         [1, 12, 1999],
         [2, 1, 2000],
         [3, 2, 2000]],
        columns=["HRHHID", "HRMONTH", "HRYEAR4"],
    )
    tm.assert_frame_equal(result, expected)
def test_read_monthly(self):
    """read_monthly parses the fixture file into the expected DataFrame.

    fix: deleted the commented-out stale ``expected`` frame that had been
    superseded by the integer version below.

    NOTE(review): this appears to duplicate an identically named test
    elsewhere in the file -- if both live in the same class, the second
    definition silently shadows the first; confirm and remove one.
    """
    result = p.read_monthly(self.infile, self.dd)
    expected = pd.DataFrame(
        [[0, 11, 1999],
         [1, 12, 1999],
         [2, 1, 2000],
         [3, 2, 2000]],
        columns=['HRHHID', 'HRMONTH', 'HRYEAR4'],
    )
    tm.assert_frame_equal(result, expected)
def parse(kind, settings, overwrite=False):
    """
    Parse downloaded files, store in HDFStore.

    Parameters
    ----------
    kind : {'dictionary', 'data'}
        'dictionary' parses data-dictionary files via ``par.DDParser``;
        'data' parses compressed monthly data files into the monthly store.
    settings : dict
        Must provide 'info_path', 'info', 'dd_path'/'monthly_path',
        'dd_store', 'monthly_store', and 'FIXUP_BY_DD'.
    overwrite : bool
        NOTE(review): currently unused -- cached months are always skipped.
        Kept for backward compatibility; wire it up or remove it.

    Raises
    ------
    ValueError
        If requested column IDs are missing from a month's data dictionary.
    """
    with open(settings['info_path']) as f:
        info = json.load(f)

    s_path = {'dictionary': 'dd_path', 'data': 'monthly_path'}[kind]
    path_ = Path(settings[s_path])
    if not path_.exists():
        path_.mkdir(parents=True)

    suffix_d = {'data': ('.Z', '.zip'),
                'dictionary': ('.ddf', '.asc', '.txt')}
    suffixes = suffix_d[kind]

    files = [x for x in path_.iterdir() if x.suffix in suffixes]
    id_cols = ['HRHHID', 'HRHHID2', 'PULINENO']

    # NOTE(review): 'info' vs 'info_path' above -- confirm these really are
    # two distinct settings keys pointing at two distinct JSON files.
    with open(settings['info']) as f:
        data = json.load(f)

    if kind == 'dictionary':
        # Bundled fallback dictionary shipped alongside this module.
        files.append(_HERE_ / Path('cpsm2014-01.ddf'))
        # fix: loop variable renamed from ``f`` -- it shadowed the file
        # handle names used with ``open`` earlier in this function.
        for fpath in files:
            parser = par.DDParser(fpath, settings, info)
            df = parser.run()
            parser.write(df)
            # fix: module-level ``logger`` (was root ``logging``)
            logger.info("Added {} to {}".format(fpath, parser.store_path))
    else:
        for fpath in files:
            dd_name = par._month_to_dd(str(fpath))
            store_path = settings['monthly_store']

            dd = pd.read_hdf(settings['dd_store'], key=dd_name)
            cols = data['columns_by_dd'][dd_name]
            sub_dd = dd[dd.id.isin(cols)]

            if len(cols) != len(sub_dd):
                missing = set(cols) - set(sub_dd.id.values)
                raise ValueError("IDs {} are not in the Data "
                                 "Dictionary".format(missing))

            # fix: ``pd.get_store`` is deprecated/removed in modern pandas;
            # use the ``pd.HDFStore`` context manager directly.
            with pd.HDFStore(store_path) as store:
                try:
                    cached_cols = store.select(fpath.stem).columns
                    newcols = set(cols) - set(cached_cols) - set(id_cols)
                    if len(newcols) == 0:
                        logger.info("Using cached {}".format(fpath.stem))
                        continue
                except KeyError:
                    # month not in the store yet -- fall through and parse it
                    pass

            # Assuming no new rows
            df = par.read_monthly(str(fpath), sub_dd)

            fixups = settings['FIXUP_BY_DD'].get(dd_name)
            logger.info("Applying {} to {}".format(fixups, fpath.stem))
            df = par.fixup_by_dd(df, fixups)
            # TODO: special stuff
            df = df.set_index(id_cols)
            par.write_monthly(df, store_path, fpath.stem)
            # fix: module-level ``logger`` (was root ``logging``)
            logger.info("Added {} to {}".format(fpath,
                                                settings['monthly_store']))