Example #1
0
def download(overwrite_cached=False):
    """Download CPS data dictionaries and monthly data files.

    Reads paths and the date range from ``settings.json``, selects the
    dictionary and monthly files that fall inside the range, and downloads
    each one, logging every file as it completes.

    Parameters
    ----------
    overwrite_cached : bool, default False
        When False, files whose renamed form is already cached on disk are
        skipped; when True everything in range is re-downloaded.
    """
    settings = par.read_settings(str(_HERE_ / 'settings.json'))
    cached_dd = dl.check_cached(settings['dd_path'], kind='dictionary')
    cached_month = dl.check_cached(settings['monthly_path'], kind='data')

    dds = dl.all_monthly_files(kind='dictionary')
    dds = filter(itemgetter(1), dds)  # make sure not None cpsdec!
    dds = dl.filter_dds(dds, months=[par._month_to_dd(settings['date_start']),
                                     par._month_to_dd(settings['date_end'])])

    data = dl.all_monthly_files()
    data = dl.filter_monthly_files(data, months=[[settings['date_start'],
                                                  settings['date_end']]])
    if not overwrite_cached:
        def is_new(x, cache=frozenset()):
            # x is a (month, filename) pair; it is "new" when its renamed
            # form is not in the cache. The empty-frozenset default avoids
            # the TypeError that `... not in None` would raise if cache
            # were ever omitted (the old default was None).
            return dl.rename_cps_monthly(x[1]) not in cache

        # Materialize the caches as sets so each membership test is O(1)
        # instead of a linear scan per candidate file.
        dds = filter(partial(is_new, cache=set(cached_dd)), dds)
        data = filter(partial(is_new, cache=set(cached_month)), data)

    for month, renamed in dds:
        dl.download_month(month, Path(settings['dd_path']))
        logging.info("Downloaded {}".format(renamed))

    for month, renamed in data:
        dl.download_month(month, Path(settings['monthly_path']))
        logging.info("Downloaded {}".format(renamed))
Example #2
0
def parse():
    """Parse cached CPS data dictionaries and monthly files into HDF stores.

    Reads paths from ``settings.json``, runs a ``DDParser`` over every data
    dictionary found under ``dd_path`` (plus a bundled 2014-01 dictionary),
    then parses each compressed monthly file under ``monthly_path`` against
    its dictionary and writes the result, indexed by the household/line id
    columns, into the monthly HDF store. A month is skipped when its cached
    store entry already contains every requested column.

    Raises
    ------
    ValueError
        If ``data.json`` requests column IDs that are missing from the
        month's data dictionary.
    """
    settings_file = str(_HERE_ / 'settings.json')
    settings = par.read_settings(settings_file)

    # Data dictionaries are .ddf/.asc; monthly data files are .Z/.zip.
    dd_path = Path(settings['dd_path'])
    dds = [x for x in dd_path.iterdir() if x.suffix in ('.ddf', '.asc')]
    monthly_path = Path(settings['monthly_path'])
    months = [x for x in monthly_path.iterdir() if x.suffix in ('.Z', '.zip')]

    settings['raise_warnings'] = False

    logger.info("Reading Data file")
    with (_HERE_ / 'data.json').open() as f:
        data = json.load(f)

    # Columns that identify a record within a month; used as the index
    # when the parsed frame is written out below.
    id_cols = ['HRHHID', 'HRHHID2', 'PULINENO']
    dds = dds + [_HERE_ / Path('cpsm2014-01.ddf')]

    # Pass 1: parse every data dictionary and persist it.
    for dd in dds:
        parser = par.DDParser(dd, settings)
        df = parser.run()
        parser.write(df)
        logging.info("Added {} to {}".format(dd, parser.store_path))

    # Pass 2: parse each monthly file against its dictionary.
    for month in months:
        dd_name = par._month_to_dd(str(month))
        store_path = settings['monthly_store']

        # Restrict the dictionary to the columns requested for this month.
        dd = pd.read_hdf(settings['dd_store'], key=dd_name)
        cols = data['columns_by_dd'][dd_name]
        sub_dd = dd[dd.id.isin(cols)]

        if len(cols) != len(sub_dd):
            missing = set(cols) - set(sub_dd.id.values)
            raise ValueError("IDs {} are not in the Data "
                             "Dictionary".format(missing))

        # Skip months whose cached store entry already covers every
        # requested column; KeyError means the month was never stored.
        with pd.get_store(store_path) as store:
            try:
                cached_cols = store.select(month.stem).columns
                newcols = set(cols) - set(cached_cols) - set(id_cols)
                if len(newcols) == 0:
                    logger.info("Using cached {}".format(month.stem))
                    continue

            except KeyError:
                pass

        # Assuming no new rows
        df = par.read_monthly(str(month), sub_dd)
        df = par.fixup_by_dd(df, dd_name)
        # do special stuff like compute HRHHID2, bin things, etc.
        # TODO: special stuff

        df = df.set_index(id_cols)
        par.write_monthly(df, store_path, month.stem)
        logging.info("Added {} to {}".format(month, settings['monthly_store']))
Example #3
0
def main(config):
    """Run the CPS download/parse/merge pipeline described by *config*.

    Parameters
    ----------
    config : argparse.Namespace-like
        Must expose ``settings``, ``overwrite``, ``info``,
        ``monthly_data_fixups``, ``append_fixups`` and the boolean
        stage switches (``download_dictionaries``, ``download_monthly``,
        ``parse_dictionaries``, ``parse_monthly``, ``merge``).
    """
    settings = par.read_settings(config.settings)
    overwrite = config.overwrite

    # Overwrite default info file?
    if config.info is not None:
        settings['info'] = config.info

    if config.monthly_data_fixups:
        import importlib
        # NOTE: str.strip('.py') removes *characters*, not a suffix, and
        # would mangle module names ending in 'p', 'y', or '.'; drop the
        # '.py' suffix explicitly instead.
        fixup_file = config.monthly_data_fixups
        if fixup_file.endswith('.py'):
            fixup_file = fixup_file[:-3]
        user_fixups = importlib.import_module(fixup_file).FIXUP_BY_DD

        if config.append_fixups:
            # merge the user supplied with the defaults.
            from pycps.monthly_data_fixups import FIXUP_BY_DD

            for dd in FIXUP_BY_DD:
                new = user_fixups.get(dd)

                if new is not None:
                    for x in new:
                        FIXUP_BY_DD[dd].append(x)

        else:
            # user_fixups is already the FIXUP_BY_DD mapping extracted
            # above; re-reading .FIXUP_BY_DD on it raised AttributeError.
            FIXUP_BY_DD = user_fixups
    else:
        from pycps.monthly_data_fixups import FIXUP_BY_DD

    # Fixups will be passed and accessed via settings
    settings['FIXUP_BY_DD'] = FIXUP_BY_DD

    if config.download_dictionaries:
        download('dictionary', settings, overwrite=overwrite)
    if config.download_monthly:
        download('data', settings, overwrite=overwrite)

    if config.parse_dictionaries:
        parse('dictionary', settings, overwrite=overwrite)
    if config.parse_monthly:
        parse('data', settings, overwrite=overwrite)

    if config.merge:
        merge(settings, overwrite=overwrite)
Example #4
0
 def test_substitue(self):
     """The dd_path entry of the parsed settings file is substituted."""
     settings = p.read_settings(self.settings_file)
     self.assertEqual(settings["dd_path"], "data/data_dictionaries/")
Example #5
0
 def test_read_setting(self):
     """read_settings returns the data_path value from the settings file."""
     settings = p.read_settings(self.settings_file)
     self.assertEqual(settings["data_path"], "data/")
Example #6
0
    def setUp(self):
        """Load the project settings used by each test case."""
        settings_path = mdir + "/pycps/settings.json"
        self.settings = par.read_settings(settings_path)
Example #7
0
 def setUp(self):
     """Build a DDParser around the 2007-01 dictionary fixture."""
     self.testfile = Path('files/cpsm2007-01.ddf')
     settings = p.read_settings(mdir + '/pycps/settings.json')
     info_path = mdir + '/pycps/info.json'
     with open(info_path) as fh:
         info = json.load(fh)
     self.parser = p.DDParser(self.testfile, settings, info)
Example #8
0
 def test_substitue(self):
     """The dd_path entry of the parsed settings file is substituted."""
     settings = p.read_settings(self.settings_file)
     self.assertEqual(settings['dd_path'], 'data/data_dictionaries/')
Example #9
0
 def test_read_setting(self):
     """read_settings returns the data_path value from the settings file."""
     settings = p.read_settings(self.settings_file)
     self.assertEqual(settings['data_path'], 'data/')