Example #1
# Standard-library imports this snippet relies on; `dl`, `par`, and
# `_HERE_` are provided by the surrounding project package.
import logging
from functools import partial
from operator import itemgetter
from pathlib import Path
def download(overwrite_cached=False):
    settings = par.read_settings(str(_HERE_ / 'settings.json'))
    cached_dd = dl.check_cached(settings['dd_path'], kind='dictionary')
    cached_month = dl.check_cached(settings['monthly_path'], kind='data')

    dds = dl.all_monthly_files(kind='dictionary')
    dds = filter(itemgetter(1), dds)  # drop any entries whose renamed file is None
    dds = dl.filter_dds(dds, months=[par._month_to_dd(settings['date_start']),
                                     par._month_to_dd(settings['date_end'])])

    data = dl.all_monthly_files()
    data = dl.filter_monthly_files(data, months=[[settings['date_start'],
                                                  settings['date_end']]])
    if not overwrite_cached:
        def is_new(x, cache=None):
            return dl.rename_cps_monthly(x[1]) not in cache

        dds = filter(partial(is_new, cache=cached_dd), dds)
        data = filter(partial(is_new, cache=cached_month), data)

    for month, renamed in dds:
        dl.download_month(month, Path(settings['dd_path']))
        logging.info("Downloaded {}".format(renamed))

    for month, renamed in data:
        dl.download_month(month, Path(settings['monthly_path']))
        logging.info("Downloaded {}".format(renamed))
Example #2
# Assumed context: `dl` and `par` are project modules; `logger` is a
# module-level logger.
import logging
from operator import itemgetter
from pathlib import Path

logger = logging.getLogger(__name__)
def download(kind, settings, overwrite=False):
    """
    Download files from NBER.

    kind : {'dictionary', 'data'}
    settings : dict
    overwrite : bool, default False
        Whether to overwrite existing files
    """
    s_path = {'dictionary': 'dd_path', 'data': 'monthly_path'}[kind]
    cached = dl.check_cached(settings[s_path], kind=kind)

    files = dl.all_monthly_files(kind=kind)
    if kind == 'dictionary':
        files = filter(itemgetter(1), files)   # drop any entries whose renamed file is None
        months = [par._month_to_dd(settings['date_start']),
                  par._month_to_dd(settings['date_end'])]
    else:
        months = [[settings['date_start'], settings['date_end']]]

    files = dl.filter_monthly(files, months=months, kind=kind)

    def is_new(x, cache=None):
        return dl.rename_cps_monthly(x[1]) not in cache

    for month, renamed in files:
        if is_new((month, renamed), cache=cached) or overwrite:
            dl.download_month(month, Path(settings[s_path]))
            logger.info("Downloaded {}".format(renamed))
        else:
            logger.info("Using cached {}".format(renamed))
Example #3
def test_month_to_dd(self):
    months = ['1989-01', '1989-03', '1989-12',
              '1992-01', '1992-02', '1993-12',
              '1994-01', '1994-02', '1994-03',
              '1994-04', '1994-05', '1995-05',
              '1995-06', '1995-07', '1995-08',
              '1995-09', '1996-01', '1997-12',
              '1998-01', '2000-01', '2002-12',
              '2003-01', '2004-02', '2004-04',
              '2004-05', '2004-06', '2005-07',
              '2005-08', '2005-09', '2005-10',
              '2005-11', '2006-01', '2006-12',
              '2007-01', '2008-09', '2008-12',
              '2009-01', '2009-06', '2009-12',
              '2010-01', '2010-11', '2012-02',
              '2012-05', '2012-07', '2012-12',
              '2013-01', '2013-02', '2013-03']
    dds = ["cpsm1989-01", "cpsm1992-01", "cpsm1994-01", "cpsm1994-04",
           "cpsm1995-06", "cpsm1995-09", "cpsm1998-01", "cpsm2003-01",
           "cpsm2004-05", "cpsm2005-08", "cpsm2005-11", "cpsm2007-01",
           "cpsm2009-01", "cpsm2010-01", "cpsm2012-05", "cpsm2013-01"] * 3
    dds = sorted(dds)
    for month, dd in zip(months, dds):
        result = p._month_to_dd(month)
        self.assertEqual(result, dd)
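The zip pairing works because the test picks three sample months per data dictionary: repeating the 16 dictionary names three times and sorting gives a list that lines up index-by-index with the 48 months. A shortened illustration of the same trick:

months = ['1989-01', '1989-03', '1989-12', '1992-01', '1992-02', '1993-12']
dds = sorted(['cpsm1989-01', 'cpsm1992-01'] * 3)
# dds == ['cpsm1989-01', 'cpsm1989-01', 'cpsm1989-01',
#         'cpsm1992-01', 'cpsm1992-01', 'cpsm1992-01']
for month, dd in zip(months, dds):
    print(month, '->', dd)  # each month maps to the dictionary in force for it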
Example #4
def parse():
    settings_file = str(_HERE_ / 'settings.json')
    settings = par.read_settings(settings_file)

    dd_path = Path(settings['dd_path'])
    dds = [x for x in dd_path.iterdir() if x.suffix in ('.ddf', '.asc')]
    monthly_path = Path(settings['monthly_path'])
    months = [x for x in monthly_path.iterdir() if x.suffix in ('.Z', '.zip')]

    settings['raise_warnings'] = False

    logger.info("Reading Data file")
    with (_HERE_ / 'data.json').open() as f:
        data = json.load(f)

    id_cols = ['HRHHID', 'HRHHID2', 'PULINENO']
    dds = dds + [_HERE_ / Path('cpsm2014-01.ddf')]

    for dd in dds:
        parser = par.DDParser(dd, settings)
        df = parser.run()
        parser.write(df)
        logging.info("Added {} to {}".format(dd, parser.store_path))

    for month in months:
        dd_name = par._month_to_dd(str(month))
        store_path = settings['monthly_store']

        dd = pd.read_hdf(settings['dd_store'], key=dd_name)
        cols = data['columns_by_dd'][dd_name]
        sub_dd = dd[dd.id.isin(cols)]

        if len(cols) != len(sub_dd):
            missing = set(cols) - set(sub_dd.id.values)
            raise ValueError("IDs {} are not in the Data "
                             "Dictionary".format(missing))

        with pd.HDFStore(store_path) as store:
            try:
                cached_cols = store.select(month.stem).columns
                newcols = set(cols) - set(cached_cols) - set(id_cols)
                if len(newcols) == 0:
                    logger.info("Using cached {}".format(month.stem))
                    continue

            except KeyError:
                pass

        # Assuming no new rows
        df = par.read_monthly(str(month), sub_dd)
        df = par.fixup_by_dd(df, dd_name)
        # do special stuff like compute HRHHID2, bin things, etc.
        # TODO: special stuff

        df = df.set_index(id_cols)
        par.write_monthly(df, store_path, month.stem)
        logging.info("Added {} to {}".format(month, settings['monthly_store']))
Example #5
def test_month_to_dd(self):
    months = [
        "1989-01",
        "1989-03",
        "1989-12",
        "1992-01",
        "1992-02",
        "1993-12",
        "1994-01",
        "1994-02",
        "1994-03",
        "1994-04",
        "1994-05",
        "1995-05",
        "1995-06",
        "1995-07",
        "1995-08",
        "1995-09",
        "1996-01",
        "1997-12",
        "1998-01",
        "2000-01",
        "2002-12",
        "2003-01",
        "2004-02",
        "2004-04",
        "2004-05",
        "2004-06",
        "2005-07",
        "2005-08",
        "2005-09",
        "2005-10",
        "2005-11",
        "2006-01",
        "2006-12",
        "2007-01",
        "2008-09",
        "2008-12",
        "2009-01",
        "2009-06",
        "2009-12",
        "2010-01",
        "2010-11",
        "2012-02",
        "2012-05",
        "2012-07",
        "2012-12",
        "2013-01",
        "2013-02",
        "2013-03",
    ]
    dds = [
        "cpsm1989-01",
        "cpsm1992-01",
        "cpsm1994-01",
        "cpsm1994-04",
        "cpsm1995-06",
        "cpsm1995-09",
        "cpsm1998-01",
        "cpsm2003-01",
        "cpsm2004-05",
        "cpsm2005-08",
        "cpsm2005-11",
        "cpsm2007-01",
        "cpsm2009-01",
        "cpsm2010-01",
        "cpsm2012-05",
        "cpsm2013-01",
    ] * 3
    dds = sorted(dds)
    for month, dd in zip(months, dds):
        result = p._month_to_dd(month)
        self.assertEqual(result, dd)
Example #6
def parse(kind, settings, overwrite=False):
    """
    Parse downloaded files, store in HDFStore.

    Parameters
    ----------

    kind : {'dictionary', 'data'}
    settings : dict
    overwrite : bool
    """
    with open(settings['info_path']) as f:
        info = json.load(f)

    s_path = {'dictionary': 'dd_path', 'data': 'monthly_path'}[kind]
    path_ = Path(settings[s_path])

    if not path_.exists():
        path_.mkdir(parents=True)

    suffix_d = {'data': ('.Z', '.zip'), 'dictionary': ('.ddf', '.asc', '.txt')}
    suffixes = suffix_d[kind]

    files = [x for x in path_.iterdir() if x.suffix in suffixes]
    id_cols = ['HRHHID', 'HRHHID2', 'PULINENO']

    with open(settings['info']) as f:
        data = json.load(f)

    if kind == 'dictionary':
        files.append(_HERE_ / Path('cpsm2014-01.ddf'))
        for f in files:
            parser = par.DDParser(f, settings, info)
            df = parser.run()
            parser.write(df)
            logging.info("Added {} to {}".format(f, parser.store_path))
    else:
        for f in files:
            dd_name = par._month_to_dd(str(f))
            store_path = settings['monthly_store']

            dd = pd.read_hdf(settings['dd_store'], key=dd_name)
            cols = data['columns_by_dd'][dd_name]
            sub_dd = dd[dd.id.isin(cols)]

            if len(cols) != len(sub_dd):
                missing = set(cols) - set(sub_dd.id.values)
                raise ValueError("IDs {} are not in the Data "
                                 "Dictionary".format(missing))

            with pd.HDFStore(store_path) as store:
                try:
                    cached_cols = store.select(f.stem).columns
                    newcols = set(cols) - set(cached_cols) - set(id_cols)
                    if len(newcols) == 0:
                        logger.info("Using cached {}".format(f.stem))
                        continue

                except KeyError:
                    pass

            # Assuming no new rows
            df = par.read_monthly(str(f), sub_dd)

            fixups = settings['FIXUP_BY_DD'].get(dd_name)
            logger.info("Applying {} to {}".format(fixups, f.stem))
            df = par.fixup_by_dd(df, fixups)
            # TODO: special stuff

            df = df.set_index(id_cols)
            par.write_monthly(df, store_path, f.stem)
            logging.info("Added {} to {}".format(f, settings['monthly_store']))