Ejemplo n.º 1
0
def parse_format_assignments(txt):
    """Map variable names to format names from ``format`` statements.

    ``txt`` is split into statements on ``;``. Only statements whose stripped,
    lower-cased text starts with ``format`` are used. Each such statement is
    lower-cased and split on ``.`` into clauses of the form
    ``var1 var2 ... fmtname``; every variable in a clause is mapped to the
    clause's last token.

    Unlike the previous implementation, the leading ``format`` keyword and the
    format name itself are no longer emitted as (bogus) variable keys.

    Args:
        txt: text containing ``;``-terminated statements.

    Returns:
        dict: lower-cased variable name -> lower-cased format name. When a
        variable is assigned more than once the last assignment wins.
    """
    assignments = {}
    for stmt in txt.split(';'):
        if not stmt.strip().lower().startswith('format'):
            continue
        for idx, clause in enumerate(stmt.lower().split('.')):
            tokens = clause.split()
            # the statement keyword only ever leads the first clause
            if idx == 0 and tokens and tokens[0] == 'format':
                tokens = tokens[1:]
            # need at least one variable plus the format name
            if len(tokens) < 2:
                continue
            fmt = tokens[-1]
            for var in tokens[:-1]:
                assignments[var] = fmt
    return assignments
Ejemplo n.º 2
0
def parse_questions(txt):
    """Extract ``variable = "question text"`` pairs from ``label`` statements.

    ``txt`` is split into statements on ``;``. Only statements whose stripped,
    lower-cased text starts with ``label`` are used. Each such statement is
    lower-cased and read line by line; a line contributes an entry only when
    it contains ``=``. Lines without ``=`` (the ``label`` keyword line, blank
    lines) are skipped instead of raising ``IndexError``, and the line is
    split on the first ``=`` only so a label containing ``=`` is not truncated.

    Args:
        txt: text containing ``;``-terminated statements.

    Returns:
        dict: variable name -> label text with quote characters removed.
        Both are lower-cased because the whole statement is lower-cased
        before it is split.
    """
    rqt = re.compile(r'[\"\']')  # match quote chars
    questions = {}
    for stmt in txt.split(';'):
        if not stmt.strip().lower().startswith('label'):
            continue
        for line in stmt.lower().split('\n'):
            var, sep, label = line.partition('=')
            if not sep:
                # not an assignment line
                continue
            questions[var.strip()] = rqt.sub('', label.strip())
    return questions
Ejemplo n.º 3
0
def varlabels2df(vlbls, yr=None):
    """Flatten a ``{var: {code: label}}`` mapping into an indexed DataFrame.

    The previous implementation mapped a two-argument lambda over
    ``vlbls.items()``; each item arrives as a single tuple, so it raised
    ``TypeError`` for any non-empty input. The pairs are now unpacked
    explicitly.

    Args:
        vlbls: mapping of variable name -> mapping of code -> label.
        yr: optional year. When truthy, a constant ``year`` column is added
            and included in the index.

    Returns:
        pandas.DataFrame: a single ``label`` column indexed by
        ``['var', 'year', 'code']`` when ``yr`` is truthy, otherwise by
        ``['var', 'code']``. An empty ``vlbls`` yields an empty frame with
        the same index levels.
    """
    frames = [
        pd.DataFrame({'code': list(codes.keys()),
                      'label': list(codes.values()),
                      'var': var})
        for var, codes in vlbls.items()
    ]
    if not frames:
        # pd.concat rejects an empty list; fall back to an empty frame
        frames = [pd.DataFrame(columns=['code', 'label', 'var'])]
    df = pd.concat(frames)
    if yr:
        return df.assign(year=yr).set_index(['var', 'year', 'code'])
    return df.set_index(['var', 'code'])
Ejemplo n.º 4
0
def parse_variable_labels(txt, repl, lbls_to_lower=True):
    """Collect the code -> label table of every ``value`` statement in ``txt``.

    ``txt`` is split into statements on ``;``. A statement is used when its
    stripped, lower-cased text starts with ``value``. The second
    whitespace-separated token of the statement's first line (lower-cased) is
    the table name; the remaining lines are handed to ``block2dict``.

    Args:
        txt: text containing ``;``-terminated statements.
        repl: label replacement mapping, forwarded to ``block2dict``.
        lbls_to_lower: forwarded to ``block2dict`` as ``to_lower``.

    Returns:
        dict: table name -> result of ``block2dict`` for that statement.

    Raises:
        IndexError: if the first line of a ``value`` statement has fewer
            than two tokens.
    """
    labels = {}
    for stmt in txt.split(';'):
        body = stmt.strip()
        if not body.lower().startswith('value'):
            continue
        rows = body.split('\n')
        header, entries = rows[0], rows[1:]
        fmt_name = header.split()[1].lower()
        labels[fmt_name] = block2dict(entries, repl=repl,
                                      to_lower=lbls_to_lower)
    logger.info('parsed varlabels from format txt',
                nlabeled=len(labels), nrepl=len(repl.keys()))
    return labels
Ejemplo n.º 5
0
def get_metadata_socrata_denovo(soc_cfg):
    """Download question metadata from the configured SODA endpoints.

    Builds one query URL per entry of ``soc_cfg.soda_api``, loads each one
    through ``dl.df_from_socrata_url``, concatenates the results and derives
    two de-duplicated tables from the combined frame.

    Args:
        soc_cfg: config object; the live code reads its ``mapcols``,
            ``qn_meta``, ``computed`` and ``soda_api`` attributes.

    Returns:
        tuple: ``(qns, facs)``. ``qns`` holds the distinct
        qid/year/topic/subtopic/question/response rows. ``facs`` holds the
        distinct facet/facet_level pairs plus one row per distinct year and
        one per distinct sitecode.

    Raises:
        KeyError: if no entry of ``soc_cfg.mapcols`` has the value ``'qid'``.
    """
    g = soc_cfg
    # invert mapcols so lookups go from its values back to its keys
    revmap = {v: k for k, v in g.mapcols.items()}
    url = '{api_url}?' + \
          '$select={cols}' + \
          '&$order={ocols}'
    # qn_meta entries that are not in g.computed; going through a set drops
    # duplicates and leaves the resulting column order unspecified
    meta_diff = set(g.qn_meta).difference(g.computed)
    meta_diff = list(meta_diff)
    # translate each name through revmap when it has an entry, else keep it
    qncols = ','.join([(revmap[k] if
                        k in revmap else k) for
                       k in meta_diff])

    # order by the column that maps to 'qid', then by year
    ocols = ','.join([revmap['qid'], 'year'])

    logger.info('loading SODA meta data')
    # one URL per endpoint -> one frame per URL -> single concatenated frame
    res = thread_last(
        g.soda_api,
        map(lambda x: url.format(api_url=x, cols=qncols, ocols=ocols)),
        map(dl.df_from_socrata_url),
        curry(pd.concat)(ignore_index=True))
    # NOTE(review): the string literal below is an inert expression statement
    # holding disabled pipeline stages (cleanup, column renaming via
    # g.mapcols, value mapping, column selection); none of it runs.
    '''
        lambda xf: xf.applymap(lambda x: (re.sub('\xa0', '', x)).strip()),
        lambda xf: xf.rename(index=str, columns={x: x.lower() for x in
                                                 xf.columns}),
        lambda xf: xf if not g.mapcols else xf.rename(index=str,
                                                      columns=g.mapcols),
        curry(apply_fn2vals)(fns=g.apply_fn),
        lambda xf: xf if not g.mapvals else xf.replace(g.mapvals),
        lambda xf: xf if not g.mapvals else 
            xf.applymap(lambda x: g.mapvals[x.lower().strip()] if 
                        x.lower().strip() in g.mapvals else x),
        lambda xf: xf[g.qn_meta])
    '''
    logger.info('finished transformations', res=res.head())
    # pull out question -> response breakouts
    # NOTE(review): these column names ('qid', 'topic', ...) are used although
    # the g.mapcols rename above is disabled — presumably
    # dl.df_from_socrata_url already returns them; TODO confirm
    qns = res[['qid', 'year', 'topic',  
              'subtopic', 'question', 'response']].drop_duplicates().reset_index(drop=True)
    # since facets are questions as well
    # update the dict with response value from fc_res
    # overriding the original var (N.B.)
    # NOTE(review): the three comment lines above do not describe the code
    # that follows (no dict or fc_res is visible here) — likely stale
    # distinct years expressed as rows of facet 'year'
    yrvec = (res[['year']]
             .drop_duplicates()
             .assign(facet='year')
             .rename(index=str, columns={'year': 'facet_level'}))
    # distinct sitecodes expressed as rows of facet 'sitecode'
    stvec = (res[['sitecode']]
             .drop_duplicates()
             .assign(facet='sitecode')
             .rename(index=str, columns={'sitecode':'facet_level'}))
    # stack the existing facet pairs with the year and sitecode pseudo-facets
    facs = pd.concat( [res[['facet', 'facet_level']].drop_duplicates(),
                       yrvec, stvec], axis=0).reset_index(drop=True)
    logger.info('created qn and facs', qn=qns.head(), fac=facs.head())
    return (qns, facs)
Ejemplo n.º 6
0
def get_qids_by_year(soc_cfg):
    """Write per-year row counts for each question id to stdout as CSV.

    For every endpoint in ``soc_cfg.soda_api`` a grouped query
    (year x question-id column, with ``count(year)``) is loaded through
    ``dl.df_from_socrata_url``; the frames are concatenated with a fresh
    index and dumped with ``DataFrame.to_csv(sys.stdout)``.

    Args:
        soc_cfg: config object providing ``mapcols`` and ``soda_api``.

    Returns:
        None.

    Raises:
        KeyError: if no entry of ``soc_cfg.mapcols`` has the value ``'qid'``.
    """
    # reverse lookup: mapcols value -> mapcols key
    source_col = {mapped: raw for raw, mapped in soc_cfg.mapcols.items()}
    qn_col = source_col['qid']
    template = ('{api_url}?'
                '$select=year,{qnkey},count(year)'
                '&$group=year,{qnkey}'
                '&$order={qnkey},year')
    frames = [
        dl.df_from_socrata_url(template.format(api_url=endpoint, qnkey=qn_col))
        for endpoint in soc_cfg.soda_api
    ]
    counts = pd.concat(frames, ignore_index=True)
    counts.to_csv(sys.stdout)
Ejemplo n.º 7
0
def _fetch_text(url):
    """Fetch ``url`` via ``dl.fetch_data_from_url`` and return it as ``str``."""
    raw = dl.fetch_data_from_url(url).read()
    if isinstance(raw, bytes):
        # undecodable bytes are dropped rather than raising
        return raw.decode('utf-8', errors='ignore')
    return raw


def load_variable_labels(format_f, formas_f, repl, year=None):
    """Load value labels and attach them to the variables that use them.

    Reads the label definitions from ``format_f`` (parsed with
    ``parse_variable_labels``) and the variable -> format assignments from
    ``formas_f`` (parsed with ``parse_format_assignments``), then joins them.

    Args:
        format_f: location of the label definitions, passed to
            ``dl.fetch_data_from_url``.
        formas_f: location of the format assignments, passed to
            ``dl.fetch_data_from_url``.
        repl: label replacement mapping forwarded to
            ``parse_variable_labels``.
        year: accepted for interface compatibility; not used here.

    Returns:
        dict: variable name -> label table, for every assignment whose
        format name has a parsed label table. Other assignments are dropped.
    """
    logger.info("loading format labels", file=format_f)
    labels = parse_variable_labels(_fetch_text(format_f), repl=repl)
    logger.info("loaded format labels", lbls=labels)
    logger.info("loading format assignments", file=formas_f)
    assignments = parse_format_assignments(_fetch_text(formas_f))
    logger.info("loaded format assns", ass=assignments)
    return {k: labels[v] for k, v in assignments.items() if v in labels}
Ejemplo n.º 8
0
def block2dict(lines, repl, to_lower=False):
    """Parse ``code[,code...] = 'label'`` rows into an ``{int: label}`` dict.

    Per row: ``'\\x92'`` is replaced by an apostrophe, the row is stripped and
    all quote characters are removed, then it is split on the first ``=``.
    Whitespace is removed from the code part. The label part is stripped and
    every character outside the allowed set (word chars, whitespace and
    ``- _ . ' $ + ( ) /``) is dropped.

    Rows are skipped when they contain no ``=``, when the code part contains
    ``-`` (ranges are not supported), or for individual codes that are not
    decimal digits. Compared with the previous implementation this no longer
    raises ``IndexError`` on rows without ``=``, no longer truncates labels
    at a second ``=``, and no longer raises ``ValueError`` on numeric-but-
    not-decimal codes such as ``'²'``.

    Args:
        lines: iterable of row strings.
        repl: mapping of label -> replacement; labels without an entry are
            kept unchanged.
        to_lower: lower-case each label before the ``repl`` lookup.

    Returns:
        dict: integer code -> label. Later rows win on duplicate codes.
    """
    rqt = re.compile(r'[\"\']')  # match quote chars
    rws = re.compile(r'\s')        # match whitespace
    # keep only alnum and a few unreserved symbols
    ruri = re.compile(r'(?![\w\s\-\_\.\'\$\-\+\(\)\/]|\.).')
    d = {}
    for line in lines:
        # presumably '\x92' is a mis-decoded Windows-1252 apostrophe
        cleaned = rqt.sub('', line.replace('\x92', "'").strip())
        codes, sep, label = cleaned.partition('=')
        if not sep:
            # not a "code = label" row (blank line, stray text)
            continue
        codes = rws.sub('', codes)
        if '-' in codes:
            # no support for ranges
            continue
        # isdecimal() is exactly the set of digit strings int() accepts
        valid = [int(code) for code in codes.split(',') if code.isdecimal()]
        if not valid:
            continue
        label = ruri.sub('', label.strip())
        if to_lower:
            label = label.lower()
        label = repl[label] if label in repl else label
        for code in valid:  # cat codes are ints
            d[code] = label
    return d
Ejemplo n.º 9
0
def test_thread_last():
    """thread_last pipes a value through each step as the step's last argument."""
    source = [1, 2, 3]

    evens = thread_last(source, (map, inc), (filter, iseven))
    assert list(evens) == [2, 4]

    odds = thread_last(source, (map, inc), (filter, isodd))
    assert list(odds) == [3]

    # a tuple step gets the value appended; a bare callable gets it alone
    result = thread_last(2, (add, 5), double)
    assert result == 14
Ejemplo n.º 10
0
def test_thread_last():
    """Check thread_last on lazy map/filter steps and on a scalar pipeline."""
    incremented_evens = list(
        thread_last([1, 2, 3], (map, inc), (filter, iseven)))
    incremented_odds = list(
        thread_last([1, 2, 3], (map, inc), (filter, isodd)))
    scalar = thread_last(2, (add, 5), double)

    assert incremented_evens == [2, 4]
    assert incremented_odds == [3]
    assert scalar == 14
Ejemplo n.º 11
0
import us
import pandas as pd
import numpy as np
from cytoolz.itertoolz import unique
from cytoolz.functoolz import thread_last, identity
from cytoolz.curried import map, filter, curry
from survey_stats import pdutil
# import sys
# import traceback as tb

from survey_stats import log

logger = log.getLogger(__name__)

# FIPS codes, as ints, of every entry in us.STATES_AND_TERRITORIES that has one
US_STATES_FIPS_INTS = [int(state.fips)
                       for state in us.STATES_AND_TERRITORIES
                       if state.fips is not None]


def _fips_to_abbr(code):
    """Return the abbreviation for a FIPS ``code``, or ``'NA'`` if unknown.

    ``code`` is converted with ``int()`` once and that value is used for both
    the membership test and the zero-padded two-digit lookup key. Previously
    the raw value was formatted, so a numeric string passed the membership
    test and then raised ``TypeError`` in the ``%d`` formatting.
    """
    fips = int(code)
    if fips not in US_STATES_FIPS_INTS:
        return 'NA'
    return us.states.lookup('%.2d' % fips).abbr


# sitecode encoding name -> function turning a raw sitecode into its label
SITECODE_TRANSLATORS = {
    'fips': _fips_to_abbr,
    'codes': identity,
}

SVYDESIGN_COLS = ['sitecode', 'strata', 'psu', 'weight']


def convert_cat_codes(s, fmt):
    unq_lvls = list(unique([fmt[k] for k in sorted(fmt.keys())]))