def household(population, lookback, relations, table):
    """
    Identify other members of the DHS household during the lookback period.

    Creates `table` with one distinct row per (population member, other
    household member, relation): anyone who shares a dhs_hh_id with a
    population member in the same year-month of that member's lookback
    window.

    Parameters
    ----------
    population : str - population table name (one row per riipl_id)
    lookback : str - table mapping riipl_id to lookback yrmo values
    relations : str - DHS table with riipl_id, dhs_hh_id, yrmo, relation
    table : str - name of the output table to (re)create
    """
    with Connection() as cxn:
        # Drop any previous version of the output table first.
        cxn.clear_tables(table)
        # NOTE(review): %name% tokens appear to be placeholders resolved
        # by the Connection helper at execute time -- confirm.
        sql = """
              CREATE TABLE %table% NOLOGGING PCTFREE 0 PARALLEL AS 
              SELECT DISTINCT
                     pop.riipl_id, 
                     hh.riipl_id AS riipl_id_hh,
                     hh.relation
                FROM %population% pop
           LEFT JOIN %lookback% lb
                  ON pop.riipl_id = lb.riipl_id
          INNER JOIN %relations% dhs
                  ON pop.riipl_id = dhs.riipl_id AND
                     lb.yrmo = dhs.yrmo
          INNER JOIN %relations% hh
                  ON dhs.dhs_hh_id = hh.dhs_hh_id AND
                     dhs.yrmo = hh.yrmo AND
                     dhs.riipl_id != hh.riipl_id
              """
        cxn.execute(sql, verbose=True)
        # No primary key for this table.
        cxn.save_table(table, None)
Exemple #2
0
def main():
    """
    Count car crashes (and resulting injuries) involving each population
    member during the lookback period, and save them as features.
    """

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    # Sum crash and injury counts over all lookback months per person.
    sql = """
          SELECT pop.riipl_id,   
                 SUM(cc.car_crash) AS car_crashes,
                 SUM(cc.injured)   AS car_crash_injuries
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
      INNER JOIN {car_crashes} cc
              ON pop.riipl_id = cc.riipl_id AND 
                 cc.yrmo = lb.yrmo
        GROUP BY pop.riipl_id
          """.format(**globals())

    with Connection() as cxn:
        features = features.join(
            pd.read_sql(sql, cxn._connection).set_index(index))

    # People with no crash records get 0, not NaN.
    features = features.fillna(0)

    labels = {
        "CAR_CRASHES":
        "count of car crashes involved in during lookback period",
        "CAR_CRASH_INJURIES":
        "count of car crash injuries during lookback period",
    }

    SaveFeatures(features, out, manifest, population, labels)
Exemple #3
0
def main():
    """
    Rhode Island unemployment rate at the last month of the lookback
    period, taken from Bureau of Labor Statistics data.

    NOTE(review): the original docstring said "Quarterly", but the join
    is at the monthly (YRMO) level and the saved label says "monthly" --
    confirm the source frequency.
    """
    # NOTE(review): unused local; kept for consistency with sibling scripts.
    index = ["RIIPL_ID"]

    unemp = pd.read_csv(unemp_file)
    # Normalize YRMO to string so it merges with the SQL result below.
    unemp["YRMO"] = unemp.YRMO.astype(str)

    # Select only the final timestep of each person's lookback window.
    sql = """
          SELECT pop.riipl_id,
                 lb.yrmo
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
           WHERE lb.timestep = {nsteps} - 1
        ORDER BY pop.riipl_id
          """.format(**globals())

    with Connection() as cxn:
        features = pd.read_sql(sql, cxn._connection)

    features = features.merge(unemp, how="left",
                              on="YRMO")[["RIIPL_ID", "UNEMP_RATE"]]
    features = features.set_index("RIIPL_ID").rename(
        columns={"UNEMP_RATE": "UNEMP_RI"})

    labels = {"UNEMP_RI": "Rhode Island monthly unemployment rate"}

    SaveFeatures(features, out, manifest, population, labels)
Exemple #4
0
def main():
    """
    Indicator for UI exhaustion: flags anyone with a valid (non-void,
    non-deleted) UI payment during the lookback period whose credits
    went from positive to zero or below on that transaction.
    """

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT DISTINCT
                 pop.riipl_id,
                 1 AS ui_exhaustion
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dim_date} dd
              ON lb.yrmo = dd.yrmo
      INNER JOIN {dlt_ui_payments} du
              ON pop.riipl_id = du.riipl_id AND
                 dd.date_dt = du.trans_date
          WHERE  void_check_flag IS NULL AND
                 delete_flag IS NULL AND
                 opening_credits > 0 AND
                 closing_credits <= 0
          """.format(**globals())

    with Connection() as cxn:
        # Anyone not matched by the query gets 0 (never exhausted).
        features = features.join(
            pd.read_sql(sql, cxn._connection, index_col=index)).fillna(0)

    labels = {"UI_EXHAUSTION": "Exhausted UI benefits"}

    SaveFeatures(features, out, manifest, population, labels)
Exemple #5
0
def main():
    """
    Count police citations and total citation fines for each population
    member during the lookback period, and save them as features.
    """

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT pop.riipl_id,
                 SUM(citations) AS citations,
                 SUM(fines)     AS citation_fines
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {citations} c
              ON pop.riipl_id = c.riipl_id AND 
                 c.yrmo = lb.yrmo
        GROUP BY pop.riipl_id
          """.format(**globals())

    with Connection() as cxn:
        features = features.join(
            pd.read_sql(sql, cxn._connection).set_index(index))

    # People with no citation records get 0, not NaN.
    features = features.fillna(0)

    labels = {
        "CITATIONS": "Number of police citations",
        "CITATION_FINES": "Total fines from police citations"
    }

    SaveFeatures(features, out, manifest, population, labels)
Exemple #6
0
def main():
    """
    Calculate correlation between codes and outcome for each code appearing
    in the training data.

    Builds a person-by-code count matrix for Medicaid claims in the 365
    days before each training person's index date, correlates each code
    column with OUTCOME_ANY, then stores the result in `table` and
    writes it to `outfile`.
    """

    sql = """
          SELECT pop.riipl_id,
                 med.{cde_type}_cde AS cde,
                 COUNT(*) AS n
            FROM {population} pop
       LEFT JOIN {medicaid_cde} med
              ON pop.riipl_id = med.riipl_id AND
                 med.claim_dt < pop.initial_dt AND
                 pop.initial_dt - med.claim_dt <= 365 AND
                 med.{cde_type}_cde IS NOT NULL
           WHERE pop.subset = 'TRAINING'
        GROUP BY pop.riipl_id,
                 med.{cde_type}_cde
          """.format(**globals())

    with cx_Oracle.connect("/") as cxn:
        codes = pd.read_sql(sql, cxn)

    riipl_ids = codes.RIIPL_ID.unique()
    print("unique RIIPL_IDs:", len(riipl_ids))
    print("unique codes:", len(codes.CDE.unique()))
    print("total codes:", codes.N.sum())

    # Person-by-code count matrix; missing combinations become 0.
    codes = codes.pivot(index="RIIPL_ID", columns="CDE", values="N").fillna(0)

    sql = """
          SELECT pop.riipl_id,
                 outcome_any
            FROM {population} pop
       LEFT JOIN {outcomes} outcomes
              ON pop.riipl_id = outcomes.riipl_id
           WHERE pop.subset = 'TRAINING'
          """.format(**globals())

    with cx_Oracle.connect("/") as cxn:
        outcomes = pd.read_sql(sql, cxn, index_col="RIIPL_ID")

    # Both frames must cover the same people, in the same order, for the
    # per-column correlations below to align.
    pd.testing.assert_index_equal(codes.index, outcomes.index)

    corr = []
    for cde in codes.columns:
        corr.append(codes[cde].corr(outcomes["OUTCOME_ANY"]))

    corr = pd.DataFrame({
        "{}_cde".format(cde_type): codes.columns,
        "corr": corr
    })
    print(corr.head())

    with Connection() as cxn:
        cxn.read_dataframe(corr, table)

    corr.to_csv(outfile, index=False)
Exemple #7
0
def main():
    """
    Calculate correlation between codes and outcome for each code appearing
    in the training data.

    Unlike the sibling implementation, counts are restricted to the
    lookback window (via the date dimension) and correlations are joined
    against the full training outcomes frame one code at a time.
    """

    sql = """
          SELECT pop.riipl_id,
                 mc.{cde_type}_cde AS cde,
                 COUNT(*) AS n_cde
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dim_date} dd
              ON lb.yrmo = dd.yrmo
      INNER JOIN {medicaid_cde} mc
              ON pop.riipl_id = mc.riipl_id AND
                 dd.date_dt = mc.claim_dt
           WHERE pop.subset = 'TRAINING' AND
                 mc.{cde_type}_cde IS NOT NULL
        GROUP BY pop.riipl_id,
                 mc.{cde_type}_cde
        ORDER BY mc.{cde_type}_cde
          """.format(**globals())

    with Connection() as cxn:
        counts = pd.read_sql(sql, cxn._connection, index_col="CDE")
        codes = counts.index.unique()

        print("unique samples:", len(counts.RIIPL_ID.unique()))
        print("unique codes:", len(codes))
        print("total codes:", counts.N_CDE.sum())

        sql = """
              SELECT pop.riipl_id,
                     outcome_any
                FROM {population} pop
           LEFT JOIN {outcomes} outcomes
                  ON pop.riipl_id = outcomes.riipl_id
               WHERE pop.subset = 'TRAINING'
              """.format(**globals())

        outcomes = pd.read_sql(sql, cxn._connection, index_col="RIIPL_ID")

        corr = []
        for cde in codes:
            count = counts.loc[cde]
            # .loc returns a DataFrame when a code has several rows
            # (several people) and a Series when it has exactly one;
            # single-person codes are given a NaN correlation.
            if isinstance(count, pd.DataFrame):
                count = count.reset_index(drop=True).set_index("RIIPL_ID")
                # Fill people without this code with 0 before correlating.
                count = outcomes.join(count, how="left").fillna(0)
                corr.append(count.N_CDE.corr(count.OUTCOME_ANY))
            else:
                corr.append(np.nan)

        corr = pd.DataFrame({"{}_cde".format(cde_type): codes, "corr": corr})
        cxn.read_dataframe(corr, table)
Exemple #8
0
def main():
    """
    Nationwide unemployment rate by NAICS code at a yearly level from Bureau of Labor and Statistics

    Assigns each person the unemployment rate of the (2-digit) NAICS
    sector of their highest-wage job during the lookback period, for the
    year before their index date; falls back to the all-sector yearly
    average when no NAICS match exists.
    """
    index = ["RIIPL_ID"]

    unemp = pd.read_csv(unemp_file)
    unemp["NAICS"] = unemp.NAICS.astype(str)

    # Calculate average unemployment by year
    unemp_avg = unemp.groupby("YR").mean().reset_index()
    print(unemp_avg)

    # Lookup all wage records in the lookback period, and order by
    # wages, to select the NAICS for the highest wage.
    sql = """
          SELECT DISTINCT
                 pop.riipl_id,
                 TO_CHAR(pop.initial_dt, 'YYYY') AS yr,
                 dw.wages,
                 SUBSTR(dw.naics4, 1, 2) AS naics
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dlt_wage} dw
              ON pop.riipl_id = dw.riipl_id AND
                 lb.yyq = dw.yyq
        ORDER BY pop.riipl_id, dw.wages
           """.format(**globals())

    with Connection() as cxn:
        # Rows are ordered by wages, so keep="last" keeps the
        # highest-wage record per person.
        features = pd.read_sql(sql, cxn._connection).drop_duplicates(
            index, keep="last").set_index(index)

    # Use the previous year for joining the unemployment rate
    features["YR"] = features.YR.astype(int) - 1

    # Join NAICS unemployment, using avg unemployment when no NAICS is available
    # (both merges preserve row order, so np.where aligns positionally).
    naics = features.merge(unemp, how="left", on=["YR",
                                                  "NAICS"]).NAICS_UNEMP_RATE

    avg = features.merge(unemp_avg, how="left", on="YR").NAICS_UNEMP_RATE

    features["UNEMP_NAICS"] = np.where(naics.notnull(), naics, avg)

    labels = {
        "UNEMP_NAICS":
        "unemployment rate by NAICS code at national level in year before index date"
    }

    SaveFeatures(features[["UNEMP_NAICS"]], out, manifest, population, labels)
Exemple #9
0
def main():
    """
    Build one feature column per ICD-9 diagnosis code: the number of
    Medicaid diagnoses with that code during each person's lookback
    period, restricted to codes with a nonzero outcome correlation.
    """

    # ICD-9 code -> description lookup from two files; the first
    # occurrence of a code wins.
    icd9 = pd.concat([
        pd.read_csv(icd9_file1, usecols=[0, 1], names=["DIAG_CDE", "DESC"]),
        pd.read_csv(icd9_file2)
    ]).drop_duplicates("DIAG_CDE").set_index("DIAG_CDE")

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT pop.riipl_id,
                 mdc.diag_cde,
                 COUNT(*) AS n
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dim_date} dd
              ON lb.yrmo = dd.yrmo
      INNER JOIN {medicaid_diag_cde} mdc
              ON pop.riipl_id = mdc.riipl_id AND
                 dd.date_dt = mdc.claim_dt
      INNER JOIN {diag_corr} dc
              ON mdc.diag_cde = dc.diag_cde AND
                 ABS(dc.corr) > 0
        GROUP BY pop.riipl_id, mdc.diag_cde
              """.format(**globals())

    with Connection() as cxn:
        values = pd.read_sql(sql, cxn._connection)

    cdes = values.DIAG_CDE.unique()

    labels = {}
    for cde in cdes:
        # BUG FIX: index the per-code counts by RIIPL_ID before joining.
        # Previously the Series kept the default integer row index, so
        # DataFrame.join (which aligns on the index) never matched the
        # RIIPL_ID-indexed feature frame and every feature became 0.
        feature = values.loc[values.DIAG_CDE == cde].set_index("RIIPL_ID")["N"]
        feature.name = "DIAG_{}".format(cde)
        features = features.join(feature)
        try:
            desc = icd9.loc[cde, "DESC"]
        except KeyError:
            desc = "unknown description"
        label = "number of diagnoses for '{}' during lookback period".format(
            desc)
        labels[feature.name] = label

    # People with no diagnoses for a code get 0, not NaN.
    features = features.fillna(0)

    SaveFeatures(features, out, manifest, population, labels)
Exemple #10
0
def main():
    """
    Build one feature column per Medicaid procedure code: the number of
    procedures with that code during each person's lookback period,
    restricted to codes with a nonzero outcome correlation.
    """

    # Procedure code -> description lookup.
    procs = pd.read_csv(procsfile, sep="|", index_col="PROC_CDE")

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT pop.riipl_id,
                 mpc.proc_cde,
                 COUNT(*) AS n
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dim_date} dd
              ON lb.yrmo = dd.yrmo
      INNER JOIN {medicaid_proc_cde} mpc
              ON pop.riipl_id = mpc.riipl_id AND
                 dd.date_dt = mpc.claim_dt
      INNER JOIN {proc_corr} pc
              ON mpc.proc_cde = pc.proc_cde AND
                 ABS(pc.corr) > 0
        GROUP BY pop.riipl_id, mpc.proc_cde
          """.format(**globals())

    with Connection() as cxn:
        values = pd.read_sql(sql, cxn._connection)

    cdes = values.PROC_CDE.unique()

    labels = {}
    for cde in cdes:
        # BUG FIX: index the per-code counts by RIIPL_ID before joining.
        # Previously the Series kept the default integer row index, so
        # DataFrame.join (which aligns on the index) never matched the
        # RIIPL_ID-indexed feature frame and every feature became 0.
        feature = values.loc[values.PROC_CDE == cde].set_index("RIIPL_ID")["N"]
        feature.name = "PROC_{}".format(cde)
        features = features.join(feature)
        try:
            desc = procs.loc[cde, "DESCRIPTION"]
        except KeyError:
            desc = "unknown procedure"
        label = "number of procedures for '{}' during lookback period".format(
            desc)
        labels[feature.name] = label

    # People with no procedures for a code get 0, not NaN.
    features = features.fillna(0)

    SaveFeatures(features, out, manifest, population, labels)
Exemple #11
0
def main():
    """
    Opioid prescriptions and adverse outcomes among DHS household

    For each population member, counts opioid and recovery prescriptions
    dispensed to OTHER members of their DHS household during the
    lookback period, plus the household's adverse-outcome count.
    """

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT pop.riipl_id,
                 SUM(ABS(no.opioid))     AS dhs_hh_opioid_rx,
                 SUM(no.recovery)        AS dhs_hh_recovery_rx,
                 COUNT(o.outcome_any_dt) AS dhs_hh_outcomes
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dim_date} dd
              ON lb.yrmo = dd.yrmo
       LEFT JOIN {household} hh
              ON pop.riipl_id = hh.riipl_id
       LEFT JOIN {medicaid_pharmacy} mp
              ON hh.riipl_id_hh = mp.riipl_id AND
                 dd.date_dt = mp.dispensed_dt
       LEFT JOIN {ndc_opioids} no
              ON mp.ndc9_code = no.ndc
       LEFT JOIN {outcomes} o
              ON hh.riipl_id_hh = o.riipl_id AND
                 dd.date_dt = o.outcome_any_dt
        GROUP BY pop.riipl_id
          """.format(**globals())

    with Connection() as cxn:
        features = features.join(
            pd.read_sql(sql, cxn._connection).set_index(index))

    # People with no household/pharmacy matches get 0, not NaN.
    features = features.fillna(0)

    labels = {
        "DHS_HH_OPIOID_RX": "Number of opioid prescriptions in household",
        "DHS_HH_RECOVERY_RX": "Number of recovery prescriptions in household",
        "DHS_HH_OUTCOMES": "Number of adverse outcomes in household"
    }

    SaveFeatures(features, out, manifest, population, labels)
Exemple #12
0
def DimDate(table):
    """
    Create a date-dimension table with one row per calendar day from
    1800-01-01 through 2100-12-31: the date itself (DATE_DT), its
    YYYYMM integer (YRMO), the first day of its quarter (QTR_ST_DT),
    and a 3-char year/quarter key (YYQ, NULL for years before 1918).
    """

    # CONNECT BY LEVEL against DUAL generates one row per day.
    sql = """
CREATE TABLE %table% PCTFREE 0 NOLOGGING PARALLEL AS (
SELECT THEDATE                                                  AS DATE_DT
      ,CAST(TO_CHAR(THEDATE, 'YYYYMM') AS NUMBER(6))            AS YRMO
      ,TRUNC(THEDATE, 'Q')                                      AS QTR_ST_DT
      ,CASE WHEN TO_NUMBER(TO_CHAR(THEDATE, 'YYYY')) > 1917
            THEN TO_CHAR(THEDATE, 'YY') || 
                 TO_CHAR(THEDATE, 'Q')
            ELSE NULL
       END                                                      AS YYQ
  FROM (SELECT TO_DATE('31-DEC-1799') + LEVEL AS THEDATE
          FROM DUAL
       CONNECT BY LEVEL <= (SELECT TO_DATE('01-JAN-2101') - TO_DATE('01-JAN-1800')
                              FROM DUAL)
       )
)"""

    with Connection() as cxn:
        cxn.clear_tables(table)
        cxn.execute(sql)
        # DATE_DT is the table's primary key.
        cxn.save_table(table, "DATE_DT")
Exemple #13
0
def Load(csvfile, schema, pk, delim, table):
    """
    Bulk-load a delimited text file into a database table.

    Parameters
    ----------
    csvfile : str - path to the delimited file; first line is the header
    schema : str - path to a file of "COLUMN: TYPE" lines
    pk : str - comma-separated primary-key columns, or the string "None"
    delim : str - field delimiter
    table : str - destination table name

    Raises
    ------
    ValueError
        If the schema lists columns that never appear in the csv header.
    """
    # Parse the schema file into a COLUMN -> TYPE mapping.
    # (Fixed: the file handle was previously opened without a context
    # manager and never closed.)
    with open(schema) as f:
        schema = dict(line.strip().split(": ") for line in f)
    with open(csvfile) as f:
        header = [
            x.strip().replace('"', "").upper() for x in next(f).split(delim)
        ]
    # Header columns without a schema entry are loaded as FILLER;
    # matched schema entries are consumed via dict.pop.
    schema_list = [[column, schema.pop(column, "FILLER")] for column in header]
    if schema:
        # Anything left in the mapping was never matched by the header.
        raise ValueError(
            "columns {} in schema are missing from csv file".format(
                str(schema.keys())))
    with Connection() as cxn:
        cxn.read_csv(csvfile, schema_list, table, delim)
        pk = pk.split(",")
        # The literal string "None" means "no primary key".
        if pk[0] == "None":
            pk = None
        cxn.save_table(table, pk, checksum=False)
Exemple #14
0
def main():
    """
    Size of DHS household and indicator for births in the DHS household from MEGA table
    """

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    # m1 = the population member's own MEGA rows; m2 = everyone sharing
    # their dhs_hh_id in the same month.  A "birth" is flagged when an
    # adult member (age > 1) co-resides with an infant (age < 1).
    sql = """
          SELECT pop.riipl_id,
                 COUNT(DISTINCT m2.riipl_id) AS dhs_hh_size,
                 MAX(CASE WHEN m1.age > 1 AND m2.age < 1
                          THEN 1
                          ELSE 0 END)        AS dhs_hh_birth
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
      INNER JOIN {mega} m1
              ON pop.riipl_id = m1.riipl_id AND
                 lb.yrmo = m1.month
      INNER JOIN {mega} m2
              ON m1.dhs_hh_id = m2.dhs_hh_id AND
                 m1.month = m2.month
        GROUP BY pop.riipl_id
          """.format(**globals())

    with Connection() as cxn:
        features = features.join(pd.read_sql(sql, cxn._connection).set_index(index))

    # People with no MEGA household rows get 0, not NaN.
    features = features.fillna(0)

    labels = {
        "DHS_HH_SIZE"  : "Size of household",
        "DHS_HH_BIRTH" : "New birth in household"
    }

    SaveFeatures(features, out, manifest, population, labels, bool_features=["DHS_HH_BIRTH"])
Exemple #15
0
def main():
    """
    This code generates the following features based on someone's home census block group:
    * Median income
    * Share below the federal poverty line
    """

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    # ACS median household income by block group (Id2 is the geo id).
    income = pd.read_csv(
        income_file,
        skiprows=1,
        na_values=["-", "**"],
        usecols=[
            "Id2",
            "Estimate; Median household income in the past 12 months (in 2015 Inflation-adjusted dollars)"
        ])
    income.columns = ["GEO_ID", "BLKGRP_MEDIANINCOME"]

    # ACS poverty counts by block group -> share below the poverty line.
    fpl = pd.read_csv(
        fpl_file,
        skiprows=1,
        na_values=["-", "**"],
        usecols=[
            "Id2", "Estimate; Total:",
            "Estimate; Income in the past 12 months below poverty level:"
        ])
    fpl.columns = ["GEO_ID", "TOTAL", "FPL"]
    fpl["BLKGRP_BELOWFPL"] = fpl.FPL / fpl.TOTAL
    fpl = fpl[["GEO_ID", "BLKGRP_BELOWFPL"]]

    # Most recent geocoded address on or before the index date; the
    # inner query finds that date, the outer one takes the modal block
    # group observed on it.
    sql = """
          SELECT pop.riipl_id,
                 STATS_MODE(TO_NUMBER(a.state || a.county || a.trct || a.blkgrp)) AS geo_id,
                 1 AS geocoded
            FROM (
                  SELECT pop.riipl_id,
                         MAX(a.obs_date) AS max_dt
                    FROM {population} pop
              INNER JOIN {address} a
                      ON pop.riipl_id = a.riipl_id AND
                         a.obs_date <= pop.initial_dt
                GROUP BY pop.riipl_id
                 ) pop
      INNER JOIN {address} a
              ON pop.riipl_id = a.riipl_id AND
                 pop.max_dt = a.obs_date
        GROUP BY pop.riipl_id
          """.format(**globals())

    with Connection() as cxn:
        values = pd.read_sql(sql, cxn._connection)

    values = values.merge(income, how="left", on="GEO_ID")\
                   .merge(fpl,    how="left", on="GEO_ID")

    features = features.join(values.set_index(index))
    # The raw geo id is only needed for the merges above.
    del features["GEO_ID"]

    # Mean imputation
    features["GEOCODED"] = features.GEOCODED.fillna(0)
    features["BLKGRP_MEDIANINCOME"] = features.BLKGRP_MEDIANINCOME.fillna(
        features.BLKGRP_MEDIANINCOME.mean())
    features["BLKGRP_BELOWFPL"] = features.BLKGRP_BELOWFPL.fillna(
        features.BLKGRP_BELOWFPL.mean())

    labels = {
        "GEOCODED": "Has a geocoded home address",
        "BLKGRP_MEDIANINCOME":
        "Block group's median annual household income (2015 dollars)",
        "BLKGRP_BELOWFPL": "Share of block group below poverty line"
    }

    SaveFeatures(features, out, manifest, population, labels)
Exemple #16
0
def main():
    """
    Build a sparse person-by-word count matrix (MatrixMarket format)
    from the words in ICD-9 diagnosis and procedure code descriptions,
    weighted by how often each person's lookback claims carry codes
    whose descriptions contain each word.  Also writes the word id
    vocabulary to `words_file`.
    """

    # Diagnosis code descriptions from two sources; first occurrence wins.
    icd9 = pd.concat([pd.read_csv(icd9_file1).rename(columns={"DIAGNOSIS CODE": "DIAG_CDE", "LONG DESCRIPTION": "DESC"}),
                      pd.read_csv(icd9_file2).rename(columns={"diag_cde": "DIAG_CDE", "description": "DESC"})],
                     ignore_index=True).drop_duplicates("DIAG_CDE").set_index("DIAG_CDE")

    proc = pd.read_csv(proc_file, sep="|", index_col="PROC_CDE").rename(columns={"DESCRIPTION": "DESC"})

    # Remove non-word characters and convert to lowercase
    icd9["DESC"] = icd9.DESC.str.lower().str.replace("[-/]", " ").str.replace("[^a-z ]", "")
    proc["DESC"] = proc.DESC.str.lower().str.replace("[-/]", " ").str.replace("[^a-z ]", "")

    # Split word lists into a row per word
    icd9 = pd.DataFrame(icd9.DESC.str.split().tolist(), index=icd9.index).stack()
    icd9 = icd9.reset_index()[["DIAG_CDE", 0]].rename(columns={0: "WORD"})
    proc = pd.DataFrame(proc.DESC.str.split().tolist(), index=proc.index).stack()
    proc = proc.reset_index()[["PROC_CDE", 0]].rename(columns={0: "WORD"})

    # Remove stopwords
    icd9 = icd9[~icd9.WORD.isin(stopwords)]
    proc = proc[~proc.WORD.isin(stopwords)]

    # Generate ids for words
    # NOTE(review): Series.append / DataFrame.append were removed in
    # pandas 2.0 -- this module appears pinned to an older pandas.
    words = icd9.WORD.append(proc.WORD, ignore_index=True)
    words = words[words != ""]
    words = words.value_counts().reset_index().rename(columns={"index": "WORD", "WORD": "N"})
    words["WORD_ID"] = np.arange(len(words))
    words[["WORD_ID", "WORD", "N"]].to_csv(words_file, index=False)
    print("distinct words:", len(words))

    icd9 = icd9.merge(words[["WORD", "WORD_ID"]], how="inner", on="WORD")
    proc = proc.merge(words[["WORD", "WORD_ID"]], how="inner", on="WORD")

    # Per-person diagnosis code counts during the lookback period.
    sql = """
          SELECT pop.riipl_id,
                 mc.diag_cde,
                 COUNT(*) AS n
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dim_date} dd
              ON lb.yrmo = dd.yrmo
      INNER JOIN {medicaid_diag_cde} mc
              ON pop.riipl_id = mc.riipl_id AND
                 dd.date_dt = mc.claim_dt
        GROUP BY pop.riipl_id, mc.diag_cde
              """.format(**globals())

    with Connection() as cxn:
        diags = pd.read_sql(sql, cxn._connection)

    diags = diags.merge(icd9[["DIAG_CDE", "WORD_ID"]], how="inner", on="DIAG_CDE")

    # Per-person procedure code counts during the lookback period.
    sql = """
          SELECT pop.riipl_id,
                 mc.proc_cde,
                 COUNT(*) AS n
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dim_date} dd
              ON lb.yrmo = dd.yrmo
      INNER JOIN {medicaid_proc_cde} mc
              ON pop.riipl_id = mc.riipl_id AND
                 dd.date_dt = mc.claim_dt
        GROUP BY pop.riipl_id, mc.proc_cde
              """.format(**globals())

    with Connection() as cxn:
        procs = pd.read_sql(sql, cxn._connection)

    procs = procs.merge(proc[["PROC_CDE", "WORD_ID"]], how="inner", on="PROC_CDE")

    # Combine diagnosis and procedure word counts per (person, word).
    columns = ["RIIPL_ID", "WORD_ID", "N"]
    values = diags[columns].append(procs[columns], ignore_index=True)
    values = values.groupby(["RIIPL_ID", "WORD_ID"]).sum().reset_index()

    pop = CachePopulation(population, "RIIPL_ID")
    pop["ROW_ID"] = np.arange(len(pop))

    values = values.merge(pop, on="RIIPL_ID")[["ROW_ID", "WORD_ID", "N"]]
    print("total occurrences:", values.N.sum())

    # MM uses 1-based indexing
    values["ROW_ID"] += 1
    values["WORD_ID"] += 1

    # Write the MatrixMarket header, then append the triplets.
    with open(counts_file, "w") as f:
        print("%%MatrixMarket matrix coordinate integer general", file=f)
        print(len(pop), len(words), len(values), file=f)

    values.to_csv(counts_file, sep=" ", mode="a", header=False, index=False)
Exemple #17
0
import pandas as pd
import os, sys, time
from riipl import Connection

# Command-line arguments: AHRQ CCS and CCI input files, and the two
# destination table names.
ccs_file, cci_file, ccs_table, cci_table = sys.argv[1:]

# CCS: diagnosis code -> disease category (first two columns).
ccs = pd.read_csv(ccs_file,
                  usecols=[0, 1],
                  quotechar="'",
                  skiprows=3,
                  names=["DIAG_CDE", "DISEASE"])
# Codes are padded with whitespace in the source file.
ccs["DIAG_CDE"] = ccs.DIAG_CDE.str.strip()

# CCI: diagnosis code -> chronic-condition flag; keep chronic-only rows.
cci = pd.read_csv(cci_file,
                  usecols=[0, 2],
                  quotechar="'",
                  skiprows=2,
                  names=["DIAG_CDE", "CHRONIC"])
cci = cci[cci.CHRONIC == 1]
cci["DIAG_CDE"] = cci.DIAG_CDE.str.strip()

# Load both lookup tables, keyed by diagnosis code.
with Connection() as cxn:
    cxn.read_dataframe(ccs, ccs_table)
    cxn.save_table(ccs_table, "DIAG_CDE", checksum=False)
    cxn.read_dataframe(cci, cci_table)
    cxn.save_table(cci_table, "DIAG_CDE", checksum=False)

# vim: expandtab sw=4 ts=4
Exemple #18
0
def main():
    """
    Medicaid utilization features for the lookback period: number of
    distinct AHRQ CCS disease categories, number of AHRQ CCI chronic
    conditions, number of distinct procedure codes, and the squares of
    all three.
    """

    # CCS: diagnosis code -> disease category.
    ccs = pd.read_csv(ccs_file,
                      usecols=[0, 1],
                      quotechar="'",
                      skiprows=3,
                      names=["DIAG_CDE", "DISEASE"])
    ccs["DIAG_CDE"] = ccs.DIAG_CDE.str.strip()

    # CCI: diagnosis code -> chronic flag; keep chronic-only rows.
    cci = pd.read_csv(cci_file,
                      usecols=[0, 2],
                      quotechar="'",
                      skiprows=2,
                      names=["DIAG_CDE", "CHRONIC"])
    cci = cci[cci.CHRONIC == 1]
    cci["DIAG_CDE"] = cci.DIAG_CDE.str.strip()

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    # Distinct diagnosis codes per person during the lookback period.
    sql = """
          SELECT DISTINCT
                 pop.riipl_id,
                 d.diag_cde
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dim_date} dd
              ON lb.yrmo = dd.yrmo
      INNER JOIN {diag_cde} d
              ON pop.riipl_id = d.riipl_id AND
                 dd.date_dt = d.claim_dt
          """.format(**globals())

    with Connection() as cxn:
        diags = pd.read_sql(sql, cxn._connection)

    # Disease score = number of distinct CCS categories per person.
    ndisease = diags.merge(ccs, how="inner", on="DIAG_CDE")[index+["DISEASE"]]\
                    .groupby(index)\
                    .agg({"DISEASE": pd.Series.nunique})\
                    .rename(columns={"DISEASE": "MEDICAID_DISEASE_SCORE"})
    features = features.join(ndisease)

    # Chronic score = count of chronic-flagged diagnosis codes.
    nchronic = diags.merge(cci, how="inner", on="DIAG_CDE")[index+["CHRONIC"]]\
                    .groupby(index)\
                    .sum()\
                    .rename(columns={"CHRONIC": "MEDICAID_CHRONIC_SCORE"})
    features = features.join(nchronic)

    # Distinct procedure codes per person during the lookback period.
    sql = """
          SELECT pop.riipl_id,
                 COUNT(DISTINCT p.proc_cde) AS medicaid_procedures
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dim_date} dd
              ON lb.yrmo = dd.yrmo
      INNER JOIN {proc_cde} p
              ON pop.riipl_id = p.riipl_id AND
                 dd.date_dt = p.claim_dt
        GROUP BY pop.riipl_id
          """.format(**globals())

    with Connection() as cxn:
        nproc = pd.read_sql(sql, cxn._connection, index_col=index)

    features = features.join(nproc).fillna(0)

    # Add squared terms
    for x in [
            "MEDICAID_DISEASE_SCORE", "MEDICAID_CHRONIC_SCORE",
            "MEDICAID_PROCEDURES"
    ]:
        features["{}_SQ".format(x)] = features[x] * features[x]

    labels = {
        "MEDICAID_DISEASE_SCORE": "Number of AHRQ CCS diseases",
        "MEDICAID_CHRONIC_SCORE": "Number of AHRQ CCI chronic conditions",
        "MEDICAID_PROCEDURES": "Number of distinct procedures",
        "MEDICAID_DISEASE_SCORE_SQ": "Squared number of AHRQ CCS diseases",
        "MEDICAID_CHRONIC_SCORE_SQ":
        "Squared number of AHRQ CCI chronic conditions",
        "MEDICAID_PROCEDURES_SQ": "Squared number of distinct procedures"
    }

    SaveFeatures(features, out, manifest, population, labels)
Exemple #19
0
def main():
    """
    Demographics variables from MEGA table

    Builds one-hot indicators per person from their MEGA rows over the
    lookback period: age bands (using the max observed age), BMI bands
    (using the average BMI), marital status, and race/ethnicity, each
    with an explicit "missing" indicator.
    """

    sql = """
          SELECT pop.riipl_id,
                 CASE WHEN MAX(m.age) >= 18 AND MAX(m.age) < 24 THEN 1 ELSE 0 END  AS age_1,
                 CASE WHEN MAX(m.age) >= 24 AND MAX(m.age) < 30 THEN 1 ELSE 0 END  AS age_2,
                 CASE WHEN MAX(m.age) >= 30 AND MAX(m.age) < 40 THEN 1 ELSE 0 END  AS age_3,
                 CASE WHEN MAX(m.age) >= 40 AND MAX(m.age) < 55 THEN 1 ELSE 0 END  AS age_4,
                 CASE WHEN MAX(m.age) >= 55 AND MAX(m.age) < 65 THEN 1 ELSE 0 END  AS age_5,
                 CASE WHEN MAX(m.age) >= 65                     THEN 1 ELSE 0 END  AS age_6,
                 CASE WHEN AVG(m.bmi) < 18.5                    THEN 1 ELSE 0 END  AS bmi_under,
                 CASE WHEN AVG(m.bmi) >= 25 AND AVG(m.bmi) < 30 THEN 1 ELSE 0 END  AS bmi_over,
                 CASE WHEN AVG(m.bmi) >= 30                     THEN 1 ELSE 0 END  AS bmi_obese,
                 CASE WHEN COUNT(m.bmi) = 0                     THEN 1 ELSE 0 END  AS bmi_missing,
                 MAX(CASE WHEN(m.marital_status) = 1            THEN 1 ELSE 0 END) AS married,
                 CASE WHEN COUNT(m.marital_status) = 0          THEN 1 ELSE 0 END  AS married_missing,
                 MAX(CASE WHEN m.race = 'Black'                 THEN 1 ELSE 0 END) AS race_black,
                 MAX(CASE WHEN m.race = 'Hispanic'              THEN 1 ELSE 0 END) AS race_hispanic,
                 MAX(CASE WHEN m.race = 'Asian'                 THEN 1
                          WHEN m.race = 'Native American'       THEN 1
                          WHEN m.race = 'Other'                 THEN 1 ELSE 0 END) AS race_other,
                 CASE WHEN COUNT(m.race) = 0                    THEN 1 ELSE 0 END  AS race_missing
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {mega} m
              ON pop.riipl_id = m.riipl_id AND
                 lb.yrmo = m.month
        GROUP BY pop.riipl_id
        ORDER BY pop.riipl_id
          """.format(**globals())

    with Connection() as cxn:
        features = pd.read_sql(sql, cxn._connection, index_col="RIIPL_ID")

    labels = {
        "AGE_1": "Age is 18-23",
        "AGE_2": "Age is 24-29",
        "AGE_3": "Age is 30-39",
        "AGE_4": "Age is 40-54",
        "AGE_5": "Age is 55-64",
        "AGE_6": "Age is 65+",
        "BMI_UNDER": "Body mass index is underweight",
        "BMI_OVER": "Body mass index is overweight",
        "BMI_OBESE": "Body mass index is obese",
        "BMI_MISSING": "Body mass index is missing",
        "MARRIED": "Married",
        "MARRIED_MISSING": "Marital status is missing",
        "RACE_BLACK": "Race is African American",
        "RACE_HISPANIC": "Ethnicity is Hispanic",
        "RACE_OTHER": "Race is Asian, Native American, or other",
        "RACE_MISSING": "Race is missing"
    }

    # Every column is a 0/1 indicator.
    SaveFeatures(features,
                 out,
                 manifest,
                 population,
                 labels,
                 bool_features=list(labels))