def household(population, lookback, relations, table):
    """
    Identify other members of the DHS household during the lookback period.
    """
    with Connection() as cxn:
        cxn.clear_tables(table)
        sql = """
              CREATE TABLE %table% NOLOGGING PCTFREE 0 PARALLEL AS
              SELECT DISTINCT pop.riipl_id,
                     hh.riipl_id AS riipl_id_hh,
                     hh.relation
                FROM %population% pop
           LEFT JOIN %lookback% lb
                  ON pop.riipl_id = lb.riipl_id
          INNER JOIN %relations% dhs
                  ON pop.riipl_id = dhs.riipl_id AND
                     lb.yrmo = dhs.yrmo
          INNER JOIN %relations% hh
                  ON dhs.dhs_hh_id = hh.dhs_hh_id AND
                     dhs.yrmo = hh.yrmo AND
                     dhs.riipl_id != hh.riipl_id
              """
        cxn.execute(sql, verbose=True)
        cxn.save_table(table, None)
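# A minimal pandas sketch of the household self-join above, using made-up toy
# data (the column names mirror the SQL; the values are hypothetical). It
# shows how joining the relations table to itself on household id and month,
# then excluding the person's own row, yields the other household members.
import pandas as pd

relations = pd.DataFrame({
    "RIIPL_ID":  [1, 2, 3],
    "DHS_HH_ID": [10, 10, 11],
    "YRMO":      [201701, 201701, 201701],
    "RELATION":  ["self", "child", "self"],
})

hh = relations.merge(relations, on=["DHS_HH_ID", "YRMO"], suffixes=("", "_HH"))
hh = hh[hh.RIIPL_ID != hh.RIIPL_ID_HH]          # dhs.riipl_id != hh.riipl_id
print(hh[["RIIPL_ID", "RIIPL_ID_HH", "RELATION_HH"]])
# persons 1 and 2 share household 10; person 3 has no other members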
def main():
    """
    Count car crashes and car crash injuries during the lookback period.
    """
    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT pop.riipl_id,
                 SUM(cc.car_crash) AS car_crashes,
                 SUM(cc.injured) AS car_crash_injuries
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
      INNER JOIN {car_crashes} cc
              ON pop.riipl_id = cc.riipl_id AND
                 cc.yrmo = lb.yrmo
        GROUP BY pop.riipl_id
          """.format(**globals())

    with Connection() as cxn:
        features = features.join(pd.read_sql(sql, cxn._connection).set_index(index))

    features = features.fillna(0)

    labels = {
        "CAR_CRASHES": "count of car crashes involved in during lookback period",
        "CAR_CRASH_INJURIES": "count of car crash injuries during lookback period",
    }

    SaveFeatures(features, out, manifest, population, labels)
def main():
    """
    Quarterly unemployment rate in the state of Rhode Island, taken from the
    Bureau of Labor Statistics.
    """
    index = ["RIIPL_ID"]

    unemp = pd.read_csv(unemp_file)
    unemp["YRMO"] = unemp.YRMO.astype(str)

    sql = """
          SELECT pop.riipl_id, lb.yrmo
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
           WHERE lb.timestep = {nsteps} - 1
        ORDER BY pop.riipl_id
          """.format(**globals())

    with Connection() as cxn:
        features = pd.read_sql(sql, cxn._connection)

    features = features.merge(unemp, how="left", on="YRMO")[["RIIPL_ID", "UNEMP_RATE"]]
    features = features.set_index("RIIPL_ID").rename(columns={"UNEMP_RATE": "UNEMP_RI"})

    labels = {"UNEMP_RI": "Rhode Island monthly unemployment rate"}

    SaveFeatures(features, out, manifest, population, labels)
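# Sketch of why main() casts YRMO to str before merging, with hypothetical
# values: the yrmo column comes back from Oracle as a string, while pandas
# parses the CSV column as an integer, and a merge on mismatched key dtypes
# matches nothing (or raises, depending on the pandas version).
import pandas as pd

left = pd.DataFrame({"YRMO": ["201701"]})                      # string key
right = pd.DataFrame({"YRMO": [201701], "UNEMP_RATE": [4.5]})  # integer key

try:
    print(left.merge(right, on="YRMO", how="left"))  # no match, or ValueError
except ValueError as e:
    print("merge failed:", e)

right["YRMO"] = right.YRMO.astype(str)           # the cast performed in main()
print(left.merge(right, on="YRMO", how="left"))  # now matches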
def main():
    """
    Indicator for UI exhaustion.
    """
    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT DISTINCT pop.riipl_id, 1 AS ui_exhaustion
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dim_date} dd
              ON lb.yrmo = dd.yrmo
      INNER JOIN {dlt_ui_payments} du
              ON pop.riipl_id = du.riipl_id AND
                 dd.date_dt = du.trans_date
           WHERE void_check_flag IS NULL AND
                 delete_flag IS NULL AND
                 opening_credits > 0 AND
                 closing_credits <= 0
          """.format(**globals())

    with Connection() as cxn:
        features = features.join(pd.read_sql(sql, cxn._connection, index_col=index)).fillna(0)

    labels = {"UI_EXHAUSTION": "Exhausted UI benefits"}

    SaveFeatures(features, out, manifest, population, labels)
def main():
    """
    Police citations and citation fines during the lookback period.
    """
    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT pop.riipl_id,
                 SUM(citations) AS citations,
                 SUM(fines) AS citation_fines
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {citations} c
              ON pop.riipl_id = c.riipl_id AND
                 c.yrmo = lb.yrmo
        GROUP BY pop.riipl_id
          """.format(**globals())

    with Connection() as cxn:
        features = features.join(pd.read_sql(sql, cxn._connection).set_index(index))

    features = features.fillna(0)

    labels = {
        "CITATIONS": "Number of police citations",
        "CITATION_FINES": "Total fines from police citations"
    }

    SaveFeatures(features, out, manifest, population, labels)
def main():
    """
    Calculate correlation between codes and outcome for each code
    appearing in the training data.
    """
    sql = """
          SELECT pop.riipl_id,
                 med.{cde_type}_cde AS cde,
                 COUNT(*) AS n
            FROM {population} pop
       LEFT JOIN {medicaid_cde} med
              ON pop.riipl_id = med.riipl_id AND
                 med.claim_dt < pop.initial_dt AND
                 pop.initial_dt - med.claim_dt <= 365 AND
                 med.{cde_type}_cde IS NOT NULL
           WHERE pop.subset = 'TRAINING'
        GROUP BY pop.riipl_id, med.{cde_type}_cde
          """.format(**globals())

    with cx_Oracle.connect("/") as cxn:
        codes = pd.read_sql(sql, cxn)

    riipl_ids = codes.RIIPL_ID.unique()
    print("unique RIIPL_IDs:", len(riipl_ids))
    print("unique codes:", len(codes.CDE.unique()))
    print("total codes:", codes.N.sum())

    codes = codes.pivot(index="RIIPL_ID", columns="CDE", values="N").fillna(0)

    sql = """
          SELECT pop.riipl_id, outcome_any
            FROM {population} pop
       LEFT JOIN {outcomes} outcomes
              ON pop.riipl_id = outcomes.riipl_id
           WHERE pop.subset = 'TRAINING'
          """.format(**globals())

    with cx_Oracle.connect("/") as cxn:
        outcomes = pd.read_sql(sql, cxn, index_col="RIIPL_ID")

    pd.testing.assert_index_equal(codes.index, outcomes.index)

    corr = []
    for cde in codes.columns:
        corr.append(codes[cde].corr(outcomes["OUTCOME_ANY"]))

    corr = pd.DataFrame({"{}_cde".format(cde_type): codes.columns, "corr": corr})
    print(corr.head())

    with Connection() as cxn:
        cxn.read_dataframe(corr, table)

    corr.to_csv(outfile, index=False)
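# Toy illustration of the pivot-then-correlate pattern in main(), on made-up
# data: pivot the long (person, code, count) table into a wide matrix, then
# correlate each code's count column with the outcome.
import pandas as pd

codes = pd.DataFrame({
    "RIIPL_ID": [1, 1, 2, 3],
    "CDE":      ["A", "B", "A", "B"],
    "N":        [2, 1, 1, 3],
})
wide = codes.pivot(index="RIIPL_ID", columns="CDE", values="N").fillna(0)
outcomes = pd.Series([1, 0, 1], index=wide.index, name="OUTCOME_ANY")
print({cde: wide[cde].corr(outcomes) for cde in wide.columns})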
def main():
    """
    Calculate correlation between codes and outcome for each code
    appearing in the training data.
    """
    sql = """
          SELECT pop.riipl_id,
                 mc.{cde_type}_cde AS cde,
                 COUNT(*) AS n_cde
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dim_date} dd
              ON lb.yrmo = dd.yrmo
      INNER JOIN {medicaid_cde} mc
              ON pop.riipl_id = mc.riipl_id AND
                 dd.date_dt = mc.claim_dt
           WHERE pop.subset = 'TRAINING' AND
                 mc.{cde_type}_cde IS NOT NULL
        GROUP BY pop.riipl_id, mc.{cde_type}_cde
        ORDER BY mc.{cde_type}_cde
          """.format(**globals())

    with Connection() as cxn:

        counts = pd.read_sql(sql, cxn._connection, index_col="CDE")
        codes = counts.index.unique()

        print("unique samples:", len(counts.RIIPL_ID.unique()))
        print("unique codes:", len(codes))
        print("total codes:", counts.N_CDE.sum())

        sql = """
              SELECT pop.riipl_id, outcome_any
                FROM {population} pop
           LEFT JOIN {outcomes} outcomes
                  ON pop.riipl_id = outcomes.riipl_id
               WHERE pop.subset = 'TRAINING'
              """.format(**globals())

        outcomes = pd.read_sql(sql, cxn._connection, index_col="RIIPL_ID")

        corr = []
        for cde in codes:
            count = counts.loc[cde]
            if isinstance(count, pd.DataFrame):
                count = count.reset_index(drop=True).set_index("RIIPL_ID")
                count = outcomes.join(count, how="left").fillna(0)
                corr.append(count.N_CDE.corr(count.OUTCOME_ANY))
            else:
                # A code that occurs for only one person has no variance to
                # correlate, so it gets NaN.
                corr.append(np.nan)

        corr = pd.DataFrame({"{}_cde".format(cde_type): codes, "corr": corr})

        cxn.read_dataframe(corr, table)
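# Why the isinstance check in the loop above is needed, shown on hypothetical
# data: .loc on a non-unique index returns a DataFrame when the label repeats
# but a bare Series when it occurs only once, so the two cases must be handled
# separately.
import pandas as pd

counts = pd.DataFrame(
    {"RIIPL_ID": [1, 2, 3], "N_CDE": [2, 1, 5]},
    index=pd.Index(["A", "A", "B"], name="CDE"),
)
print(type(counts.loc["A"]))  # DataFrame: code "A" has two rows
print(type(counts.loc["B"]))  # Series: code "B" has one row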
def main():
    """
    Nationwide unemployment rate by NAICS code at a yearly level, from the
    Bureau of Labor Statistics.
    """
    index = ["RIIPL_ID"]

    unemp = pd.read_csv(unemp_file)
    unemp["NAICS"] = unemp.NAICS.astype(str)

    # Calculate average unemployment by year
    unemp_avg = unemp.groupby("YR").mean().reset_index()
    print(unemp_avg)

    # Look up all wage records in the lookback period, ordered by wages,
    # to select the NAICS for the highest wage.
    sql = """
          SELECT DISTINCT pop.riipl_id,
                 TO_CHAR(pop.initial_dt, 'YYYY') AS yr,
                 dw.wages,
                 SUBSTR(dw.naics4, 1, 2) AS naics
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dlt_wage} dw
              ON pop.riipl_id = dw.riipl_id AND
                 lb.yyq = dw.yyq
        ORDER BY pop.riipl_id, dw.wages
          """.format(**globals())

    with Connection() as cxn:
        features = pd.read_sql(sql, cxn._connection).drop_duplicates(index, keep="last").set_index(index)

    # Use the previous year for joining the unemployment rate
    features["YR"] = features.YR.astype(int) - 1

    # Join NAICS unemployment, using avg unemployment when no NAICS is available
    naics = features.merge(unemp, how="left", on=["YR", "NAICS"]).NAICS_UNEMP_RATE
    avg = features.merge(unemp_avg, how="left", on="YR").NAICS_UNEMP_RATE
    features["UNEMP_NAICS"] = np.where(naics.notnull(), naics, avg)

    labels = {
        "UNEMP_NAICS": "unemployment rate by NAICS code at national level in year before index date"
    }

    SaveFeatures(features[["UNEMP_NAICS"]], out, manifest, population, labels)
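# Sketch of the "ORDER BY wages, then drop_duplicates(keep='last')" idiom used
# above, on made-up wage records: after sorting each person's rows by wages,
# keeping the last duplicate retains the NAICS of the highest-wage job.
import pandas as pd

wages = pd.DataFrame({
    "RIIPL_ID": [1, 1, 2],
    "WAGES":    [100.0, 250.0, 80.0],
    "NAICS":    ["44", "62", "23"],
}).sort_values(["RIIPL_ID", "WAGES"])  # plays the role of the SQL ORDER BY

print(wages.drop_duplicates("RIIPL_ID", keep="last"))
# person 1 keeps NAICS "62" (wages 250.0); person 2 keeps NAICS "23"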
def main():
    icd9 = pd.concat([
        pd.read_csv(icd9_file1, usecols=[0, 1], names=["DIAG_CDE", "DESC"]),
        pd.read_csv(icd9_file2)
    ]).drop_duplicates("DIAG_CDE").set_index("DIAG_CDE")

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT pop.riipl_id, mdc.diag_cde, COUNT(*) AS n
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dim_date} dd
              ON lb.yrmo = dd.yrmo
      INNER JOIN {medicaid_diag_cde} mdc
              ON pop.riipl_id = mdc.riipl_id AND
                 dd.date_dt = mdc.claim_dt
      INNER JOIN {diag_corr} dc
              ON mdc.diag_cde = dc.diag_cde AND
                 ABS(dc.corr) > 0
        GROUP BY pop.riipl_id, mdc.diag_cde
          """.format(**globals())

    with Connection() as cxn:
        # Index on RIIPL_ID so each code's counts align with the population
        # index in the joins below.
        values = pd.read_sql(sql, cxn._connection).set_index(index)

    cdes = values.DIAG_CDE.unique()

    labels = {}
    for cde in cdes:
        feature = values.loc[values.DIAG_CDE == cde, "N"]
        feature.name = "DIAG_{}".format(cde)
        features = features.join(feature)
        try:
            desc = icd9.loc[cde, "DESC"]
        except KeyError:
            desc = "unknown description"
        label = "number of diagnoses for '{}' during lookback period".format(desc)
        labels[feature.name] = label

    features = features.fillna(0)

    SaveFeatures(features, out, manifest, population, labels)
def main():
    procs = pd.read_csv(procsfile, sep="|", index_col="PROC_CDE")

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT pop.riipl_id, mpc.proc_cde, COUNT(*) AS n
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dim_date} dd
              ON lb.yrmo = dd.yrmo
      INNER JOIN {medicaid_proc_cde} mpc
              ON pop.riipl_id = mpc.riipl_id AND
                 dd.date_dt = mpc.claim_dt
      INNER JOIN {proc_corr} pc
              ON mpc.proc_cde = pc.proc_cde AND
                 ABS(pc.corr) > 0
        GROUP BY pop.riipl_id, mpc.proc_cde
          """.format(**globals())

    with Connection() as cxn:
        # Index on RIIPL_ID so each code's counts align with the population
        # index in the joins below.
        values = pd.read_sql(sql, cxn._connection).set_index(index)

    cdes = values.PROC_CDE.unique()

    labels = {}
    for cde in cdes:
        feature = values.loc[values.PROC_CDE == cde, "N"]
        feature.name = "PROC_{}".format(cde)
        features = features.join(feature)
        try:
            desc = procs.loc[cde, "DESCRIPTION"]
        except KeyError:
            desc = "unknown procedure"
        label = "number of procedures for '{}' during lookback period".format(desc)
        labels[feature.name] = label

    features = features.fillna(0)

    SaveFeatures(features, out, manifest, population, labels)
def main():
    """
    Opioid prescriptions and adverse outcomes among DHS household.
    """
    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT pop.riipl_id,
                 SUM(ABS(no.opioid)) AS dhs_hh_opioid_rx,
                 SUM(no.recovery) AS dhs_hh_recovery_rx,
                 COUNT(o.outcome_any_dt) AS dhs_hh_outcomes
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dim_date} dd
              ON lb.yrmo = dd.yrmo
       LEFT JOIN {household} hh
              ON pop.riipl_id = hh.riipl_id
       LEFT JOIN {medicaid_pharmacy} mp
              ON hh.riipl_id_hh = mp.riipl_id AND
                 dd.date_dt = mp.dispensed_dt
       LEFT JOIN {ndc_opioids} no
              ON mp.ndc9_code = no.ndc
       LEFT JOIN {outcomes} o
              ON hh.riipl_id_hh = o.riipl_id AND
                 dd.date_dt = o.outcome_any_dt
        GROUP BY pop.riipl_id
          """.format(**globals())

    with Connection() as cxn:
        features = features.join(pd.read_sql(sql, cxn._connection).set_index(index))

    features = features.fillna(0)

    labels = {
        "DHS_HH_OPIOID_RX": "Number of opioid prescriptions in household",
        "DHS_HH_RECOVERY_RX": "Number of recovery prescriptions in household",
        "DHS_HH_OUTCOMES": "Number of adverse outcomes in household"
    }

    SaveFeatures(features, out, manifest, population, labels)
def DimDate(table):

    sql = """
          CREATE TABLE %table% PCTFREE 0 NOLOGGING PARALLEL AS
          (
          SELECT THEDATE                                       AS DATE_DT
                ,CAST(TO_CHAR(THEDATE, 'YYYYMM') AS NUMBER(6)) AS YRMO
                ,TRUNC(THEDATE, 'Q')                           AS QTR_ST_DT
                ,CASE WHEN TO_NUMBER(TO_CHAR(THEDATE, 'YYYY')) > 1917
                      THEN TO_CHAR(THEDATE, 'YY') || TO_CHAR(THEDATE, 'Q')
                      ELSE NULL
                 END                                           AS YYQ
            FROM (
                  SELECT TO_DATE('31-DEC-1799') + LEVEL AS THEDATE
                    FROM DUAL
              CONNECT BY LEVEL <= (SELECT TO_DATE('01-JAN-2101') - TO_DATE('01-JAN-1800')
                                     FROM DUAL)
                 )
          )"""

    with Connection() as cxn:
        cxn.clear_tables(table)
        cxn.execute(sql)
        cxn.save_table(table, "DATE_DT")
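# A quick Python cross-check of the YRMO and YYQ encodings the SQL above
# produces for a single date (assuming Oracle's 'Q' format, quarters 1-4).
from datetime import date

d = date(2017, 8, 15)
yrmo = d.year * 100 + d.month                            # 201708 <- TO_CHAR 'YYYYMM'
yyq = "%02d%d" % (d.year % 100, (d.month - 1) // 3 + 1)  # '173'  <- 'YY' || 'Q'
print(yrmo, yyq)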
def Load(csvfile, schema, pk, delim, table):

    schema = list(map(str.strip, open(schema).readlines()))
    schema = dict(x.split(": ") for x in schema)

    with open(csvfile) as f:
        header = [x.strip().replace('"', "").upper() for x in next(f).split(delim)]

    schema_list = []
    for column in header:
        if column in schema:
            schema_list.append([column, schema.pop(column)])
        else:
            schema_list.append([column, "FILLER"])

    if schema:
        raise ValueError("columns {} in schema are missing from csv file".format(str(schema.keys())))

    with Connection() as cxn:
        cxn.read_csv(csvfile, schema_list, table, delim)
        pk = pk.split(",")
        if pk[0] == "None":
            pk = None
        cxn.save_table(table, pk, checksum=False)
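# Hypothetical example of the schema file format Load() expects: one
# "COLUMN: TYPE" pair per line (these column names and Oracle types are
# invented for illustration).
example_schema = """\
RIIPL_ID: NUMBER(10)
YRMO: NUMBER(6)
WAGES: NUMBER(12,2)
"""
schema = dict(x.split(": ") for x in map(str.strip, example_schema.strip().splitlines()))
print(schema)  # {'RIIPL_ID': 'NUMBER(10)', 'YRMO': 'NUMBER(6)', 'WAGES': 'NUMBER(12,2)'}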
def main():
    """
    Size of DHS household and indicator for births in the DHS household,
    from the MEGA table.
    """
    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT pop.riipl_id,
                 COUNT(DISTINCT m2.riipl_id) AS dhs_hh_size,
                 MAX(CASE WHEN m1.age > 1 AND m2.age < 1 THEN 1 ELSE 0 END) AS dhs_hh_birth
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
      INNER JOIN {mega} m1
              ON pop.riipl_id = m1.riipl_id AND
                 lb.yrmo = m1.month
      INNER JOIN {mega} m2
              ON m1.dhs_hh_id = m2.dhs_hh_id AND
                 m1.month = m2.month
        GROUP BY pop.riipl_id
          """.format(**globals())

    with Connection() as cxn:
        features = features.join(pd.read_sql(sql, cxn._connection).set_index(index))

    features = features.fillna(0)

    labels = {
        "DHS_HH_SIZE": "Size of household",
        "DHS_HH_BIRTH": "New birth in household"
    }

    SaveFeatures(features, out, manifest, population, labels, bool_features=["DHS_HH_BIRTH"])
def main():
    """
    This code generates the following features based on someone's home census
    block group:

      * Median income
      * Share below the federal poverty line
    """
    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    income = pd.read_csv(income_file, skiprows=1, na_values=["-", "**"],
                         usecols=["Id2",
                                  "Estimate; Median household income in the past 12 months (in 2015 Inflation-adjusted dollars)"])
    income.columns = ["GEO_ID", "BLKGRP_MEDIANINCOME"]

    fpl = pd.read_csv(fpl_file, skiprows=1, na_values=["-", "**"],
                      usecols=["Id2",
                               "Estimate; Total:",
                               "Estimate; Income in the past 12 months below poverty level:"])
    fpl.columns = ["GEO_ID", "TOTAL", "FPL"]
    fpl["BLKGRP_BELOWFPL"] = fpl.FPL / fpl.TOTAL
    fpl = fpl[["GEO_ID", "BLKGRP_BELOWFPL"]]

    sql = """
          SELECT pop.riipl_id,
                 STATS_MODE(TO_NUMBER(a.state || a.county || a.trct || a.blkgrp)) AS geo_id,
                 1 AS geocoded
            FROM (
                  SELECT pop.riipl_id, MAX(a.obs_date) AS max_dt
                    FROM {population} pop
              INNER JOIN {address} a
                      ON pop.riipl_id = a.riipl_id AND
                         a.obs_date <= pop.initial_dt
                GROUP BY pop.riipl_id
                 ) pop
      INNER JOIN {address} a
              ON pop.riipl_id = a.riipl_id AND
                 pop.max_dt = a.obs_date
        GROUP BY pop.riipl_id
          """.format(**globals())

    with Connection() as cxn:
        values = pd.read_sql(sql, cxn._connection)

    values = values.merge(income, how="left", on="GEO_ID")\
                   .merge(fpl, how="left", on="GEO_ID")

    features = features.join(values.set_index(index))
    del features["GEO_ID"]

    # Mean imputation
    features["GEOCODED"] = features.GEOCODED.fillna(0)
    features["BLKGRP_MEDIANINCOME"] = features.BLKGRP_MEDIANINCOME.fillna(features.BLKGRP_MEDIANINCOME.mean())
    features["BLKGRP_BELOWFPL"] = features.BLKGRP_BELOWFPL.fillna(features.BLKGRP_BELOWFPL.mean())

    labels = {
        "GEOCODED": "Has a geocoded home address",
        "BLKGRP_MEDIANINCOME": "Block group's median annual household income (2015 dollars)",
        "BLKGRP_BELOWFPL": "Share of block group below poverty line"
    }

    SaveFeatures(features, out, manifest, population, labels)
def main():
    icd9 = pd.concat([
        pd.read_csv(icd9_file1).rename(columns={"DIAGNOSIS CODE": "DIAG_CDE",
                                                "LONG DESCRIPTION": "DESC"}),
        pd.read_csv(icd9_file2).rename(columns={"diag_cde": "DIAG_CDE",
                                                "description": "DESC"})
    ], ignore_index=True).drop_duplicates("DIAG_CDE").set_index("DIAG_CDE")

    proc = pd.read_csv(proc_file, sep="|", index_col="PROC_CDE").rename(columns={"DESCRIPTION": "DESC"})

    # Remove non-word characters and convert to lowercase
    icd9["DESC"] = icd9.DESC.str.lower().str.replace("[-/]", " ").str.replace("[^a-z ]", "")
    proc["DESC"] = proc.DESC.str.lower().str.replace("[-/]", " ").str.replace("[^a-z ]", "")

    # Split word lists into a row per word
    icd9 = pd.DataFrame(icd9.DESC.str.split().tolist(), index=icd9.index).stack()
    icd9 = icd9.reset_index()[["DIAG_CDE", 0]].rename(columns={0: "WORD"})
    proc = pd.DataFrame(proc.DESC.str.split().tolist(), index=proc.index).stack()
    proc = proc.reset_index()[["PROC_CDE", 0]].rename(columns={0: "WORD"})

    # Remove stopwords
    icd9 = icd9[~icd9.WORD.isin(stopwords)]
    proc = proc[~proc.WORD.isin(stopwords)]

    # Generate ids for words
    words = icd9.WORD.append(proc.WORD, ignore_index=True)
    words = words[words != ""]
    words = words.value_counts().reset_index().rename(columns={"index": "WORD", "WORD": "N"})
    words["WORD_ID"] = np.arange(len(words))
    words[["WORD_ID", "WORD", "N"]].to_csv(words_file, index=False)
    print("distinct words:", len(words))

    icd9 = icd9.merge(words[["WORD", "WORD_ID"]], how="inner", on="WORD")
    proc = proc.merge(words[["WORD", "WORD_ID"]], how="inner", on="WORD")

    sql = """
          SELECT pop.riipl_id, mc.diag_cde, COUNT(*) AS n
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dim_date} dd
              ON lb.yrmo = dd.yrmo
      INNER JOIN {medicaid_diag_cde} mc
              ON pop.riipl_id = mc.riipl_id AND
                 dd.date_dt = mc.claim_dt
        GROUP BY pop.riipl_id, mc.diag_cde
          """.format(**globals())

    with Connection() as cxn:
        diags = pd.read_sql(sql, cxn._connection)
    diags = diags.merge(icd9[["DIAG_CDE", "WORD_ID"]], how="inner", on="DIAG_CDE")

    sql = """
          SELECT pop.riipl_id, mc.proc_cde, COUNT(*) AS n
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dim_date} dd
              ON lb.yrmo = dd.yrmo
      INNER JOIN {medicaid_proc_cde} mc
              ON pop.riipl_id = mc.riipl_id AND
                 dd.date_dt = mc.claim_dt
        GROUP BY pop.riipl_id, mc.proc_cde
          """.format(**globals())

    with Connection() as cxn:
        procs = pd.read_sql(sql, cxn._connection)
    procs = procs.merge(proc[["PROC_CDE", "WORD_ID"]], how="inner", on="PROC_CDE")

    columns = ["RIIPL_ID", "WORD_ID", "N"]
    values = diags[columns].append(procs[columns], ignore_index=True)
    values = values.groupby(["RIIPL_ID", "WORD_ID"]).sum().reset_index()

    pop = CachePopulation(population, "RIIPL_ID")
    pop["ROW_ID"] = np.arange(len(pop))
    values = values.merge(pop, on="RIIPL_ID")[["ROW_ID", "WORD_ID", "N"]]
    print("total occurrences:", values.N.sum())

    # MM uses 1-based indexing
    values["ROW_ID"] += 1
    values["WORD_ID"] += 1

    with open(counts_file, "w") as f:
        print("%%MatrixMarket matrix coordinate integer general", file=f)
        print(len(pop), len(words), len(values), file=f)
    values.to_csv(counts_file, sep=" ", mode="a", header=False, index=False)
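# For reference, the counts file written above is in MatrixMarket (MM)
# coordinate format; a tiny hypothetical example with 3 people, 2 words, and
# 2 nonzero counts (indices are 1-based) looks like:
#
#   %%MatrixMarket matrix coordinate integer general
#   3 2 2
#   1 1 4
#   3 2 1
#
# i.e. person 1 uses word 1 four times, and person 3 uses word 2 once.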
import pandas as pd
import os, sys, time

from riipl import Connection

ccs_file, cci_file, ccs_table, cci_table = sys.argv[1:]

ccs = pd.read_csv(ccs_file, usecols=[0, 1], quotechar="'", skiprows=3,
                  names=["DIAG_CDE", "DISEASE"])
ccs["DIAG_CDE"] = ccs.DIAG_CDE.str.strip()

cci = pd.read_csv(cci_file, usecols=[0, 2], quotechar="'", skiprows=2,
                  names=["DIAG_CDE", "CHRONIC"])
cci = cci[cci.CHRONIC == 1]
cci["DIAG_CDE"] = cci.DIAG_CDE.str.strip()

with Connection() as cxn:
    cxn.read_dataframe(ccs, ccs_table)
    cxn.save_table(ccs_table, "DIAG_CDE", checksum=False)
    cxn.read_dataframe(cci, cci_table)
    cxn.save_table(cci_table, "DIAG_CDE", checksum=False)

# vim: expandtab sw=4 ts=4
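# Illustration of the quotechar="'" parsing above on a hypothetical two-line
# excerpt: the reference files quote fields with single quotes and pad codes
# with trailing spaces, hence the str.strip() calls.
import io
import pandas as pd

sample = "'0010 ','1'\n'0011 ','0'\n"
df = pd.read_csv(io.StringIO(sample), quotechar="'", names=["DIAG_CDE", "CHRONIC"])
df["DIAG_CDE"] = df.DIAG_CDE.str.strip()
print(df[df.CHRONIC == 1])  # keeps only the chronic code '0010'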
def main():
    ccs = pd.read_csv(ccs_file, usecols=[0, 1], quotechar="'", skiprows=3,
                      names=["DIAG_CDE", "DISEASE"])
    ccs["DIAG_CDE"] = ccs.DIAG_CDE.str.strip()

    cci = pd.read_csv(cci_file, usecols=[0, 2], quotechar="'", skiprows=2,
                      names=["DIAG_CDE", "CHRONIC"])
    cci = cci[cci.CHRONIC == 1]
    cci["DIAG_CDE"] = cci.DIAG_CDE.str.strip()

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT DISTINCT pop.riipl_id, d.diag_cde
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dim_date} dd
              ON lb.yrmo = dd.yrmo
      INNER JOIN {diag_cde} d
              ON pop.riipl_id = d.riipl_id AND
                 dd.date_dt = d.claim_dt
          """.format(**globals())

    with Connection() as cxn:
        diags = pd.read_sql(sql, cxn._connection)

    ndisease = diags.merge(ccs, how="inner", on="DIAG_CDE")[index + ["DISEASE"]]\
                    .groupby(index)\
                    .agg({"DISEASE": pd.Series.nunique})\
                    .rename(columns={"DISEASE": "MEDICAID_DISEASE_SCORE"})
    features = features.join(ndisease)

    nchronic = diags.merge(cci, how="inner", on="DIAG_CDE")[index + ["CHRONIC"]]\
                    .groupby(index)\
                    .sum()\
                    .rename(columns={"CHRONIC": "MEDICAID_CHRONIC_SCORE"})
    features = features.join(nchronic)

    sql = """
          SELECT pop.riipl_id, COUNT(DISTINCT p.proc_cde) AS medicaid_procedures
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dim_date} dd
              ON lb.yrmo = dd.yrmo
      INNER JOIN {proc_cde} p
              ON pop.riipl_id = p.riipl_id AND
                 dd.date_dt = p.claim_dt
        GROUP BY pop.riipl_id
          """.format(**globals())

    with Connection() as cxn:
        nproc = pd.read_sql(sql, cxn._connection, index_col=index)

    features = features.join(nproc).fillna(0)

    # Add squared terms
    for x in ["MEDICAID_DISEASE_SCORE", "MEDICAID_CHRONIC_SCORE", "MEDICAID_PROCEDURES"]:
        features["{}_SQ".format(x)] = features[x] * features[x]

    labels = {
        "MEDICAID_DISEASE_SCORE": "Number of AHRQ CCS diseases",
        "MEDICAID_CHRONIC_SCORE": "Number of AHRQ CCI chronic conditions",
        "MEDICAID_PROCEDURES": "Number of distinct procedures",
        "MEDICAID_DISEASE_SCORE_SQ": "Squared number of AHRQ CCS diseases",
        "MEDICAID_CHRONIC_SCORE_SQ": "Squared number of AHRQ CCI chronic conditions",
        "MEDICAID_PROCEDURES_SQ": "Squared number of distinct procedures"
    }

    SaveFeatures(features, out, manifest, population, labels)
def main():
    """
    Demographic variables from the MEGA table.
    """
    sql = """
          SELECT pop.riipl_id,
                 CASE WHEN MAX(m.age) >= 18 AND MAX(m.age) < 24 THEN 1 ELSE 0 END AS age_1,
                 CASE WHEN MAX(m.age) >= 24 AND MAX(m.age) < 30 THEN 1 ELSE 0 END AS age_2,
                 CASE WHEN MAX(m.age) >= 30 AND MAX(m.age) < 40 THEN 1 ELSE 0 END AS age_3,
                 CASE WHEN MAX(m.age) >= 40 AND MAX(m.age) < 55 THEN 1 ELSE 0 END AS age_4,
                 CASE WHEN MAX(m.age) >= 55 AND MAX(m.age) < 65 THEN 1 ELSE 0 END AS age_5,
                 CASE WHEN MAX(m.age) >= 65 THEN 1 ELSE 0 END AS age_6,
                 CASE WHEN AVG(m.bmi) < 18.5 THEN 1 ELSE 0 END AS bmi_under,
                 CASE WHEN AVG(m.bmi) >= 25 AND AVG(m.bmi) < 30 THEN 1 ELSE 0 END AS bmi_over,
                 CASE WHEN AVG(m.bmi) >= 30 THEN 1 ELSE 0 END AS bmi_obese,
                 CASE WHEN COUNT(m.bmi) = 0 THEN 1 ELSE 0 END AS bmi_missing,
                 MAX(CASE WHEN m.marital_status = 1 THEN 1 ELSE 0 END) AS married,
                 CASE WHEN COUNT(m.marital_status) = 0 THEN 1 ELSE 0 END AS married_missing,
                 MAX(CASE WHEN m.race = 'Black' THEN 1 ELSE 0 END) AS race_black,
                 MAX(CASE WHEN m.race = 'Hispanic' THEN 1 ELSE 0 END) AS race_hispanic,
                 MAX(CASE WHEN m.race = 'Asian' THEN 1
                          WHEN m.race = 'Native American' THEN 1
                          WHEN m.race = 'Other' THEN 1
                          ELSE 0 END) AS race_other,
                 CASE WHEN COUNT(m.race) = 0 THEN 1 ELSE 0 END AS race_missing
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {mega} m
              ON pop.riipl_id = m.riipl_id AND
                 lb.yrmo = m.month
        GROUP BY pop.riipl_id
        ORDER BY pop.riipl_id
          """.format(**globals())

    with Connection() as cxn:
        features = pd.read_sql(sql, cxn._connection, index_col="RIIPL_ID")

    labels = {
        "AGE_1": "Age is 18-23",
        "AGE_2": "Age is 24-29",
        "AGE_3": "Age is 30-39",
        "AGE_4": "Age is 40-54",
        "AGE_5": "Age is 55-64",
        "AGE_6": "Age is 65+",
        "BMI_UNDER": "Body mass index is underweight",
        "BMI_OVER": "Body mass index is overweight",
        "BMI_OBESE": "Body mass index is obese",
        "BMI_MISSING": "Body mass index is missing",
        "MARRIED": "Married",
        "MARRIED_MISSING": "Marital status is missing",
        "RACE_BLACK": "Race is African American",
        "RACE_HISPANIC": "Ethnicity is Hispanic",
        "RACE_OTHER": "Race is Asian, Native American, or other",
        "RACE_MISSING": "Race is missing"
    }

    SaveFeatures(features, out, manifest, population, labels, bool_features=list(labels))
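# A rough pandas equivalent of the age-bucket CASE expressions above, on
# hypothetical ages; the bin edges follow the same [18,24), [24,30), [30,40),
# [40,55), [55,65), [65,+) ranges (200 stands in for an open upper bound).
import pandas as pd

age = pd.Series([22, 35, 70], index=pd.Index([1, 2, 3], name="RIIPL_ID"))
buckets = pd.cut(age, bins=[18, 24, 30, 40, 55, 65, 200], right=False,
                 labels=["AGE_1", "AGE_2", "AGE_3", "AGE_4", "AGE_5", "AGE_6"])
print(pd.get_dummies(buckets))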