def main(): """ Quarterly unemployment rate in state of Rhode Island taken from Bureau of Labor and Statistics """ index = ["RIIPL_ID"] unemp = pd.read_csv(unemp_file) unemp["YRMO"] = unemp.YRMO.astype(str) sql = """ SELECT pop.riipl_id, lb.yrmo FROM {population} pop LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id WHERE lb.timestep = {nsteps} - 1 ORDER BY pop.riipl_id """.format(**globals()) with Connection() as cxn: features = pd.read_sql(sql, cxn._connection) features = features.merge(unemp, how="left", on="YRMO")[["RIIPL_ID", "UNEMP_RATE"]] features = features.set_index("RIIPL_ID").rename( columns={"UNEMP_RATE": "UNEMP_RI"}) labels = {"UNEMP_RI": "Rhode Island monthly unemployment rate"} SaveFeatures(features, out, manifest, population, labels)
def main(): """ Indicators for DOC sentences by crime type """ sql = """ SELECT DISTINCT offense_type AS offense FROM {doc_sentences} WHERE offense_type IS NOT NULL ORDER BY offense """.format(**globals()) with cx_Oracle.connect("/") as cxn: offenses = pd.read_sql(sql, cxn) print(offenses) pivot = ",".join([ "'{0}' AS doc_sentenced_{0}".format(offense) for offense in offenses["OFFENSE"] ]) index = ["RIIPL_ID"] features = CachePopulation(population, index).set_index(index) sql = """ SELECT * FROM ( SELECT pop.riipl_id, ds.offense_type AS offense, 1 AS sentenced FROM {population} pop LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id LEFT JOIN {dim_date} dd ON lb.yrmo = dd.yrmo INNER JOIN {doc_sentences} ds ON pop.riipl_id = ds.riipl_id AND dd.date_dt = ds.imposed_date GROUP BY pop.riipl_id, ds.offense_type ) PIVOT (MAX(sentenced) FOR offense IN ({pivot})) """.format(pivot=pivot, **globals()) with cx_Oracle.connect("/") as cxn: features = features.join(pd.read_sql(sql, cxn).set_index(index)) # Drop offense types that are missing for the entire population. features = features.dropna(axis=1, how="all").fillna(0).astype(int) labels = {} for offense in offenses["OFFENSE"]: label = "sentenced for {} during lookback period".format(offense) labels["DOC_SENTENCED_{}".format(offense.upper())] = label SaveFeatures(features, out, manifest, population, labels, bool_features=list(labels))
def main(): """ Indicator for UI exhaustion """ index = ["RIIPL_ID"] features = CachePopulation(population, index).set_index(index) sql = """ SELECT DISTINCT pop.riipl_id, 1 AS ui_exhaustion FROM {population} pop LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id LEFT JOIN {dim_date} dd ON lb.yrmo = dd.yrmo INNER JOIN {dlt_ui_payments} du ON pop.riipl_id = du.riipl_id AND dd.date_dt = du.trans_date WHERE void_check_flag IS NULL AND delete_flag IS NULL AND opening_credits > 0 AND closing_credits <= 0 """.format(**globals()) with Connection() as cxn: features = features.join( pd.read_sql(sql, cxn._connection, index_col=index)).fillna(0) labels = {"UI_EXHAUSTION": "Exhausted UI benefits"} SaveFeatures(features, out, manifest, population, labels)
def main(): """ """ index = ["RIIPL_ID"] features = CachePopulation(population, index).set_index(index) sql = """ SELECT pop.riipl_id, SUM(cc.car_crash) AS car_crashes, SUM(cc.injured) AS car_crash_injuries FROM {population} pop LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id INNER JOIN {car_crashes} cc ON pop.riipl_id = cc.riipl_id AND cc.yrmo = lb.yrmo GROUP BY pop.riipl_id """.format(**globals()) with Connection() as cxn: features = features.join( pd.read_sql(sql, cxn._connection).set_index(index)) features = features.fillna(0) labels = { "CAR_CRASHES": "count of car crashes involved in during lookback period", "CAR_CRASH_INJURIES": "count of car crash injuries during lookback period", } SaveFeatures(features, out, manifest, population, labels)
def main(): sql = """ SELECT pop.riipl_id, age, race, sex, marital_status, bmi FROM {population} pop LEFT JOIN {dim_date} dd ON pop.initial_dt = dd.date_dt LEFT JOIN {mega} mega ON pop.riipl_id = mega.riipl_id AND dd.yrmo = mega.month ORDER BY pop.riipl_id """.format(**globals()) with cx_Oracle.connect("/") as cxn: features = pd.read_sql(sql, cxn, index_col="RIIPL_ID") labels = { "AGE": "Age at first prescription", "RACE": "Race (Asian, Black, Hispanic, Native American, Other, White)", "SEX": "Sex at first prescription", "MARITAL_STATUS": "Marital status at first prescription", "BMI": "Body Mass Index at first prescription" } SaveFeatures(features, outfile, manifest, population, labels)
def main(): """ Home addresses from MEGA table """ index = ["RIIPL_ID"] features = CachePopulation(population, index).set_index(index) sql = """ SELECT DISTINCT pop.riipl_id, address_trct FROM {population} pop LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id INNER JOIN {mega} m ON pop.riipl_id = m.riipl_id AND lb.yrmo = m.month WHERE m.address_trct IS NOT NULL """.format(**globals()) with cx_Oracle.connect("/") as cxn: values = pd.read_sql(sql, cxn).set_index(index) labels = {} for trct in values.ADDRESS_TRCT.unique(): name = "TRCT_{}".format(trct) feature = values[values["ADDRESS_TRCT"] == trct] feature[name] = 1 features = features.join(feature[name]) labels[name] = "lived in Census tract {} during lookback period".format(trct) features = features.fillna(0) SaveFeatures(features, out, manifest, population, labels, bool_features=list(labels))
def main(): """ """ index = ["RIIPL_ID"] features = CachePopulation(population, index).set_index(index) sql = """ SELECT pop.riipl_id, SUM(citations) AS citations, SUM(fines) AS citation_fines FROM {population} pop LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id LEFT JOIN {citations} c ON pop.riipl_id = c.riipl_id AND c.yrmo = lb.yrmo GROUP BY pop.riipl_id """.format(**globals()) with Connection() as cxn: features = features.join( pd.read_sql(sql, cxn._connection).set_index(index)) features = features.fillna(0) labels = { "CITATIONS": "Number of police citations", "CITATION_FINES": "Total fines from police citations" } SaveFeatures(features, out, manifest, population, labels)
def main(): """ Indicators for commited, released, or charged from DOC events """ index = ["RIIPL_ID"] features = CachePopulation(population, index).set_index(index) sql = """ SELECT pop.riipl_id, MAX(CASE WHEN de.event_type = 'Committed' THEN 1 ELSE 0 END) AS doc_commited, MAX(CASE WHEN de.event_type = 'Released' THEN 1 ELSE 0 END) AS doc_released, MAX(CASE WHEN de.event_type = 'Charged' THEN 1 ELSE 0 END) AS doc_charged FROM {population} pop LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id LEFT JOIN {dim_date} dd ON lb.yrmo = dd.yrmo LEFT JOIN {doc_events} de ON pop.riipl_id = de.riipl_id AND dd.date_dt = de.event_date GROUP BY pop.riipl_id """.format(**globals()) with cx_Oracle.connect("/") as cxn: features = features.join(pd.read_sql(sql, cxn).set_index(index)) labels = { "DOC_COMMITED" : "Committed to a corrections facility", "DOC_RELEASED" : "Released from a corrections facility", "DOC_CHARGED" : "Charged with a crime" } SaveFeatures(features, out, manifest, population, labels)
def main(): index = ["RIIPL_ID"] features = CachePopulation(population, index).set_index(index) # Claims payments sql = """ SELECT pop.riipl_id, SUM(mc.pay_amt) AS medicaid_payments, SUM(CASE WHEN mc.src_id IN (4, 8, 16, 17, 26, 27) THEN mc.pay_amt ELSE 0 END) AS medicaid_prof_payments, SUM(CASE WHEN mc.ed_flag = 1 THEN mc.pay_amt ELSE 0 END) AS medicaid_ed_payments FROM {population} pop LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id LEFT JOIN {dim_date} dd ON lb.yrmo = dd.yrmo INNER JOIN {medicaid_claims} mc ON pop.riipl_id = mc.riipl_id AND dd.date_dt = mc.claim_dt GROUP BY pop.riipl_id """.format(**globals()) with cx_Oracle.connect("/") as cxn: features = features.join(pd.read_sql(sql, cxn).set_index(index)) # Pharmacy payments sql = """ SELECT pop.riipl_id, SUM(mp.pay_amt) AS medicaid_rx_payments FROM {population} pop LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id LEFT JOIN {dim_date} dd ON lb.yrmo = dd.yrmo INNER JOIN {medicaid_pharmacy} mp ON pop.riipl_id = mp.riipl_id AND dd.date_dt = mp.dispensed_dt GROUP BY pop.riipl_id """.format(**globals()) with cx_Oracle.connect("/") as cxn: features = features.join(pd.read_sql(sql, cxn).set_index(index)) # Payments are comprehensive, so a missing value means $0 features = features.fillna(0) labels = { "MEDICAID_PAYMENTS" : "Total Medicaid payments", "MEDICAID_PROF_PAYMENTS" : "Total Medicaid payments to professionals", "MEDICAID_ED_PAYMENTS" : "Total Medicaid payments to Emergency Departments", "MEDICAID_RX_PAYMENTS" : "Total Medicaid pharmacy payments" } SaveFeatures(features, out, manifest, population, labels)
def main(): """ Nationwide unemployment rate by NAICS code at a yearly level from Bureau of Labor and Statistics """ index = ["RIIPL_ID"] unemp = pd.read_csv(unemp_file) unemp["NAICS"] = unemp.NAICS.astype(str) # Calculate average unemployment by year unemp_avg = unemp.groupby("YR").mean().reset_index() print(unemp_avg) # Lookup all wage records in the lookback period, and order by # wages, to select the NAICS for the highest wage. sql = """ SELECT DISTINCT pop.riipl_id, TO_CHAR(pop.initial_dt, 'YYYY') AS yr, dw.wages, SUBSTR(dw.naics4, 1, 2) AS naics FROM {population} pop LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id LEFT JOIN {dlt_wage} dw ON pop.riipl_id = dw.riipl_id AND lb.yyq = dw.yyq ORDER BY pop.riipl_id, dw.wages """.format(**globals()) with Connection() as cxn: features = pd.read_sql(sql, cxn._connection).drop_duplicates( index, keep="last").set_index(index) # Use the previous year for joining the unemployment rate features["YR"] = features.YR.astype(int) - 1 # Join NAICS unemployment, using avg unemployment when no NAICS is available naics = features.merge(unemp, how="left", on=["YR", "NAICS"]).NAICS_UNEMP_RATE avg = features.merge(unemp_avg, how="left", on="YR").NAICS_UNEMP_RATE features["UNEMP_NAICS"] = np.where(naics.notnull(), naics, avg) labels = { "UNEMP_NAICS": "unemployment rate by NAICS code at national level in year before index date" } SaveFeatures(features[["UNEMP_NAICS"]], out, manifest, population, labels)
def main(): icd9 = pd.concat([ pd.read_csv(icd9_file1, usecols=[0, 1], names=["DIAG_CDE", "DESC"]), pd.read_csv(icd9_file2) ]).drop_duplicates("DIAG_CDE").set_index("DIAG_CDE") index = ["RIIPL_ID"] features = CachePopulation(population, index).set_index(index) sql = """ SELECT pop.riipl_id, mdc.diag_cde, COUNT(*) AS n FROM {population} pop LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id LEFT JOIN {dim_date} dd ON lb.yrmo = dd.yrmo INNER JOIN {medicaid_diag_cde} mdc ON pop.riipl_id = mdc.riipl_id AND dd.date_dt = mdc.claim_dt INNER JOIN {diag_corr} dc ON mdc.diag_cde = dc.diag_cde AND ABS(dc.corr) > 0 GROUP BY pop.riipl_id, mdc.diag_cde """.format(**globals()) with Connection() as cxn: values = pd.read_sql(sql, cxn._connection) cdes = values.DIAG_CDE.unique() labels = {} for cde in cdes: feature = values.loc[values.DIAG_CDE == cde, "N"] feature.name = "DIAG_{}".format(cde) features = features.join(feature) try: desc = icd9.loc[cde, "DESC"] except KeyError: desc = "unknown description" label = "number of diagnoses for '{}' during lookback period".format( desc) labels[feature.name] = label features = features.fillna(0) SaveFeatures(features, out, manifest, population, labels)
def main(): procs = pd.read_csv(procsfile, sep="|", index_col="PROC_CDE") index = ["RIIPL_ID"] features = CachePopulation(population, index).set_index(index) sql = """ SELECT pop.riipl_id, mpc.proc_cde, COUNT(*) AS n FROM {population} pop LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id LEFT JOIN {dim_date} dd ON lb.yrmo = dd.yrmo INNER JOIN {medicaid_proc_cde} mpc ON pop.riipl_id = mpc.riipl_id AND dd.date_dt = mpc.claim_dt INNER JOIN {proc_corr} pc ON mpc.proc_cde = pc.proc_cde AND ABS(pc.corr) > 0 GROUP BY pop.riipl_id, mpc.proc_cde """.format(**globals()) with Connection() as cxn: values = pd.read_sql(sql, cxn._connection) cdes = values.PROC_CDE.unique() labels = {} for cde in cdes: feature = values.loc[values.PROC_CDE == cde, "N"] feature.name = "PROC_{}".format(cde) features = features.join(feature) try: desc = procs.loc[cde, "DESCRIPTION"] except KeyError: desc = "unknown procedure" label = "number of procedures for '{}' during lookback period".format( desc) labels[feature.name] = label features = features.fillna(0) SaveFeatures(features, out, manifest, population, labels)
def main(): """ Opioid prescriptions and adverse outcomes among DHS household """ index = ["RIIPL_ID"] features = CachePopulation(population, index).set_index(index) sql = """ SELECT pop.riipl_id, SUM(ABS(no.opioid)) AS dhs_hh_opioid_rx, SUM(no.recovery) AS dhs_hh_recovery_rx, COUNT(o.outcome_any_dt) AS dhs_hh_outcomes FROM {population} pop LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id LEFT JOIN {dim_date} dd ON lb.yrmo = dd.yrmo LEFT JOIN {household} hh ON pop.riipl_id = hh.riipl_id LEFT JOIN {medicaid_pharmacy} mp ON hh.riipl_id_hh = mp.riipl_id AND dd.date_dt = mp.dispensed_dt LEFT JOIN {ndc_opioids} no ON mp.ndc9_code = no.ndc LEFT JOIN {outcomes} o ON hh.riipl_id_hh = o.riipl_id AND dd.date_dt = o.outcome_any_dt GROUP BY pop.riipl_id """.format(**globals()) with Connection() as cxn: features = features.join( pd.read_sql(sql, cxn._connection).set_index(index)) features = features.fillna(0) labels = { "DHS_HH_OPIOID_RX": "Number of opioid prescriptions in household", "DHS_HH_RECOVERY_RX": "Number of recovery prescriptions in household", "DHS_HH_OUTCOMES": "Number of adverse outcomes in household" } SaveFeatures(features, out, manifest, population, labels)
def main(): index = ["RIIPL_ID"] features = CachePopulation(population, index).set_index(index) sql = """ SELECT pop.riipl_id, MAX(CASE WHEN rd.gender = 'M' THEN 1 ELSE 0 END) AS sex_m, MAX(CASE WHEN rd.primary_lang_cd = '01' THEN 1 ELSE 0 END) AS lang_spanish, MAX(CASE WHEN rd.primary_lang_cd = '31' THEN 1 ELSE 0 END) AS lang_portu, MAX(CASE WHEN rd.primary_lang_cd <> '00' AND rd.primary_lang_cd <> '01' AND rd.primary_lang_cd <> '31' THEN 1 ELSE 0 END) AS lang_other FROM {population} pop LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id LEFT JOIN {medicaid_enrollment} me ON pop.riipl_id = me.riipl_id AND lb.yrmo = me.yrmo LEFT JOIN {recip_demo} rd ON me.re_unique_id = rd.recipient_id GROUP BY pop.riipl_id """.format(**globals()) with cx_Oracle.connect("/") as cxn: features = features.join(pd.read_sql(sql, cxn).set_index(index)) features = features.fillna(0) labels = { "SEX_M": "Sex is male", "LANG_SPANISH": "Primary language is Spanish", "LANG_PORTU": "Primary language is Portuguese", "LANG_OTHER": "Primary language is not English, Spanish, or Portuguese", } SaveFeatures(features, out, manifest, population, labels, bool_features=list(labels))
def main(): """ Social program payments from MEGA table """ index = ["RIIPL_ID"] features = CachePopulation(population, index).set_index(index) sql = """ SELECT pop.riipl_id, SUM(m.snap_payments) AS snap_payments, SUM(m.tanf_payments) AS tanf_payments, SUM(m.tdi_payments) AS tdi_payments, SUM(m.ui_payments) AS ui_payments, SUM(m.ssi_supplement) AS ssi_supplement, SUM(m.gpa_payments) AS gpa_payments, SUM(m.ccap_payments) AS ccap_payments FROM {population} pop LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id INNER JOIN {mega} m ON pop.riipl_id = m.riipl_id AND lb.yrmo = m.month GROUP BY pop.riipl_id """.format(**globals()) with cx_Oracle.connect("/") as cxn: features = features.join(pd.read_sql(sql, cxn).set_index(index)) # Payments are comprehensive, so a missing value means $0 features = features.fillna(0) labels = { "SNAP_PAYMENTS" : "Total Food Stamps payments", "TANF_PAYMENTS" : "Total TANF payments", "TDI_PAYMENTS" : "Total TDI payments", "UI_PAYMENTS" : "Total Unemployment Insurance payments", "SSI_SUPPLEMENT" : "Total SSI supplement", "GPA_PAYMENTS" : "Total GPA payments", "CCAP_PAYMENTS" : "Total CCAP payments" } SaveFeatures(features, out, manifest, population, labels)
def main(): index = ["RIIPL_ID"] sql = """ SELECT pop.riipl_id, pop.injection FROM {population} pop """.format(**globals()) with cx_Oracle.connect("/") as cxn: features = pd.read_sql(sql, cxn, index_col=index) labels = {"INJECTION": "Initial exposure was an opioid injection"} SaveFeatures(features, out, manifest, population, labels, bool_features=list(labels))
def main(): """ Size of DHS household and indicator for births in the DHS household from MEGA table """ index = ["RIIPL_ID"] features = CachePopulation(population, index).set_index(index) sql = """ SELECT pop.riipl_id, COUNT(DISTINCT m2.riipl_id) AS dhs_hh_size, MAX(CASE WHEN m1.age > 1 AND m2.age < 1 THEN 1 ELSE 0 END) AS dhs_hh_birth FROM {population} pop LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id INNER JOIN {mega} m1 ON pop.riipl_id = m1.riipl_id AND lb.yrmo = m1.month INNER JOIN {mega} m2 ON m1.dhs_hh_id = m2.dhs_hh_id AND m1.month = m2.month GROUP BY pop.riipl_id """.format(**globals()) with Connection() as cxn: features = features.join(pd.read_sql(sql, cxn._connection).set_index(index)) features = features.fillna(0) labels = { "DHS_HH_SIZE" : "Size of household", "DHS_HH_BIRTH" : "New birth in household" } SaveFeatures(features, out, manifest, population, labels, bool_features=["DHS_HH_BIRTH"])
def main(): """ Demographics variables from MEGA table """ sql = """ SELECT pop.riipl_id, CASE WHEN MAX(m.age) >= 18 AND MAX(m.age) < 24 THEN 1 ELSE 0 END AS age_1, CASE WHEN MAX(m.age) >= 24 AND MAX(m.age) < 30 THEN 1 ELSE 0 END AS age_2, CASE WHEN MAX(m.age) >= 30 AND MAX(m.age) < 40 THEN 1 ELSE 0 END AS age_3, CASE WHEN MAX(m.age) >= 40 AND MAX(m.age) < 55 THEN 1 ELSE 0 END AS age_4, CASE WHEN MAX(m.age) >= 55 AND MAX(m.age) < 65 THEN 1 ELSE 0 END AS age_5, CASE WHEN MAX(m.age) >= 65 THEN 1 ELSE 0 END AS age_6, CASE WHEN AVG(m.bmi) < 18.5 THEN 1 ELSE 0 END AS bmi_under, CASE WHEN AVG(m.bmi) >= 25 AND AVG(m.bmi) < 30 THEN 1 ELSE 0 END AS bmi_over, CASE WHEN AVG(m.bmi) >= 30 THEN 1 ELSE 0 END AS bmi_obese, CASE WHEN COUNT(m.bmi) = 0 THEN 1 ELSE 0 END AS bmi_missing, MAX(CASE WHEN(m.marital_status) = 1 THEN 1 ELSE 0 END) AS married, CASE WHEN COUNT(m.marital_status) = 0 THEN 1 ELSE 0 END AS married_missing, MAX(CASE WHEN m.race = 'Black' THEN 1 ELSE 0 END) AS race_black, MAX(CASE WHEN m.race = 'Hispanic' THEN 1 ELSE 0 END) AS race_hispanic, MAX(CASE WHEN m.race = 'Asian' THEN 1 WHEN m.race = 'Native American' THEN 1 WHEN m.race = 'Other' THEN 1 ELSE 0 END) AS race_other, CASE WHEN COUNT(m.race) = 0 THEN 1 ELSE 0 END AS race_missing FROM {population} pop LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id LEFT JOIN {mega} m ON pop.riipl_id = m.riipl_id AND lb.yrmo = m.month GROUP BY pop.riipl_id ORDER BY pop.riipl_id """.format(**globals()) with Connection() as cxn: features = pd.read_sql(sql, cxn._connection, index_col="RIIPL_ID") labels = { "AGE_1": "Age is 18-23", "AGE_2": "Age is 24-29", "AGE_3": "Age is 30-39", "AGE_4": "Age is 40-54", "AGE_5": "Age is 55-64", "AGE_6": "Age is 65+", "BMI_UNDER": "Body mass index is underweight", "BMI_OVER": "Body mass index is overweight", "BMI_OBESE": "Body mass index is obese", "BMI_MISSING": "Body mass index is missing", "MARRIED": "Married", "MARRIED_MISSING": "Marital status is missing", "RACE_BLACK": "Race is African American", "RACE_HISPANIC": "Ethnicity is Hispanic", "RACE_OTHER": "Race is Asian, Native American, or other", "RACE_MISSING": "Race is missing" } SaveFeatures(features, out, manifest, population, labels, bool_features=list(labels))
def main(): index = ["RIIPL_ID"] features = CachePopulation(population, index).set_index(index) sql = """ SELECT pop.riipl_id, COUNT(DISTINCT me.re_unique_id) AS medicaid_n_ids, MAX(CASE WHEN UPPER(dim.description) LIKE '%PREG%' THEN 1 ELSE 0 END) AS medicaid_pregnant, MAX(CASE WHEN UPPER(dim.description) LIKE '%PARTUM%' THEN 1 ELSE 0 END) AS medicaid_postpartum, MAX(CASE WHEN UPPER(dim.description) LIKE '%BLIND%' THEN 1 ELSE 0 END) AS medicaid_blind, MAX(CASE WHEN UPPER(dim.description) LIKE '%DISABLED%' THEN 1 ELSE 0 END) AS medicaid_disabled, MAX(CASE WHEN UPPER(dim.description) LIKE '%ALIEN%' THEN 1 ELSE 0 END) AS medicaid_alien, MAX(CASE WHEN UPPER(dim.description) LIKE '%CHILD%' THEN 1 ELSE 0 END) AS medicaid_child, MAX(CASE WHEN dim.needy_ind = 'Medically' THEN 1 ELSE 0 END) AS medicaid_med_needy, MAX(NVL(me.managed_care, 0)) AS medicaid_managed_care, MAX(NVL(me.premium_payment, 0)) AS medicaid_prem_payment, MAX(CASE WHEN rxn.payer_cd = 'B' THEN 1 ELSE 0 END) AS medicaid_payer_bhddh, MAX(CASE WHEN rxn.payer_cd = 'C' THEN 1 ELSE 0 END) AS medicaid_payer_cmap, MAX(CASE WHEN rxn.payer_cd = 'D' THEN 1 ELSE 0 END) AS medicaid_payer_doc, MAX(CASE WHEN rxn.payer_cd = 'H' THEN 1 ELSE 0 END) AS medicaid_payer_aids, MAX(CASE WHEN rxn.payer_cd = 'O' THEN 1 ELSE 0 END) AS medicaid_payer_ors, MAX(CASE WHEN rxn.payer_cd = 'R' THEN 1 ELSE 0 END) AS medicaid_payer_ripae FROM {population} pop LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id LEFT JOIN {medicaid_enrollment} me ON pop.riipl_id = me.riipl_id AND lb.yrmo = me.yrmo LEFT JOIN {dim_aid_ctg_cde} dim ON me.aid_category = dim.aid_ctg_cde LEFT JOIN {recip_x_ssn} rxn ON me.re_unique_id = rxn.recipient_id GROUP BY pop.riipl_id """.format(**globals()) with cx_Oracle.connect("/") as cxn: features = features.join(pd.read_sql(sql, cxn).set_index(index)) features = features.fillna(0) labels = { "MEDICAID_N_IDS": "Number of unique Medicaid IDs", "MEDICAID_PREGNANT": "Medicaid eligibility category contains the word 'pregnant'", "MEDICAID_POSTPARTUM": "Medicaid eligibility category contains the word 'postpartum'", "MEDICAID_BLIND": "Medicaid eligibility category contains the word 'blind'", "MEDICAID_DISABLED": "Medicaid eligibility category contains the word 'disabled'", "MEDICAID_ALIEN": "Medicaid eligibility category contains the word 'alien'", "MEDICAID_CHILD": "Medicaid eligibility category contains the word 'child'", "MEDICAID_MED_NEEDY": "Eligible for Medicaid as medically needy", "MEDICAID_MANAGED_CARE": "Enrolled in Medicaid managed care", "MEDICAID_PREM_PAYMENT": "Enrolled in Medicaid premium payment plan", "MEDICAID_PAYER_BHDDH": "Medicaid payer code is BHDDH", "MEDICAID_PAYER_CMAP": "Enrolled in RI Community Medication Assistance Program", "MEDICAID_PAYER_DOC": "Medicaid payer code is Department of Corrections", "MEDICAID_PAYER_AIDS": "Medicaid payer code is Ryan White AIDS Drug Assistance Program", "MEDICAID_PAYER_ORS": "Medicaid payer code is ORS", "MEDICAID_PAYER_RIPAE": "Enrolled in RI Pharmaceutical Assistance to the Elderly" } SaveFeatures(features, out, manifest, population, labels, bool_features=[f for f in labels if f != "MEDICAID_N_IDS"])
def main(): index = ["RIIPL_ID"] pop = CachePopulationSubsets(population, index) pop["ROW_ID"] = np.arange(len(pop)) outcomes = pd.read_csv(outcomes_file) words = pd.read_csv(words_file, index_col="WORD_ID") # Load counts and convert to CSR sparse matrix for efficient row slicing counts = mmread(counts_file).tocsr() # Further divide training data into training and validation sets for # selecting the optimal number of topics training = (pop["SUBSET"] == "TRAINING") np.random.seed(seed) subset = np.random.choice([True, False], len(training), p=[0.25, 0.75]) validation = (training & subset) training = (training & ~subset) print(training.sum(), "training") print(validation.sum(), "validation") # Create training and validation outcomes y_train = outcomes.loc[training, "OUTCOME_ANY"].values y_validate = outcomes.loc[validation, "OUTCOME_ANY"].values print(y_train.sum(), "training outcomes") print(y_validate.sum(), "validation outcomes") # Transform raw counts to TF-IDF using IDF from the training set training = np.where(training)[0] validation = np.where(validation)[0] counts_train = counts[training, :] tfidf = TfidfTransformer() tfidf.fit(counts_train) counts = tfidf.transform(counts) counts_train = counts[training, :] counts_validate = counts[validation, :] # Select NMF model with best AUC performance on validation data best = 0 best_auc = 0 nmfs = [] for i, n in enumerate(ntopics): print(n, "topics:") nmf = NMF(n, random_state=seed).fit(counts_train) nmfs.append(nmf) X_train = pd.DataFrame(nmf.transform(counts_train)) X_train["intercept"] = 1 logit = Logit(y_train, X_train).fit(maxiter=1000, method="cg") print(logit.summary()) X_validate = pd.DataFrame(nmf.transform(counts_validate)) X_validate["intercept"] = 1 y_pred = logit.predict(X_validate) auc = roc_auc_score(y_validate, y_pred) print("AUC:", auc) if (auc - best_auc) > delta: best = i best_auc = auc else: break print("selected", ntopics[best], "topics") # Turn best NMF topics into features features = pd.DataFrame(nmfs[best].transform(counts)) features.columns = [ "MEDICAID_TOPIC_{}".format(i) for i in range(ntopics[best]) ] features["RIIPL_ID"] = pop["RIIPL_ID"] features = features.set_index("RIIPL_ID") # Use the top 10 words in a topic as its description top10words = [ " ".join(words.loc[i, "WORD"] for i in topic.argsort()[-11:-1]) for topic in nmfs[best].components_ ] descs = [ "Topic {} ({})".format(i, words) for i, words in enumerate(top10words) ] labels = dict(zip(features.columns, descs)) SaveFeatures(features, out, manifest, population, labels)
def main(): """ Industry worked in (NAICS code) from DLT wages """ sql = """ SELECT DISTINCT SUBSTR(naics4, 0, 2) AS naics2 FROM {dlt_wage} ORDER BY naics2 """.format(**globals()) with cx_Oracle.connect("/") as cxn: naics = pd.read_sql(sql, cxn) pivot = ",".join(["'{0}' AS naics_{0}".format(n) for n in naics["NAICS2"]]) index = ["RIIPL_ID"] features = CachePopulation(population, index).set_index(index) sql = """ SELECT * FROM ( SELECT pop.riipl_id, SUBSTR(dw.naics4, 0, 2) AS naics2, 1 AS worked FROM {population} pop LEFT JOIN ( SELECT DISTINCT riipl_id, yyq FROM {lookback} ) lb ON pop.riipl_id = lb.riipl_id INNER JOIN {dlt_wage} dw ON pop.riipl_id = dw.riipl_id AND lb.yyq = dw.yyq GROUP BY pop.riipl_id, SUBSTR(dw.naics4, 0, 2) ) PIVOT (MAX(worked) FOR naics2 IN ({pivot})) """.format(pivot=pivot, **globals()) with cx_Oracle.connect("/") as cxn: features = features.join(pd.read_sql(sql, cxn).set_index(index)) # Drop NAICS categories that are missing for the entire population. features = features.dropna(axis=1, how="all").fillna(0).astype(int) # Missing indicator not needed - provided by WAGES_MISSING labels = {} naics2_labels = { "00": "Unknown", "02": "Unknown", "11": "Agriculture, Forestry, Fishing, and Hunting", "21": "Mining, Quarrying, and Oil and Gas Extraction", "22": "Utilities", "23": "Construction", "24": "Unknown", "31": "Manufacturing", "32": "Manufacturing", "33": "Manufacturing", "42": "Wholesale Trade", "44": "Retail Trade", "45": "Retail Trade", "48": "Transportation and Warehousing", "49": "Transportation and Warehousing", "50": "Unknown", "51": "Information", "52": "Finance and Insurance", "53": "Real Estate and Rental and Leasing", "54": "Professional, Scientific, and Technical Services", "55": "Management of Companies and Enterprises", "56": "Administrative and Support and Waste Management and Remediation Services", "61": "Educational Services", "62": "Health Care and Social Assistance", "71": "Arts, Entertainment, and Recreation", "72": "Accommodation and Food Services", "81": "Other Services (except Public Administration)", "84": "Unknown", "92": "Public Administration", "94": "Unknown", "99": "Unknown" } for n, sector in naics2_labels.items(): label = "worked in {} sector (NAICS {}) during lookback period".format( sector, n) labels["NAICS_{}".format(n)] = label SaveFeatures(features, out, manifest, (population, index), labels, bool_features=list(labels))
def main(): """ This code generates the following features based on someone's home census block group: * Median income * Share below the federal povery line """ index = ["RIIPL_ID"] features = CachePopulation(population, index).set_index(index) income = pd.read_csv( income_file, skiprows=1, na_values=["-", "**"], usecols=[ "Id2", "Estimate; Median household income in the past 12 months (in 2015 Inflation-adjusted dollars)" ]) income.columns = ["GEO_ID", "BLKGRP_MEDIANINCOME"] fpl = pd.read_csv( fpl_file, skiprows=1, na_values=["-", "**"], usecols=[ "Id2", "Estimate; Total:", "Estimate; Income in the past 12 months below poverty level:" ]) fpl.columns = ["GEO_ID", "TOTAL", "FPL"] fpl["BLKGRP_BELOWFPL"] = fpl.FPL / fpl.TOTAL fpl = fpl[["GEO_ID", "BLKGRP_BELOWFPL"]] sql = """ SELECT pop.riipl_id, STATS_MODE(TO_NUMBER(a.state || a.county || a.trct || a.blkgrp)) AS geo_id, 1 AS geocoded FROM ( SELECT pop.riipl_id, MAX(a.obs_date) AS max_dt FROM {population} pop INNER JOIN {address} a ON pop.riipl_id = a.riipl_id AND a.obs_date <= pop.initial_dt GROUP BY pop.riipl_id ) pop INNER JOIN {address} a ON pop.riipl_id = a.riipl_id AND pop.max_dt = a.obs_date GROUP BY pop.riipl_id """.format(**globals()) with Connection() as cxn: values = pd.read_sql(sql, cxn._connection) values = values.merge(income, how="left", on="GEO_ID")\ .merge(fpl, how="left", on="GEO_ID") features = features.join(values.set_index(index)) del features["GEO_ID"] # Mean imputation features["GEOCODED"] = features.GEOCODED.fillna(0) features["BLKGRP_MEDIANINCOME"] = features.BLKGRP_MEDIANINCOME.fillna( features.BLKGRP_MEDIANINCOME.mean()) features["BLKGRP_BELOWFPL"] = features.BLKGRP_BELOWFPL.fillna( features.BLKGRP_BELOWFPL.mean()) labels = { "GEOCODED": "Has a geocoded home address", "BLKGRP_MEDIANINCOME": "Block group's median annual household income (2015 dollars)", "BLKGRP_BELOWFPL": "Share of block group below poverty line" } SaveFeatures(features, out, manifest, population, labels)
def main(): ccs = pd.read_csv(ccs_file, usecols=[0, 1], quotechar="'", skiprows=3, names=["DIAG_CDE", "DISEASE"]) ccs["DIAG_CDE"] = ccs.DIAG_CDE.str.strip() cci = pd.read_csv(cci_file, usecols=[0, 2], quotechar="'", skiprows=2, names=["DIAG_CDE", "CHRONIC"]) cci = cci[cci.CHRONIC == 1] cci["DIAG_CDE"] = cci.DIAG_CDE.str.strip() index = ["RIIPL_ID"] features = CachePopulation(population, index).set_index(index) sql = """ SELECT DISTINCT pop.riipl_id, d.diag_cde FROM {population} pop LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id LEFT JOIN {dim_date} dd ON lb.yrmo = dd.yrmo INNER JOIN {diag_cde} d ON pop.riipl_id = d.riipl_id AND dd.date_dt = d.claim_dt """.format(**globals()) with Connection() as cxn: diags = pd.read_sql(sql, cxn._connection) ndisease = diags.merge(ccs, how="inner", on="DIAG_CDE")[index+["DISEASE"]]\ .groupby(index)\ .agg({"DISEASE": pd.Series.nunique})\ .rename(columns={"DISEASE": "MEDICAID_DISEASE_SCORE"}) features = features.join(ndisease) nchronic = diags.merge(cci, how="inner", on="DIAG_CDE")[index+["CHRONIC"]]\ .groupby(index)\ .sum()\ .rename(columns={"CHRONIC": "MEDICAID_CHRONIC_SCORE"}) features = features.join(nchronic) sql = """ SELECT pop.riipl_id, COUNT(DISTINCT p.proc_cde) AS medicaid_procedures FROM {population} pop LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id LEFT JOIN {dim_date} dd ON lb.yrmo = dd.yrmo INNER JOIN {proc_cde} p ON pop.riipl_id = p.riipl_id AND dd.date_dt = p.claim_dt GROUP BY pop.riipl_id """.format(**globals()) with Connection() as cxn: nproc = pd.read_sql(sql, cxn._connection, index_col=index) features = features.join(nproc).fillna(0) # Add squared terms for x in [ "MEDICAID_DISEASE_SCORE", "MEDICAID_CHRONIC_SCORE", "MEDICAID_PROCEDURES" ]: features["{}_SQ".format(x)] = features[x] * features[x] labels = { "MEDICAID_DISEASE_SCORE": "Number of AHRQ CCS diseases", "MEDICAID_CHRONIC_SCORE": "Number of AHRQ CCI chronic conditions", "MEDICAID_PROCEDURES": "Number of distinct procedures", "MEDICAID_DISEASE_SCORE_SQ": "Squared number of AHRQ CCS diseases", "MEDICAID_CHRONIC_SCORE_SQ": "Squared number of AHRQ CCI chronic conditions", "MEDICAID_PROCEDURES_SQ": "Squared number of distinct procedures" } SaveFeatures(features, out, manifest, population, labels)
index = ["RIIPL_ID"] # Load features features = reduce(lambda x, y: x.join(y), [ pd.read_csv(feature_file, index_col=index) for feature_file in feature_files ]) # Load manifest manifest = pd.read_csv(manifest_file, sep="\t", names=["var", "desc"], index_col="var")["desc"].to_dict() # Create pairwise interactions interactions = pd.DataFrame(index=features.index) labels = {} for var1 in set1: for var2 in set2: varx = "{}_X_{}".format(var1, var2) interactions[varx] = features[var1] * features[var2] labels[varx] = "'{}' X '{}'".format(manifest[var1], manifest[var2]) SaveFeatures(interactions, out, out_manifest, population, labels) # vim: expandtab sw=4 ts=4