def main():
    """
    Indicators for committed, released, or charged from DOC events
    """
    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
    SELECT pop.riipl_id,
           MAX(CASE WHEN de.event_type = 'Committed' THEN 1 ELSE 0 END) AS doc_committed,
           MAX(CASE WHEN de.event_type = 'Released'  THEN 1 ELSE 0 END) AS doc_released,
           MAX(CASE WHEN de.event_type = 'Charged'   THEN 1 ELSE 0 END) AS doc_charged
    FROM {population} pop
    LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id
    LEFT JOIN {dim_date} dd ON lb.yrmo = dd.yrmo
    LEFT JOIN {doc_events} de ON pop.riipl_id = de.riipl_id AND
                                 dd.date_dt = de.event_date
    GROUP BY pop.riipl_id
    """.format(**globals())

    with cx_Oracle.connect("/") as cxn:
        features = features.join(pd.read_sql(sql, cxn).set_index(index))

    labels = {
        "DOC_COMMITTED": "Committed to a corrections facility",
        "DOC_RELEASED": "Released from a corrections facility",
        "DOC_CHARGED": "Charged with a crime"
    }

    SaveFeatures(features, out, manifest, population, labels)
def main():
    """
    Indicators for DOC sentences by crime type
    """
    sql = """
    SELECT DISTINCT offense_type AS offense
    FROM {doc_sentences}
    WHERE offense_type IS NOT NULL
    ORDER BY offense
    """.format(**globals())

    with cx_Oracle.connect("/") as cxn:
        offenses = pd.read_sql(sql, cxn)

    print(offenses)

    pivot = ",".join(["'{0}' AS doc_sentenced_{0}".format(offense)
                      for offense in offenses["OFFENSE"]])

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
    SELECT * FROM (
        SELECT pop.riipl_id,
               ds.offense_type AS offense,
               1 AS sentenced
        FROM {population} pop
        LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id
        LEFT JOIN {dim_date} dd ON lb.yrmo = dd.yrmo
        INNER JOIN {doc_sentences} ds ON pop.riipl_id = ds.riipl_id AND
                                         dd.date_dt = ds.imposed_date
        GROUP BY pop.riipl_id, ds.offense_type
    )
    PIVOT (MAX(sentenced) FOR offense IN ({pivot}))
    """.format(pivot=pivot, **globals())

    with cx_Oracle.connect("/") as cxn:
        features = features.join(pd.read_sql(sql, cxn).set_index(index))

    # Drop offense types that are missing for the entire population.
    features = features.dropna(axis=1, how="all").fillna(0).astype(int)

    labels = {}
    for offense in offenses["OFFENSE"]:
        label = "sentenced for {} during lookback period".format(offense)
        labels["DOC_SENTENCED_{}".format(offense.upper())] = label

    SaveFeatures(features, out, manifest, population, labels,
                 bool_features=list(labels))
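# Sketch (illustration only, not part of the production pipeline): the Oracle
# PIVOT above turns the long (riipl_id, offense, sentenced) result into one
# DOC_SENTENCED_* indicator column per offense type. A rough pandas equivalent
# on made-up rows:

import pandas as pd

_long = pd.DataFrame({"RIIPL_ID": [1, 1, 2],
                      "OFFENSE": ["ASSAULT", "THEFT", "THEFT"],
                      "SENTENCED": 1})
_wide = (_long.pivot_table(index="RIIPL_ID", columns="OFFENSE",
                           values="SENTENCED", aggfunc="max")
              .add_prefix("DOC_SENTENCED_"))
# _wide has columns DOC_SENTENCED_ASSAULT and DOC_SENTENCED_THEFT, with NaN
# where a person was never sentenced for that offense type.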
def main():
    """
    Indicator for UI exhaustion
    """
    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
    SELECT DISTINCT pop.riipl_id,
           1 AS ui_exhaustion
    FROM {population} pop
    LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id
    LEFT JOIN {dim_date} dd ON lb.yrmo = dd.yrmo
    INNER JOIN {dlt_ui_payments} du ON pop.riipl_id = du.riipl_id AND
                                       dd.date_dt = du.trans_date
    WHERE void_check_flag IS NULL AND
          delete_flag IS NULL AND
          opening_credits > 0 AND
          closing_credits <= 0
    """.format(**globals())

    with Connection() as cxn:
        features = features.join(
            pd.read_sql(sql, cxn._connection, index_col=index)).fillna(0)

    labels = {"UI_EXHAUSTION": "Exhausted UI benefits"}

    SaveFeatures(features, out, manifest, population, labels)
def main():
    """
    Home addresses from MEGA table
    """
    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
    SELECT DISTINCT pop.riipl_id,
           address_trct
    FROM {population} pop
    LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id
    INNER JOIN {mega} m ON pop.riipl_id = m.riipl_id AND
                           lb.yrmo = m.month
    WHERE m.address_trct IS NOT NULL
    """.format(**globals())

    with cx_Oracle.connect("/") as cxn:
        values = pd.read_sql(sql, cxn).set_index(index)

    # Build one indicator column per Census tract seen in the population.
    labels = {}
    for trct in values.ADDRESS_TRCT.unique():
        name = "TRCT_{}".format(trct)
        feature = values[values["ADDRESS_TRCT"] == trct].copy()
        feature[name] = 1
        features = features.join(feature[name])
        labels[name] = "lived in Census tract {} during lookback period".format(trct)

    features = features.fillna(0)

    SaveFeatures(features, out, manifest, population, labels,
                 bool_features=list(labels))
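# Alternative sketch (assumes the same `values` frame as above, indexed by
# RIIPL_ID): the per-tract loop can also be written with pandas get_dummies,
# collapsing to one row per person with an indicator for every tract seen.

import pandas as pd

def tract_indicators(values):
    """Return a 0/1 frame with one TRCT_* column per Census tract."""
    dummies = pd.get_dummies(values["ADDRESS_TRCT"], prefix="TRCT")
    # Collapse multiple addresses per person to a single indicator row.
    return dummies.groupby(level=0).max()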
def main():
    """
    Car crashes and car crash injuries during the lookback period
    """
    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
    SELECT pop.riipl_id,
           SUM(cc.car_crash) AS car_crashes,
           SUM(cc.injured) AS car_crash_injuries
    FROM {population} pop
    LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id
    INNER JOIN {car_crashes} cc ON pop.riipl_id = cc.riipl_id AND
                                   cc.yrmo = lb.yrmo
    GROUP BY pop.riipl_id
    """.format(**globals())

    with Connection() as cxn:
        features = features.join(
            pd.read_sql(sql, cxn._connection).set_index(index))

    features = features.fillna(0)

    labels = {
        "CAR_CRASHES": "count of car crashes involved in during lookback period",
        "CAR_CRASH_INJURIES": "count of car crash injuries during lookback period",
    }

    SaveFeatures(features, out, manifest, population, labels)
def main():
    """
    Police citations and fines during the lookback period
    """
    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
    SELECT pop.riipl_id,
           SUM(citations) AS citations,
           SUM(fines) AS citation_fines
    FROM {population} pop
    LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id
    LEFT JOIN {citations} c ON pop.riipl_id = c.riipl_id AND
                               c.yrmo = lb.yrmo
    GROUP BY pop.riipl_id
    """.format(**globals())

    with Connection() as cxn:
        features = features.join(
            pd.read_sql(sql, cxn._connection).set_index(index))

    features = features.fillna(0)

    labels = {
        "CITATIONS": "Number of police citations",
        "CITATION_FINES": "Total fines from police citations"
    }

    SaveFeatures(features, out, manifest, population, labels)
def main():
    """
    Medicaid claims and pharmacy payments
    """
    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    # Claims payments
    sql = """
    SELECT pop.riipl_id,
           SUM(mc.pay_amt) AS medicaid_payments,
           SUM(CASE WHEN mc.src_id IN (4, 8, 16, 17, 26, 27)
                    THEN mc.pay_amt ELSE 0 END) AS medicaid_prof_payments,
           SUM(CASE WHEN mc.ed_flag = 1
                    THEN mc.pay_amt ELSE 0 END) AS medicaid_ed_payments
    FROM {population} pop
    LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id
    LEFT JOIN {dim_date} dd ON lb.yrmo = dd.yrmo
    INNER JOIN {medicaid_claims} mc ON pop.riipl_id = mc.riipl_id AND
                                       dd.date_dt = mc.claim_dt
    GROUP BY pop.riipl_id
    """.format(**globals())

    with cx_Oracle.connect("/") as cxn:
        features = features.join(pd.read_sql(sql, cxn).set_index(index))

    # Pharmacy payments
    sql = """
    SELECT pop.riipl_id,
           SUM(mp.pay_amt) AS medicaid_rx_payments
    FROM {population} pop
    LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id
    LEFT JOIN {dim_date} dd ON lb.yrmo = dd.yrmo
    INNER JOIN {medicaid_pharmacy} mp ON pop.riipl_id = mp.riipl_id AND
                                         dd.date_dt = mp.dispensed_dt
    GROUP BY pop.riipl_id
    """.format(**globals())

    with cx_Oracle.connect("/") as cxn:
        features = features.join(pd.read_sql(sql, cxn).set_index(index))

    # Payments are comprehensive, so a missing value means $0
    features = features.fillna(0)

    labels = {
        "MEDICAID_PAYMENTS": "Total Medicaid payments",
        "MEDICAID_PROF_PAYMENTS": "Total Medicaid payments to professionals",
        "MEDICAID_ED_PAYMENTS": "Total Medicaid payments to Emergency Departments",
        "MEDICAID_RX_PAYMENTS": "Total Medicaid pharmacy payments"
    }

    SaveFeatures(features, out, manifest, population, labels)
def main():
    """
    Counts of Medicaid diagnosis codes during the lookback period
    """
    icd9 = pd.concat([
        pd.read_csv(icd9_file1, usecols=[0, 1], names=["DIAG_CDE", "DESC"]),
        pd.read_csv(icd9_file2)
    ]).drop_duplicates("DIAG_CDE").set_index("DIAG_CDE")

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
    SELECT pop.riipl_id,
           mdc.diag_cde,
           COUNT(*) AS n
    FROM {population} pop
    LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id
    LEFT JOIN {dim_date} dd ON lb.yrmo = dd.yrmo
    INNER JOIN {medicaid_diag_cde} mdc ON pop.riipl_id = mdc.riipl_id AND
                                          dd.date_dt = mdc.claim_dt
    INNER JOIN {diag_corr} dc ON mdc.diag_cde = dc.diag_cde AND
                                 ABS(dc.corr) > 0
    GROUP BY pop.riipl_id, mdc.diag_cde
    """.format(**globals())

    with Connection() as cxn:
        values = pd.read_sql(sql, cxn._connection)

    cdes = values.DIAG_CDE.unique()
    labels = {}

    for cde in cdes:
        # Index by RIIPL_ID so the join below aligns on person, not row number.
        feature = values.loc[values.DIAG_CDE == cde].set_index(index)["N"]
        feature.name = "DIAG_{}".format(cde)
        features = features.join(feature)
        try:
            desc = icd9.loc[cde, "DESC"]
        except KeyError:
            desc = "unknown description"
        label = "number of diagnoses for '{}' during lookback period".format(desc)
        labels[feature.name] = label

    features = features.fillna(0)

    SaveFeatures(features, out, manifest, population, labels)
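# Alternative sketch (assumes the same `values` frame as above): the per-code
# loop can be replaced by a single pivot that builds every DIAG_* column at
# once; the same idea applies to the procedure-code loop in the next script.

import pandas as pd

def diag_count_features(values):
    """Return one column per diagnosis code with claim counts per person."""
    return (values.pivot_table(index="RIIPL_ID", columns="DIAG_CDE",
                               values="N", aggfunc="sum")
                  .add_prefix("DIAG_"))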
def main():
    """
    Counts of Medicaid procedure codes during the lookback period
    """
    procs = pd.read_csv(procsfile, sep="|", index_col="PROC_CDE")

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
    SELECT pop.riipl_id,
           mpc.proc_cde,
           COUNT(*) AS n
    FROM {population} pop
    LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id
    LEFT JOIN {dim_date} dd ON lb.yrmo = dd.yrmo
    INNER JOIN {medicaid_proc_cde} mpc ON pop.riipl_id = mpc.riipl_id AND
                                          dd.date_dt = mpc.claim_dt
    INNER JOIN {proc_corr} pc ON mpc.proc_cde = pc.proc_cde AND
                                 ABS(pc.corr) > 0
    GROUP BY pop.riipl_id, mpc.proc_cde
    """.format(**globals())

    with Connection() as cxn:
        values = pd.read_sql(sql, cxn._connection)

    cdes = values.PROC_CDE.unique()
    labels = {}

    for cde in cdes:
        # Index by RIIPL_ID so the join below aligns on person, not row number.
        feature = values.loc[values.PROC_CDE == cde].set_index(index)["N"]
        feature.name = "PROC_{}".format(cde)
        features = features.join(feature)
        try:
            desc = procs.loc[cde, "DESCRIPTION"]
        except KeyError:
            desc = "unknown procedure"
        label = "number of procedures for '{}' during lookback period".format(desc)
        labels[feature.name] = label

    features = features.fillna(0)

    SaveFeatures(features, out, manifest, population, labels)
def main():
    """
    Opioid prescriptions and adverse outcomes in the DHS household
    """
    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
    SELECT pop.riipl_id,
           SUM(ABS(no.opioid)) AS dhs_hh_opioid_rx,
           SUM(no.recovery) AS dhs_hh_recovery_rx,
           COUNT(o.outcome_any_dt) AS dhs_hh_outcomes
    FROM {population} pop
    LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id
    LEFT JOIN {dim_date} dd ON lb.yrmo = dd.yrmo
    LEFT JOIN {household} hh ON pop.riipl_id = hh.riipl_id
    LEFT JOIN {medicaid_pharmacy} mp ON hh.riipl_id_hh = mp.riipl_id AND
                                        dd.date_dt = mp.dispensed_dt
    LEFT JOIN {ndc_opioids} no ON mp.ndc9_code = no.ndc
    LEFT JOIN {outcomes} o ON hh.riipl_id_hh = o.riipl_id AND
                              dd.date_dt = o.outcome_any_dt
    GROUP BY pop.riipl_id
    """.format(**globals())

    with Connection() as cxn:
        features = features.join(
            pd.read_sql(sql, cxn._connection).set_index(index))

    features = features.fillna(0)

    labels = {
        "DHS_HH_OPIOID_RX": "Number of opioid prescriptions in household",
        "DHS_HH_RECOVERY_RX": "Number of recovery prescriptions in household",
        "DHS_HH_OUTCOMES": "Number of adverse outcomes in household"
    }

    SaveFeatures(features, out, manifest, population, labels)
def main():
    """
    Social program payments from MEGA table
    """
    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
    SELECT pop.riipl_id,
           SUM(m.snap_payments) AS snap_payments,
           SUM(m.tanf_payments) AS tanf_payments,
           SUM(m.tdi_payments) AS tdi_payments,
           SUM(m.ui_payments) AS ui_payments,
           SUM(m.ssi_supplement) AS ssi_supplement,
           SUM(m.gpa_payments) AS gpa_payments,
           SUM(m.ccap_payments) AS ccap_payments
    FROM {population} pop
    LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id
    INNER JOIN {mega} m ON pop.riipl_id = m.riipl_id AND
                           lb.yrmo = m.month
    GROUP BY pop.riipl_id
    """.format(**globals())

    with cx_Oracle.connect("/") as cxn:
        features = features.join(pd.read_sql(sql, cxn).set_index(index))

    # Payments are comprehensive, so a missing value means $0
    features = features.fillna(0)

    labels = {
        "SNAP_PAYMENTS": "Total Food Stamps payments",
        "TANF_PAYMENTS": "Total TANF payments",
        "TDI_PAYMENTS": "Total TDI payments",
        "UI_PAYMENTS": "Total Unemployment Insurance payments",
        "SSI_SUPPLEMENT": "Total SSI supplement",
        "GPA_PAYMENTS": "Total GPA payments",
        "CCAP_PAYMENTS": "Total CCAP payments"
    }

    SaveFeatures(features, out, manifest, population, labels)
def main():
    """
    Sex and primary language from Medicaid recipient demographics
    """
    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
    SELECT pop.riipl_id,
           MAX(CASE WHEN rd.gender = 'M' THEN 1 ELSE 0 END) AS sex_m,
           MAX(CASE WHEN rd.primary_lang_cd = '01' THEN 1 ELSE 0 END) AS lang_spanish,
           MAX(CASE WHEN rd.primary_lang_cd = '31' THEN 1 ELSE 0 END) AS lang_portu,
           MAX(CASE WHEN rd.primary_lang_cd <> '00' AND
                         rd.primary_lang_cd <> '01' AND
                         rd.primary_lang_cd <> '31' THEN 1 ELSE 0 END) AS lang_other
    FROM {population} pop
    LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id
    LEFT JOIN {medicaid_enrollment} me ON pop.riipl_id = me.riipl_id AND
                                          lb.yrmo = me.yrmo
    LEFT JOIN {recip_demo} rd ON me.re_unique_id = rd.recipient_id
    GROUP BY pop.riipl_id
    """.format(**globals())

    with cx_Oracle.connect("/") as cxn:
        features = features.join(pd.read_sql(sql, cxn).set_index(index))

    features = features.fillna(0)

    labels = {
        "SEX_M": "Sex is male",
        "LANG_SPANISH": "Primary language is Spanish",
        "LANG_PORTU": "Primary language is Portuguese",
        "LANG_OTHER": "Primary language is not English, Spanish, or Portuguese",
    }

    SaveFeatures(features, out, manifest, population, labels,
                 bool_features=list(labels))
def main():
    """
    Size of DHS household and indicator for births in the DHS household,
    from MEGA table
    """
    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
    SELECT pop.riipl_id,
           COUNT(DISTINCT m2.riipl_id) AS dhs_hh_size,
           MAX(CASE WHEN m1.age > 1 AND m2.age < 1 THEN 1 ELSE 0 END) AS dhs_hh_birth
    FROM {population} pop
    LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id
    INNER JOIN {mega} m1 ON pop.riipl_id = m1.riipl_id AND
                            lb.yrmo = m1.month
    INNER JOIN {mega} m2 ON m1.dhs_hh_id = m2.dhs_hh_id AND
                            m1.month = m2.month
    GROUP BY pop.riipl_id
    """.format(**globals())

    with Connection() as cxn:
        features = features.join(pd.read_sql(sql, cxn._connection).set_index(index))

    features = features.fillna(0)

    labels = {
        "DHS_HH_SIZE": "Size of household",
        "DHS_HH_BIRTH": "New birth in household"
    }

    SaveFeatures(features, out, manifest, population, labels,
                 bool_features=["DHS_HH_BIRTH"])
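# Sketch (toy data, not part of the pipeline): the self-join of the MEGA table
# above counts everyone who shares a DHS household id in the same month, and
# flags a new birth when the person is older than 1 and some household member
# is under 1. A pandas version of the same logic:

import pandas as pd

mega_toy = pd.DataFrame({"RIIPL_ID": [1, 2, 3],
                         "DHS_HH_ID": [10, 10, 20],
                         "MONTH": ["201501"] * 3,
                         "AGE": [30, 0, 45]})
hh_toy = mega_toy.merge(mega_toy, on=["DHS_HH_ID", "MONTH"], suffixes=("", "_HH"))
hh_toy["BIRTH"] = ((hh_toy["AGE"] > 1) & (hh_toy["AGE_HH"] < 1)).astype(int)
hh_summary = hh_toy.groupby("RIIPL_ID").agg(
    DHS_HH_SIZE=("RIIPL_ID_HH", "nunique"),
    DHS_HH_BIRTH=("BIRTH", "max"))
# Person 1 gets DHS_HH_SIZE = 2 and DHS_HH_BIRTH = 1; person 3 lives alone.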
def main():
    """
    Medicaid enrollment characteristics
    """
    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
    SELECT pop.riipl_id,
           COUNT(DISTINCT me.re_unique_id) AS medicaid_n_ids,
           MAX(CASE WHEN UPPER(dim.description) LIKE '%PREG%'     THEN 1 ELSE 0 END) AS medicaid_pregnant,
           MAX(CASE WHEN UPPER(dim.description) LIKE '%PARTUM%'   THEN 1 ELSE 0 END) AS medicaid_postpartum,
           MAX(CASE WHEN UPPER(dim.description) LIKE '%BLIND%'    THEN 1 ELSE 0 END) AS medicaid_blind,
           MAX(CASE WHEN UPPER(dim.description) LIKE '%DISABLED%' THEN 1 ELSE 0 END) AS medicaid_disabled,
           MAX(CASE WHEN UPPER(dim.description) LIKE '%ALIEN%'    THEN 1 ELSE 0 END) AS medicaid_alien,
           MAX(CASE WHEN UPPER(dim.description) LIKE '%CHILD%'    THEN 1 ELSE 0 END) AS medicaid_child,
           MAX(CASE WHEN dim.needy_ind = 'Medically' THEN 1 ELSE 0 END) AS medicaid_med_needy,
           MAX(NVL(me.managed_care, 0)) AS medicaid_managed_care,
           MAX(NVL(me.premium_payment, 0)) AS medicaid_prem_payment,
           MAX(CASE WHEN rxn.payer_cd = 'B' THEN 1 ELSE 0 END) AS medicaid_payer_bhddh,
           MAX(CASE WHEN rxn.payer_cd = 'C' THEN 1 ELSE 0 END) AS medicaid_payer_cmap,
           MAX(CASE WHEN rxn.payer_cd = 'D' THEN 1 ELSE 0 END) AS medicaid_payer_doc,
           MAX(CASE WHEN rxn.payer_cd = 'H' THEN 1 ELSE 0 END) AS medicaid_payer_aids,
           MAX(CASE WHEN rxn.payer_cd = 'O' THEN 1 ELSE 0 END) AS medicaid_payer_ors,
           MAX(CASE WHEN rxn.payer_cd = 'R' THEN 1 ELSE 0 END) AS medicaid_payer_ripae
    FROM {population} pop
    LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id
    LEFT JOIN {medicaid_enrollment} me ON pop.riipl_id = me.riipl_id AND
                                          lb.yrmo = me.yrmo
    LEFT JOIN {dim_aid_ctg_cde} dim ON me.aid_category = dim.aid_ctg_cde
    LEFT JOIN {recip_x_ssn} rxn ON me.re_unique_id = rxn.recipient_id
    GROUP BY pop.riipl_id
    """.format(**globals())

    with cx_Oracle.connect("/") as cxn:
        features = features.join(pd.read_sql(sql, cxn).set_index(index))

    features = features.fillna(0)

    labels = {
        "MEDICAID_N_IDS": "Number of unique Medicaid IDs",
        "MEDICAID_PREGNANT": "Medicaid eligibility category contains the word 'pregnant'",
        "MEDICAID_POSTPARTUM": "Medicaid eligibility category contains the word 'postpartum'",
        "MEDICAID_BLIND": "Medicaid eligibility category contains the word 'blind'",
        "MEDICAID_DISABLED": "Medicaid eligibility category contains the word 'disabled'",
        "MEDICAID_ALIEN": "Medicaid eligibility category contains the word 'alien'",
        "MEDICAID_CHILD": "Medicaid eligibility category contains the word 'child'",
        "MEDICAID_MED_NEEDY": "Eligible for Medicaid as medically needy",
        "MEDICAID_MANAGED_CARE": "Enrolled in Medicaid managed care",
        "MEDICAID_PREM_PAYMENT": "Enrolled in Medicaid premium payment plan",
        "MEDICAID_PAYER_BHDDH": "Medicaid payer code is BHDDH",
        "MEDICAID_PAYER_CMAP": "Enrolled in RI Community Medication Assistance Program",
        "MEDICAID_PAYER_DOC": "Medicaid payer code is Department of Corrections",
        "MEDICAID_PAYER_AIDS": "Medicaid payer code is Ryan White AIDS Drug Assistance Program",
        "MEDICAID_PAYER_ORS": "Medicaid payer code is ORS",
        "MEDICAID_PAYER_RIPAE": "Enrolled in RI Pharmaceutical Assistance to the Elderly"
    }

    SaveFeatures(features, out, manifest, population, labels,
                 bool_features=[f for f in labels if f != "MEDICAID_N_IDS"])
def main():
    """
    Industry worked in (NAICS code) from DLT wages
    """
    sql = """
    SELECT DISTINCT SUBSTR(naics4, 0, 2) AS naics2
    FROM {dlt_wage}
    ORDER BY naics2
    """.format(**globals())

    with cx_Oracle.connect("/") as cxn:
        naics = pd.read_sql(sql, cxn)

    pivot = ",".join(["'{0}' AS naics_{0}".format(n) for n in naics["NAICS2"]])

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
    SELECT * FROM (
        SELECT pop.riipl_id,
               SUBSTR(dw.naics4, 0, 2) AS naics2,
               1 AS worked
        FROM {population} pop
        LEFT JOIN (
            SELECT DISTINCT riipl_id, yyq FROM {lookback}
        ) lb ON pop.riipl_id = lb.riipl_id
        INNER JOIN {dlt_wage} dw ON pop.riipl_id = dw.riipl_id AND
                                    lb.yyq = dw.yyq
        GROUP BY pop.riipl_id, SUBSTR(dw.naics4, 0, 2)
    )
    PIVOT (MAX(worked) FOR naics2 IN ({pivot}))
    """.format(pivot=pivot, **globals())

    with cx_Oracle.connect("/") as cxn:
        features = features.join(pd.read_sql(sql, cxn).set_index(index))

    # Drop NAICS categories that are missing for the entire population.
    features = features.dropna(axis=1, how="all").fillna(0).astype(int)

    # Missing indicator not needed - provided by WAGES_MISSING
    labels = {}
    naics2_labels = {
        "00": "Unknown",
        "02": "Unknown",
        "11": "Agriculture, Forestry, Fishing, and Hunting",
        "21": "Mining, Quarrying, and Oil and Gas Extraction",
        "22": "Utilities",
        "23": "Construction",
        "24": "Unknown",
        "31": "Manufacturing",
        "32": "Manufacturing",
        "33": "Manufacturing",
        "42": "Wholesale Trade",
        "44": "Retail Trade",
        "45": "Retail Trade",
        "48": "Transportation and Warehousing",
        "49": "Transportation and Warehousing",
        "50": "Unknown",
        "51": "Information",
        "52": "Finance and Insurance",
        "53": "Real Estate and Rental and Leasing",
        "54": "Professional, Scientific, and Technical Services",
        "55": "Management of Companies and Enterprises",
        "56": "Administrative and Support and Waste Management and Remediation Services",
        "61": "Educational Services",
        "62": "Health Care and Social Assistance",
        "71": "Arts, Entertainment, and Recreation",
        "72": "Accommodation and Food Services",
        "81": "Other Services (except Public Administration)",
        "84": "Unknown",
        "92": "Public Administration",
        "94": "Unknown",
        "99": "Unknown"
    }
    for n, sector in naics2_labels.items():
        label = "worked in {} sector (NAICS {}) during lookback period".format(sector, n)
        labels["NAICS_{}".format(n)] = label

    SaveFeatures(features, out, manifest, population, labels,
                 bool_features=list(labels))
def main():
    """
    Bag-of-words counts from Medicaid diagnosis and procedure code descriptions
    """
    icd9 = pd.concat([
        pd.read_csv(icd9_file1).rename(columns={"DIAGNOSIS CODE": "DIAG_CDE",
                                                "LONG DESCRIPTION": "DESC"}),
        pd.read_csv(icd9_file2).rename(columns={"diag_cde": "DIAG_CDE",
                                                "description": "DESC"})
    ], ignore_index=True).drop_duplicates("DIAG_CDE").set_index("DIAG_CDE")

    proc = pd.read_csv(proc_file, sep="|", index_col="PROC_CDE")\
             .rename(columns={"DESCRIPTION": "DESC"})

    # Remove non-word characters and convert to lowercase
    icd9["DESC"] = icd9.DESC.str.lower().str.replace("[-/]", " ").str.replace("[^a-z ]", "")
    proc["DESC"] = proc.DESC.str.lower().str.replace("[-/]", " ").str.replace("[^a-z ]", "")

    # Split word lists into a row per word
    icd9 = pd.DataFrame(icd9.DESC.str.split().tolist(), index=icd9.index).stack()
    icd9 = icd9.reset_index()[["DIAG_CDE", 0]].rename(columns={0: "WORD"})
    proc = pd.DataFrame(proc.DESC.str.split().tolist(), index=proc.index).stack()
    proc = proc.reset_index()[["PROC_CDE", 0]].rename(columns={0: "WORD"})

    # Remove stopwords
    icd9 = icd9[~icd9.WORD.isin(stopwords)]
    proc = proc[~proc.WORD.isin(stopwords)]

    # Generate ids for words
    words = icd9.WORD.append(proc.WORD, ignore_index=True)
    words = words[words != ""]
    words = words.value_counts().reset_index().rename(columns={"index": "WORD", "WORD": "N"})
    words["WORD_ID"] = np.arange(len(words))
    words[["WORD_ID", "WORD", "N"]].to_csv(words_file, index=False)
    print("distinct words:", len(words))

    icd9 = icd9.merge(words[["WORD", "WORD_ID"]], how="inner", on="WORD")
    proc = proc.merge(words[["WORD", "WORD_ID"]], how="inner", on="WORD")

    sql = """
    SELECT pop.riipl_id,
           mc.diag_cde,
           COUNT(*) AS n
    FROM {population} pop
    LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id
    LEFT JOIN {dim_date} dd ON lb.yrmo = dd.yrmo
    INNER JOIN {medicaid_diag_cde} mc ON pop.riipl_id = mc.riipl_id AND
                                         dd.date_dt = mc.claim_dt
    GROUP BY pop.riipl_id, mc.diag_cde
    """.format(**globals())

    with Connection() as cxn:
        diags = pd.read_sql(sql, cxn._connection)

    diags = diags.merge(icd9[["DIAG_CDE", "WORD_ID"]], how="inner", on="DIAG_CDE")

    sql = """
    SELECT pop.riipl_id,
           mc.proc_cde,
           COUNT(*) AS n
    FROM {population} pop
    LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id
    LEFT JOIN {dim_date} dd ON lb.yrmo = dd.yrmo
    INNER JOIN {medicaid_proc_cde} mc ON pop.riipl_id = mc.riipl_id AND
                                         dd.date_dt = mc.claim_dt
    GROUP BY pop.riipl_id, mc.proc_cde
    """.format(**globals())

    with Connection() as cxn:
        procs = pd.read_sql(sql, cxn._connection)

    procs = procs.merge(proc[["PROC_CDE", "WORD_ID"]], how="inner", on="PROC_CDE")

    columns = ["RIIPL_ID", "WORD_ID", "N"]
    values = diags[columns].append(procs[columns], ignore_index=True)
    values = values.groupby(["RIIPL_ID", "WORD_ID"]).sum().reset_index()

    pop = CachePopulation(population, "RIIPL_ID")
    pop["ROW_ID"] = np.arange(len(pop))

    values = values.merge(pop, on="RIIPL_ID")[["ROW_ID", "WORD_ID", "N"]]
    print("total occurrences:", values.N.sum())

    # MM uses 1-based indexing
    values["ROW_ID"] += 1
    values["WORD_ID"] += 1

    with open(counts_file, "w") as f:
        print("%%MatrixMarket matrix coordinate integer general", file=f)
        print(len(pop), len(words), len(values), file=f)

    values.to_csv(counts_file, sep=" ", mode="a", header=False, index=False)
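# Sketch (not part of the pipeline): the file written above is a standard
# MatrixMarket coordinate file, so it can be read back as a sparse
# person-by-word count matrix, for example with SciPy; `counts_file` is the
# path defined by this module.

from scipy.io import mmread

def load_word_counts(counts_file):
    """Load the bag-of-words counts as a scipy.sparse CSR matrix."""
    counts = mmread(counts_file).tocsr()  # rows = population members, cols = words
    return counts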
def main():
    """
    This code generates the following features based on someone's home
    census block group:

      * Median income
      * Share below the federal poverty line
    """
    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    income = pd.read_csv(income_file, skiprows=1, na_values=["-", "**"],
                         usecols=[
                             "Id2",
                             "Estimate; Median household income in the past 12 months (in 2015 Inflation-adjusted dollars)"
                         ])
    income.columns = ["GEO_ID", "BLKGRP_MEDIANINCOME"]

    fpl = pd.read_csv(fpl_file, skiprows=1, na_values=["-", "**"],
                      usecols=[
                          "Id2",
                          "Estimate; Total:",
                          "Estimate; Income in the past 12 months below poverty level:"
                      ])
    fpl.columns = ["GEO_ID", "TOTAL", "FPL"]
    fpl["BLKGRP_BELOWFPL"] = fpl.FPL / fpl.TOTAL
    fpl = fpl[["GEO_ID", "BLKGRP_BELOWFPL"]]

    sql = """
    SELECT pop.riipl_id,
           STATS_MODE(TO_NUMBER(a.state || a.county || a.trct || a.blkgrp)) AS geo_id,
           1 AS geocoded
    FROM (
        SELECT pop.riipl_id,
               MAX(a.obs_date) AS max_dt
        FROM {population} pop
        INNER JOIN {address} a ON pop.riipl_id = a.riipl_id AND
                                  a.obs_date <= pop.initial_dt
        GROUP BY pop.riipl_id
    ) pop
    INNER JOIN {address} a ON pop.riipl_id = a.riipl_id AND
                              pop.max_dt = a.obs_date
    GROUP BY pop.riipl_id
    """.format(**globals())

    with Connection() as cxn:
        values = pd.read_sql(sql, cxn._connection)

    values = values.merge(income, how="left", on="GEO_ID")\
                   .merge(fpl, how="left", on="GEO_ID")

    features = features.join(values.set_index(index))
    del features["GEO_ID"]

    # Mean imputation
    features["GEOCODED"] = features.GEOCODED.fillna(0)
    features["BLKGRP_MEDIANINCOME"] = features.BLKGRP_MEDIANINCOME.fillna(
        features.BLKGRP_MEDIANINCOME.mean())
    features["BLKGRP_BELOWFPL"] = features.BLKGRP_BELOWFPL.fillna(
        features.BLKGRP_BELOWFPL.mean())

    labels = {
        "GEOCODED": "Has a geocoded home address",
        "BLKGRP_MEDIANINCOME": "Block group's median annual household income (2015 dollars)",
        "BLKGRP_BELOWFPL": "Share of block group below poverty line"
    }

    SaveFeatures(features, out, manifest, population, labels)
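# Illustration (hypothetical values): the GEO_ID assembled in the SQL above is
# the numeric 12-digit block-group FIPS code, state (2 digits) + county (3) +
# tract (6) + block group (1), which is what the "Id2" column in the ACS
# extracts is assumed to contain.

state, county, trct, blkgrp = "44", "007", "003600", "2"
geo_id = int(state + county + trct + blkgrp)  # -> 440070036002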
def main():
    """
    Counts of AHRQ CCS diseases, AHRQ CCI chronic conditions, and distinct
    procedures from Medicaid claims
    """
    ccs = pd.read_csv(ccs_file, usecols=[0, 1], quotechar="'", skiprows=3,
                      names=["DIAG_CDE", "DISEASE"])
    ccs["DIAG_CDE"] = ccs.DIAG_CDE.str.strip()

    cci = pd.read_csv(cci_file, usecols=[0, 2], quotechar="'", skiprows=2,
                      names=["DIAG_CDE", "CHRONIC"])
    cci = cci[cci.CHRONIC == 1]
    cci["DIAG_CDE"] = cci.DIAG_CDE.str.strip()

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
    SELECT DISTINCT pop.riipl_id,
           d.diag_cde
    FROM {population} pop
    LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id
    LEFT JOIN {dim_date} dd ON lb.yrmo = dd.yrmo
    INNER JOIN {diag_cde} d ON pop.riipl_id = d.riipl_id AND
                               dd.date_dt = d.claim_dt
    """.format(**globals())

    with Connection() as cxn:
        diags = pd.read_sql(sql, cxn._connection)

    ndisease = diags.merge(ccs, how="inner", on="DIAG_CDE")[index + ["DISEASE"]]\
                    .groupby(index)\
                    .agg({"DISEASE": pd.Series.nunique})\
                    .rename(columns={"DISEASE": "MEDICAID_DISEASE_SCORE"})
    features = features.join(ndisease)

    nchronic = diags.merge(cci, how="inner", on="DIAG_CDE")[index + ["CHRONIC"]]\
                    .groupby(index)\
                    .sum()\
                    .rename(columns={"CHRONIC": "MEDICAID_CHRONIC_SCORE"})
    features = features.join(nchronic)

    sql = """
    SELECT pop.riipl_id,
           COUNT(DISTINCT p.proc_cde) AS medicaid_procedures
    FROM {population} pop
    LEFT JOIN {lookback} lb ON pop.riipl_id = lb.riipl_id
    LEFT JOIN {dim_date} dd ON lb.yrmo = dd.yrmo
    INNER JOIN {proc_cde} p ON pop.riipl_id = p.riipl_id AND
                               dd.date_dt = p.claim_dt
    GROUP BY pop.riipl_id
    """.format(**globals())

    with Connection() as cxn:
        nproc = pd.read_sql(sql, cxn._connection, index_col=index)

    features = features.join(nproc).fillna(0)

    # Add squared terms
    for x in ["MEDICAID_DISEASE_SCORE", "MEDICAID_CHRONIC_SCORE", "MEDICAID_PROCEDURES"]:
        features["{}_SQ".format(x)] = features[x] * features[x]

    labels = {
        "MEDICAID_DISEASE_SCORE": "Number of AHRQ CCS diseases",
        "MEDICAID_CHRONIC_SCORE": "Number of AHRQ CCI chronic conditions",
        "MEDICAID_PROCEDURES": "Number of distinct procedures",
        "MEDICAID_DISEASE_SCORE_SQ": "Squared number of AHRQ CCS diseases",
        "MEDICAID_CHRONIC_SCORE_SQ": "Squared number of AHRQ CCI chronic conditions",
        "MEDICAID_PROCEDURES_SQ": "Squared number of distinct procedures"
    }

    SaveFeatures(features, out, manifest, population, labels)