# Command-line interface: four required options, each a full path
# (Bib, Con and Enc inputs plus one output file).
parser = argparse.ArgumentParser(
    description=
    'A filelist of input / output files, one filename per row no quote marks')
parser.add_argument('-B', required=True, help='full path to the Bib')
parser.add_argument('-C', required=True, help='full path to the Con')
parser.add_argument('-E', required=True, help='full path to the Enc')
parser.add_argument('-O', required=True, help='full path to the output file')
args = parser.parse_args()

fileBib = args.B
fileCon = args.C
fileEnc = args.E
fileout = args.O

# One rp.RPDR_query object per RPDR file type.
bib = rp.RPDR_query(name='Bib', filein=fileBib)
con = rp.RPDR_query(name='Con', filein=fileCon)
enc = rp.RPDR_query(name='Enc', filein=fileEnc)

# lst_2_pd() presumably returns a pandas DataFrame (encdf is used as one
# below) -- confirm in RPDR_parsing.
bibdf = bib.lst_2_pd()
condf = con.lst_2_pd()
encdf = enc.lst_2_pd()

# Parse Admit_Date into a new last_visit_date column.
encdf['last_visit_date'] = pd.to_datetime(encdf.Admit_Date)
# Alternative kept by the original author for data files that may contain
# missing dates (element-wise, so it can be slow on big files):
# encdf['last_visit_date'] = encdf['Admit_Date'].apply(lambda x: pd.to_datetime(x) if(pd.notnull(x)) else x)
# Alternative with an explicit format, which may be faster when every date
# shares one layout:
# encdf['last_visit_date'] = pd.to_datetime(encdf.Admit_Date, format = '%d/%m/%y')
# NOTE(review): '%d/%m/%y' is day/month/two-digit-year, not ISO 8601
# ('%Y-%m-%d'); confirm the real Admit_Date layout before enabling it.

# Order rows by patient and visit date, then drop duplicate EMPIs. The
# remaining drop_duplicates arguments lie beyond the visible chunk.
encdf = encdf.sort_values(['EMPI', 'last_visit_date']).drop_duplicates(subset='EMPI',
#/data/dgag/projects/GELS/data/RPDR_filelist/filelist_Bib parser = argparse.ArgumentParser( description= 'A filelist of input / output files, one filename per row no quote marks') parser.add_argument('-B', required=True, help='full path to the Bib') parser.add_argument('-C', required=True, help='full path to the Con') parser.add_argument('-D', required=True, help='full path to the Dem') parser.add_argument('-O', required=True, help='full path to the output file') args = parser.parse_args() fileBib = args.B fileCon = args.C fileDem = args.D fileout = args.O bib = rp.RPDR_query(name='Bib', filein=fileBib) con = rp.RPDR_query(name='Con', filein=fileCon) dem = rp.RPDR_query(name='Dem', filein=fileDem) bibdf = bib.lst_2_pd() condf = con.lst_2_pd() demdf = dem.lst_2_pd() cols = { 'Bib': ['Subject_Id', 'EMPI', 'MGH_MRN'], 'Con': ['EMPI', 'Insurance_1', 'Insurance_2', 'Insurance_3'], 'Dem': [ 'EMPI', 'Gender', 'Date_of_Birth', 'Language', 'Race', 'Marital_status', 'Religion', 'Is_a_veteran', 'Vital_status', 'Date_Of_Death' ]
# Write the distinct values of field index 5 of an RPDR file set to a text
# file, one value per line.
#
# Per the original author's note: this can be used to get the test item
# information out of Phy and Lab as a potential reference for keyword
# search; Dia and Med files could also output Diagnosis information
# (non-code) and Medication_Date_Detail.
import argparse

import RPDR_parsing as rp

parser = argparse.ArgumentParser(description='A filelist of RPDR filelist, one filename per row no quote marks')
parser.add_argument('-F', required=True, help='full path to the filelist.txt')
parser.add_argument('-O', required=True, help='full path to the output item data file')
parser.add_argument('-N', required=True, help='Name of the RPDR file')
args = parser.parse_args()

filein = args.F
fileout = args.O
name = args.N

query = rp.RPDR_query(name=name, filein=filein)
outdat, header = query.read_data()

# NOTE(review): index 5 is hard-coded; presumably it is the item / concept
# column of the RPDR layout named by -N -- confirm for each file type.
item = set(line[5] for line in outdat)

# Sorted so that repeated runs produce an identical file (set iteration
# order is arbitrary). The with-block closes the handle, so no explicit
# close() is needed.
with open(fileout, 'w') as f:
    f.writelines(i + '\n' for i in sorted(item))
required=True, help='full path to the IM clinics json file') parser.add_argument('-O1', required=True, help='full path to the IM EMPI output file') parser.add_argument('-O2', required=True, help='full path to the IM EMPI output file') args = parser.parse_args() fileEnc = args.F itemlist = args.I fileout1 = args.O1 fileout2 = args.O2 enc = rp.RPDR_query(name="Enc", filein=fileEnc) m = rp.matchterm(name="IM_clinics", filein=itemlist) df = rp.read_matched(rpdrobj=enc, matchtermobj=m) imdf = df.groupby(df.EMPI).size().reset_index() imdf = imdf.rename(columns={0: "count"}) im2df = imdf[imdf['count'] > 1] imdf = imdf[['EMPI']] im2df = im2df[['EMPI']] imdf.to_csv(fileout1, index=False, header=False) im2df.to_csv(fileout2, index=False, header=False)
'A filelist of input / output files, one filename per row no quote marks') parser.add_argument('-B', required=True, help='full path to the Bib') parser.add_argument('-C', required=True, help='full path to the Con') parser.add_argument('-E', required=True, help='full path to the Enc') parser.add_argument('-L', required=True, help='Name of the clinics list') parser.add_argument('-O', required=True, help='full path to the output file') args = parser.parse_args() fileBib = args.B fileCon = args.C fileEnc = args.E fileout = args.O itemlist = args.L bib = bib.RPDR_query_Bib(name='Bib', filein=fileBib) con = rp.RPDR_query(name='Con', filein=fileCon) enc = enc.RPDR_query_Enc(name='Enc', filein=fileEnc, itemlist=itemlist) bibdf = bib.lst_2_pd() condf = con.lst_2_pd() encdf = enc.lst_2_pd() encdf['last_visit_date'] = encdf['Admit_Date'].apply( lambda x: pd.to_datetime(x) if (pd.notnull(x)) else x) #encdf['last_visit_date'] = pd.to_datetime(encdf.Admit_Date) encdf.sort_values(['EMPI', 'last_visit_date']).drop_duplicates(subset='EMPI', keep='last') cols = { 'Bib': ['EMPI'],
# Remaining required options: the id list plus the three output files.
# (`parser` and the earlier options are defined above this chunk.)
for _flag, _help_text in (
        ('-S', 'full path to the idlist json file'),
        ('-O1', 'full path to the BMI output file'),
        ('-O2', 'full path to the A1C output file'),
        ('-O3', 'full path to the Glucose output file'),
):
    parser.add_argument(_flag, required=True, help=_help_text)
args = parser.parse_args()

# Unpack the parsed options into the names used by the rest of the script.
fileBib, filePhy, fileLab = args.B, args.P, args.L
fileout1, fileout2, fileout3 = args.O1, args.O2, args.O3
idlist = args.S

# Date window handed to read_matched as sdate / edate.
start_date = '1/1/2018'
end_date = '12/31/2018'

# One query object per RPDR file type.
bib = rp.RPDR_query(filein=fileBib, name='Bib')
phy = rp.RPDR_query(filein=filePhy, name='Phy')
lab = rp.RPDR_query(filein=fileLab, name='Lab')

# Match-term objects: subject ids, then the Phy and Lab item lists.
mbib = rp.matchterm(filein=idlist, name="IM")
mbmi_phy = rp.matchterm(filein=args.PI1, name='BMI_Phy')
ma1c_phy = rp.matchterm(filein=args.PI2, name='A1c_Phy')
mglu_phy = rp.matchterm(filein=args.PI3, name='Glu_Phy')
ma1c_lab = rp.matchterm(filein=args.LI1, name='A1c_Lab')
mglu_lab = rp.matchterm(filein=args.LI2, name='Glu_Lab')

# Bib rows for the listed ids, then BMI rows from Phy inside the window.
bibdf = rp.read_matched(rpdrobj=bib, matchtermobj=mbib)
bmi_phy_df = rp.read_matched(
    rpdrobj=phy,
    matchtermobj=mbmi_phy,
    timevar="Date",
    sdate=start_date,
    edate=end_date,
)
import RPDR_parsing as rp
import pandas as pd
import timeit

# Scratch script: collect the distinct values of field index 5 from the Phy
# file list at a fixed local path.
filein = '/Volumes/LaCie/PBB/Phy_filelist'

phy = rp.RPDR_query(filein=filein, name='Phy')
phydat, phyheader = phy.read_data()

# Distinct values found at position 5 of every row.
item = {row[5] for row in phydat}

# Optional DataFrame view of the same rows, left disabled by the author:
#phydf = pd.DataFrame(phydat, columns=phyheader)
# Column names keyed by RPDR file type (consumed outside this chunk).
cols = dict(
    Bib=['EMPI', 'Subject_Id'],
    Lab=[
        'EMPI', 'Date', 'Group_Id', 'Result', 'Units', 'Reference_Range',
        'DataSource',
    ],
    Phy=[
        'EMPI', 'Date', 'Concept_Name', 'Result', 'Units', 'Clinic',
        'Inpatient_Outpatient', 'DataSource',
    ],
    Med=['EMPI', 'Medication_Date', 'Medication'],
    Dia=['EMPI', 'Date', 'Diagnosis_Name', 'Code_Type', 'Code'],
)

# One query object per RPDR file type, built in the original order.
bib, phy, lab, med, dia = (
    rp.RPDR_query(name=_label, filein=_path)
    for _label, _path in (
        ("Bib", fileBib),
        ("Phy", filePhy),
        ("Lab", fileLab),
        ("Med", fileMed),
        ("Dia", fileDia),
    )
)

# Match-term objects: subject ids, A1c items (Phy / Lab), then the T1D and
# T2D medication and diagnosis lists.
mbib = rp.matchterm(filein=args.BI, name="IM")
ma1c_phy = rp.matchterm(filein=args.PI, name="A1c_Phy")
ma1c_lab = rp.matchterm(filein=args.LI, name="A1c_Lab")
mmed1 = rp.matchterm(filein=args.MI1, name="T1DRx")
mmed2 = rp.matchterm(filein=args.MI2, name="T2DRx")
mdia1 = rp.matchterm(filein=args.DI1, name="T1DDx")
mdia2 = rp.matchterm(filein=args.DI2, name="T2DDx")

# Restrict Bib to the EMPIs of subjects with IM visits; this is the first
# step of the decision tree.
bibdf = rp.read_matched(matchtermobj=mbib, rpdrobj=bib)
# (closing fragment of a parser.add_argument(...) call opened above this chunk)
help='full path to the renal failure diagnosis list file')
parser.add_argument(
    '-DC2',
    required=True,
    help='full path to the end stage renal failure diagnosis list file')
parser.add_argument('-O', required=True, help='full path to the output file')
args = parser.parse_args()

# Input / output paths taken from the parsed options.
# NOTE(review): args.Dem implies a multi-letter '-Dem' option defined above
# this chunk -- confirm it exists.
fileDem = args.Dem
filePhy = args.P
fileLab = args.L
fileMed = args.M
fileDia = args.D
fileOut = args.O

# One query object per RPDR file type.
dem = rp.RPDR_query(name="Dem", filein=fileDem)
phy = rp.RPDR_query(name='Phy', filein=filePhy)
lab = rp.RPDR_query(name='Lab', filein=fileLab)
med = rp.RPDR_query(name='Med', filein=fileMed)
dia = rp.RPDR_query(name='Dia', filein=fileDia)

# Match-term objects, one per id / item list supplied on the command line.
mid = rp.matchterm(name="IDGRP", filein=args.ID)
mwtphy = rp.matchterm(name="Weight", filein=args.PI1)
mbmiphy = rp.matchterm(name="BMI", filein=args.PI2)
ma1cphy = rp.matchterm(name="A1cPhy", filein=args.PI3)
ma1clab = rp.matchterm(name="A1cLab", filein=args.LI1)
mcrephy = rp.matchterm(name="crePhy", filein=args.PI4)
mcrelab = rp.matchterm(name="creLab", filein=args.LI2)
mgfrphy = rp.matchterm(name="gfrPhy", filein=args.PI5)
mgfrlab = rp.matchterm(name="gfrLab", filein=args.LI3)
# NOTE(review): "ma1bphy" (with the digit 1) breaks the "<item>Phy" /
# "<item>Lab" naming used by the sibling terms; possibly a typo for
# "albPhy". Confirm how matchterm uses `name` before changing it.
malbphy = rp.matchterm(name="ma1bphy", filein=args.PI6)