def queryItems(self, options, outputFile):
    """Query for all clinical item records that fulfill the options criteria
    and then send the results as tab-delimited output to the outputFile.
    """
    pauseSeconds = float(options.pauseSeconds)

    # Aggregate medication order counts per (route, id, description).
    query = SQLQuery()
    query.addSelect("count(order_med_id_coded) as nOrders")
    query.addSelect("om.med_route, om.medication_id, om.med_description")
    query.addFrom("starr_datalake2018.order_med as om")
    if options.descriptionPrefix:
        # Trailing wildcard enables prefix search on the description.
        query.addWhereOp("om.med_description", "like", options.descriptionPrefix + "%%")
    if options.medRoutes:
        query.addWhereIn("om.med_route", options.medRoutes.split(","))
    query.addGroupBy("om.medication_id, om.med_description, om.med_route")
    query.addOrderBy("nOrders desc, om.med_description")

    formatter = TextResultsFormatter(outputFile)
    progress = ProgressDots()
    for resultRow in DBUtil.execute(query, includeColumnNames=True, connFactory=self.connFactory):
        formatter.formatTuple(resultRow)
        # Optional throttle between output rows.
        time.sleep(pauseSeconds)
        progress.update()
    progress.printStatus()
def _get_random_patient_list(self):
    """Return a random sample of patient ids who have had the lab panel ordered."""
    # Initialize DB cursor.
    cursor = self._connection.cursor()

    # Estimate how many orders one patient contributes on average.
    avg_orders_per_patient = self._get_average_orders_per_patient()
    log.info('avg_orders_per_patient: %s' % avg_orders_per_patient)

    # Scale the patient sample so the expected episode count matches the
    # requested feature-matrix size; always query at least one patient.
    self._num_patients = int(numpy.max([self._num_requested_episodes / \
        avg_orders_per_patient, 1]))

    # Sample random patients who have gotten the test.
    # TODO(sbala): Have option to feed in a seed for the randomness.
    query = SQLQuery()
    query.addSelect('pat_id')
    query.addFrom('stride_order_proc AS sop')
    query.addWhereIn('proc_code', self._lab_panel)
    query.addGroupBy('pat_id')  # one row per distinct patient
    query.addOrderBy('RANDOM()')
    query.setLimit(self._num_patients)

    log.debug('Querying random patient list...')
    results = DBUtil.execute(query)

    # First column of each row is the patient id.
    return [row[0] for row in results]
def _get_average_orders_per_patient(self):
    """Return the median number of orders per patient for the current lab variable.

    Branches on the configured data source (STRIDE vs. UMich) to build the
    appropriate per-patient order-count query, then takes the median of the
    counts. Raises Exception when no orders exist for the lab variable.
    """
    # Initialize DB cursor.
    cursor = self._connection.cursor()
    # Get average number of results for this lab test per patient.
    query = SQLQuery()
    if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':
        #TODO: add STRIDE component routine
        query.addSelect('CAST(pat_id AS BIGINT) AS pat_id')
        query.addSelect('COUNT(sop.order_proc_id) AS num_orders')
        query.addFrom('stride_order_proc AS sop')
        query.addFrom('stride_order_results AS sor')
        query.addWhere('sop.order_proc_id = sor.order_proc_id')
        query.addWhereIn("proc_code", [self._lab_panel])
        components = self._get_components_in_lab_panel()
        query.addWhereIn("base_name", components)
        query.addGroupBy('pat_id')
    elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':
        # UMich schema keeps everything in a single "labs" table.
        query.addSelect('CAST(pat_id AS BIGINT) AS pat_id')
        query.addSelect('COUNT(order_proc_id) AS num_orders')
        query.addFrom('labs')
        query.addWhereIn(self._varTypeInTable, [self._lab_var])
        components = self._get_components_in_lab_panel()
        query.addWhereIn("base_name", components)
        query.addGroupBy('pat_id')
    # NOTE(review): if DATASET_SOURCE_NAME is neither value, query is executed
    # without SELECT/FROM clauses — presumably unreachable; confirm.
    log.debug('Querying median orders per patient...')
    results = DBUtil.execute(query)
    order_counts = [row[1] for row in results]
    if len(order_counts) == 0:
        error_msg = '0 orders for lab "%s."' % self._lab_var
        log.critical(error_msg)
        raise Exception(error_msg)
        # sys.exit('[ERROR] %s' % error_msg)  # sxu: sys.exit cannot be caught by Exception
    else:
        return numpy.median(order_counts)
def _get_components_in_lab_panel(self):
    """Return the distinct result base_names observed for self._lab_panel."""
    # Initialize DB connection.
    cursor = self._connection.cursor()

    # A single joined query forces a sequential scan through
    # stride_order_results, so the lookup is split into two steps.
    # Step 1: collect the order_proc_ids placed under this proc_code.
    id_query = SQLQuery()
    id_query.addSelect('order_proc_id')
    id_query.addFrom('stride_order_proc')
    id_query.addWhereIn('proc_code', [self._lab_panel])
    id_query.addGroupBy('order_proc_id')
    log.debug('Querying order_proc_ids for %s...' % self._lab_panel)
    lab_order_ids = [entry[0] for entry in DBUtil.execute(id_query)]

    # Step 2: fetch the distinct base_names attached to those orders.
    name_query = SQLQuery()
    name_query.addSelect('base_name')
    name_query.addFrom('stride_order_results')
    name_query.addWhereIn('order_proc_id', lab_order_ids)
    name_query.addGroupBy('base_name')
    log.debug('Querying base_names for order_proc_ids...')
    return [entry[0] for entry in DBUtil.execute(name_query)]
def queryPatients(period, locations, rxCount):
    """Return a DataFrame of patients with more than two qualifying med
    orders within the given time period and locations.

    Columns: patientId, nOpioidRx, periodStart.
    """
    log.info("Select patients fitting criteria in designated time period: (%s,%s)" % period)

    query = SQLQuery()
    query.addSelect("med.pat_id")
    query.addSelect("count(order_med_id)")
    query.addFrom("stride_mapped_meds as map")
    query.addFrom("stride_order_med as med")
    query.addFrom("stride_patient as pat")
    query.addWhere("analysis_status = 1")
    query.addWhere("map.medication_id = med.medication_id")
    query.addWhere("med.pat_id = pat.pat_id")
    query.addWhere("possible_oncology = 0")
    query.addWhereIn("patient_location", locations)
    query.addWhereOp("ordering_datetime", ">", period[0])
    query.addWhereOp("ordering_datetime", "<", period[-1])
    query.addGroupBy("med.pat_id")
    query.addHaving("count(order_med_id) >2")

    rows = DBUtil.execute(query)
    patientDF = pd.DataFrame(rows, columns=["patientId", "nOpioidRx"])
    # Tag this group of patient records with the start of the query period.
    patientDF["periodStart"] = period[0]
    return patientDF
def test_buildFeatureMatrix_prePostFeatures(self):
    """
    Test features parameter in addClinicalItemFeatures which allows client
    to specify they only want .pre* or .post* columns in feature matrix.
    """
    # Verify FeatureMatrixFactory throws Error if patientEpisodeInput
    # has not been set.
    with self.assertRaises(ValueError):
        self.factory.processPatientEpisodeInput()

    # Initialize DB cursor.
    cursor = self.connection.cursor()

    # Build SQL query for list of patient episodes: one row per
    # (patient, order) of the LABMETB panel, with a count of in-range results.
    patientEpisodeQuery = SQLQuery()
    patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)")
    patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id")
    patientEpisodeQuery.addSelect("proc_code")
    patientEpisodeQuery.addSelect("order_time")
    patientEpisodeQuery.addSelect(
        "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results"
    )
    patientEpisodeQuery.addFrom("stride_order_proc AS sop")
    patientEpisodeQuery.addFrom("stride_order_results AS sor")
    patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id")
    patientEpisodeQuery.addWhereEqual("proc_code", "LABMETB")
    patientEpisodeQuery.addGroupBy(
        "pat_id, sop.order_proc_id, proc_code, order_time")
    patientEpisodeQuery.addOrderBy(
        "pat_id, sop.order_proc_id, proc_code, order_time")
    cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params)

    # Set and process patientEpisodeInput.
    self.factory.setPatientEpisodeInput(cursor, "pat_id", "order_time")
    self.factory.processPatientEpisodeInput()
    resultEpisodeIterator = self.factory.getPatientEpisodeIterator()
    resultPatientEpisodes = list()
    for episode in resultEpisodeIterator:
        # Normalize types for comparison: ids to int, times to datetime.
        episode["pat_id"] = int(episode["pat_id"])
        episode["order_time"] = DBUtil.parseDateValue(
            episode["order_time"])
        resultPatientEpisodes.append(episode)

    # Add TestItem100 and TestItem200 clinical item data.
    # "pre"/"post" restrict the generated columns to .pre*/.post* features.
    self.factory.addClinicalItemFeatures(["TestItem100"], features="pre")
    self.factory.addClinicalItemFeatures(["TestItem200"], features="post")

    self.factory.buildFeatureMatrix()
    resultMatrix = self.factory.readFeatureMatrixFile()
    expectedMatrix = FM_TEST_OUTPUT[
        "test_buildFeatureMatrix_prePostFeatures"]
    # Skip the first two rows — presumably header/metadata lines; confirm
    # against readFeatureMatrixFile's output format.
    self.assertEqualList(resultMatrix[2:], expectedMatrix)
def test_addTimeCycleFeatures(self):
    """
    Test .addTimeCycleFeatures()
    """
    # Initialize DB cursor.
    cursor = self.connection.cursor()

    # Build the patient-episode query: one row per LABMETB order with a
    # count of in-range results.
    episodeQuery = SQLQuery()
    episodeQuery.addSelect("CAST(pat_id AS bigint)")
    episodeQuery.addSelect("sop.order_proc_id AS order_proc_id")
    episodeQuery.addSelect("proc_code")
    episodeQuery.addSelect("order_time")
    episodeQuery.addSelect(
        "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results"
    )
    episodeQuery.addFrom("stride_order_proc AS sop")
    episodeQuery.addFrom("stride_order_results AS sor")
    episodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id")
    episodeQuery.addWhereEqual("proc_code", "LABMETB")
    episodeQuery.addGroupBy(
        "pat_id, sop.order_proc_id, proc_code, order_time")
    episodeQuery.addOrderBy(
        "pat_id, sop.order_proc_id, proc_code, order_time")
    cursor.execute(str(episodeQuery), episodeQuery.params)

    # Feed the episodes into the factory.
    self.factory.setPatientEpisodeInput(cursor, "pat_id", "order_time")
    self.factory.processPatientEpisodeInput()

    # Add month and hour time-cycle features on the order timestamp.
    self.factory.addTimeCycleFeatures("order_time", "month")
    self.factory.addTimeCycleFeatures("order_time", "hour")

    # Build the matrix and compare against the expected fixture,
    # skipping the two leading header rows.
    self.factory.buildFeatureMatrix()
    resultMatrix = self.factory.readFeatureMatrixFile()
    expectedMatrix = FM_TEST_OUTPUT["test_addTimeCycleFeatures"][
        "expectedMatrix"]
    self.assertEqualTable(expectedMatrix, resultMatrix[2:], precision=5)

    # Clean up feature matrix file; ignore if it was never written.
    try:
        os.remove(self.factory.getMatrixFileName())
    except OSError:
        pass
def _getClinicalItemCounts(self):
    """Dump per-clinical-item occurrence counts to data_summary_stats/item_counts.csv."""
    query = SQLQuery()
    # SELECT: each clinical item id and how many patient_item rows carry it.
    query.addSelect(CLINICAL_ITEM_ID)
    query.addSelect('COUNT(' + CLINICAL_ITEM_ID + ') as total')
    # FROM
    query.addFrom('patient_item')
    # Aggregate per item, most frequent first.
    query.addGroupBy(CLINICAL_ITEM_ID)
    query.addOrderBy('total', dir='desc')
    print(query)
    print(query.getParams())
    DBUtil.runDBScript(self.SCRIPT_FILE, False)
    results = DBUtil.execute(str(query), query.getParams())

    countsDF = pd.DataFrame(results, columns=[CLINICAL_ITEM_ID, 'count'])
    countsDF.to_csv('data_summary_stats/item_counts.csv', index=False)
def _getNonNullLabs(self):
    """Write all LAB% proc_codes that have a populated abnormal flag to
    proc_codes.csv under DATA_FOLDER.

    Side effect: creates/overwrites DATA_FOLDER + 'proc_codes.csv'.
    """
    query = SQLQuery()
    # SELECT
    query.addSelect('proc_code')
    # FROM
    query.addFrom('stride_order_proc')
    # WHERE: lab orders only, restricted to rows where abnormal_yn is set.
    query.addWhereLike('proc_code', 'LAB%')
    query.addWhere('abnormal_yn is not null')
    query.addGroupBy('proc_code')
    query.addOrderBy('proc_code')
    results = DBUtil.execute(query)
    # DataFrame.to_csv() returns None, so don't bind its result to a variable.
    pd.DataFrame(results, columns=query.select).to_csv(
        DATA_FOLDER + 'proc_codes.csv', index=False)
def add_sim_case_column_to(csv):
    """Attach a sim_case column to the given frame by joining grading keys
    to simulated patient orders on clinical item and state.
    """
    query = SQLQuery()
    query.addSelect("sim_patient_id")
    query.addSelect("sim_case_name as sim_case")
    query.addFrom("sim_grading_key sgk")
    query.addJoin(
        "sim_patient_order spo",
        "sgk.clinical_item_id = spo.clinical_item_id and spo.sim_state_id = sgk.sim_state_id"
    )
    query.addGroupBy("sim_patient_id")
    query.addGroupBy("sim_case_name")
    case_names = DBUtil.execute(str(query))

    # Merge the sim_case labels onto the input frame, keyed by patient id.
    case_df = pd.DataFrame(case_names, columns=['sim_patient_id', 'sim_case'])
    return pd.merge(csv, case_df, left_on='patient', right_on='sim_patient_id')
def _get_random_patient_list(self):  #sx: this function is for avoid RANDOM() on the database
    """Return a random sample of patient ids having results for this component.

    Avoids SQL RANDOM() by counting orders per patient in the database, then
    sampling row indices with numpy on the client side.
    Exits the process (sys.exit) when no orders exist for the component.
    """
    cursor = self._connection.cursor()
    # Get average number of results for this lab test per patient.
    query = SQLQuery()
    query.addSelect('pat_id')
    query.addSelect('COUNT(sop.order_proc_id) AS num_orders')
    query.addFrom('stride_order_proc AS sop')
    query.addFrom('stride_order_results AS sor')
    query.addWhere('sop.order_proc_id = sor.order_proc_id')
    ## query.addWhereIn("base_name", [self._component])
    query.addGroupBy('pat_id')
    log.debug('Querying median orders per patient...')
    results = DBUtil.execute(query)
    order_counts = [ row[1] for row in results ]
    if len(results) == 0:
        error_msg = '0 orders for component "%s."' % self._component  #sx
        log.critical(error_msg)
        # NOTE(review): sibling implementations raise Exception instead,
        # because SystemExit bypasses "except Exception" handlers.
        sys.exit('[ERROR] %s' % error_msg)
    else:
        avg_orders_per_patient = numpy.median(order_counts)
        log.info('avg_orders_per_patient: %s' % avg_orders_per_patient)
        # Based on average # of results, figure out how many patients we'd
        # need to get for a feature matrix of requested size.
        self._num_patients = int(numpy.max([self._num_requested_episodes / \
            avg_orders_per_patient, 1]))
        # Some components may have fewer associated patients than the required sample size
        patient_number_chosen = min([len(results),self._num_patients])  #
        inds_random_patients = numpy.random.choice(len(results), size=patient_number_chosen, replace=False)
        # print 'inds_random_patients:', inds_random_patients
        pat_IDs_random_patients = []
        for ind in inds_random_patients:
            pat_IDs_random_patients.append(results[ind][0])
        # print pat_IDs_random_patients
        return pat_IDs_random_patients
def queryDrugScreens(patientDF, period, locations):
    """Merge a per-patient count of outpatient drug screens into patientDF.

    :param patientDF: DataFrame with a "patientId" column.
    :param period: (start, end) sequence bounding ordering_date.
    :param locations: patient_location values to include.
    :returns: patientDF with an integer "nDrugScreens" column added
        (0 for patients with no screens in the period).
    """
    log.info("Populate drug screens by primary locations")
    query = SQLQuery()
    query.addSelect("pat_id")
    query.addSelect("count(distinct order_proc_id)")
    query.addFrom("stride_order_proc_drug_screen")
    query.addWhere("ordering_mode = 'Outpatient'")
    query.addWhereIn("patient_location", locations)
    query.addWhereOp("ordering_date", ">", period[0])
    query.addWhereOp("ordering_date", "<", period[-1])
    query.addWhereIn("pat_id", patientDF["patientId"])
    query.addGroupBy("pat_id")
    results = DBUtil.execute(query)

    newDF = pd.DataFrame(results, columns=["patientId", "nDrugScreens"])
    patientDF = patientDF.merge(newDF, how="left")
    # Patients without any screens come back as NaN from the left join.
    # Use fillna instead of chained indexed assignment
    # (df["c"][mask] = 0), which pandas may apply to a temporary copy.
    patientDF["nDrugScreens"] = patientDF["nDrugScreens"].fillna(0)
    # Beware of float conversion somewhere
    patientDF["nDrugScreens"] = patientDF["nDrugScreens"].astype("int")
    return patientDF
def _getNumPatientEncountersByMonth(self):
    """Count distinct admit-diagnosis encounters per (year, month).

    Equivalent SQL:
        SELECT
            CAST(EXTRACT(YEAR FROM noted_date) AS INT) AS admit_year,
            CAST(EXTRACT(MONTH FROM noted_date) AS INT) AS admit_month,
            COUNT(DISTINCT pat_enc_csn_id) AS num_encounters
        FROM stride_dx_list
        WHERE data_source = 'ADMIT_DX'
        GROUP BY admit_year, admit_month
        ORDER BY admit_year, admit_month

    Returns a chronologically ordered list of
    (admit_year, admit_month, num_encounters) tuples.
    """
    query = SQLQuery()
    query.addSelect(
        "CAST(EXTRACT(YEAR FROM noted_date) AS INT) AS admit_year")
    query.addSelect(
        "CAST(EXTRACT(MONTH FROM noted_date) AS INT) AS admit_month")
    query.addSelect("COUNT(DISTINCT pat_enc_csn_id) AS num_encounters")
    query.addFrom("stride_dx_list")
    query.addWhereEqual("data_source", 'ADMIT_DX')
    query.addGroupBy("admit_year")
    query.addGroupBy("admit_month")
    query.addOrderBy("admit_year")
    query.addOrderBy("admit_month")

    # Execute query and repackage rows as plain tuples.
    results = DBUtil.execute(query)
    return [(row[0], row[1], row[2]) for row in results]
def fetch_components_in_panel(lab_panel):
    """Return the distinct component base_names recorded for lab_panel."""
    # One joined query would sequentially scan stride_order_results, so the
    # lookup is performed in two steps instead.
    # Step 1: order_proc_ids placed under this proc_code.
    id_query = SQLQuery()
    id_query.addSelect('order_proc_id')
    id_query.addFrom('stride_order_proc')
    id_query.addWhereIn('proc_code', [lab_panel])
    id_query.addGroupBy('order_proc_id')
    lab_order_ids = [entry[0] for entry in DBUtil.execute(id_query)]

    # Step 2: distinct base_names attached to those orders.
    name_query = SQLQuery()
    name_query.addSelect('base_name')
    name_query.addFrom('stride_order_results')
    name_query.addWhereIn('order_proc_id', lab_order_ids)
    name_query.addGroupBy('base_name')
    return [entry[0] for entry in DBUtil.execute(name_query)]
def _get_average_orders_per_patient(self):
    """Return the median number of orders per patient for self._lab_panel.

    Raises Exception when the panel has no orders, so callers can catch
    and handle the failure.
    """
    # Initialize DB cursor.
    cursor = self._connection.cursor()
    # Get average number of results for this lab test per patient.
    query = SQLQuery()
    query.addSelect('pat_id')
    query.addSelect('COUNT(sop.order_proc_id) AS num_orders')
    query.addFrom('stride_order_proc AS sop')
    query.addFrom('stride_order_results AS sor')
    query.addWhere('sop.order_proc_id = sor.order_proc_id')
    query.addWhereIn("proc_code", [self._lab_panel])
    components = self._get_components_in_lab_panel()
    query.addWhereIn("base_name", components)
    query.addGroupBy('pat_id')
    log.debug('Querying median orders per patient...')
    results = DBUtil.execute(query)
    order_counts = [row[1] for row in results]
    if len(order_counts) == 0:
        error_msg = '0 orders for lab panel "%s."' % self._lab_panel
        log.critical(error_msg)
        # Raise instead of sys.exit(): SystemExit bypasses callers'
        # "except Exception" handlers (same rationale as the sibling
        # _get_average_orders_per_patient implementation).
        raise Exception(error_msg)
    else:
        return numpy.median(order_counts)
def _getAdmitDxPatientFrequencyRankByYear(self):
    """Query unique-patient counts per admit diagnosis per year.

    Builds and executes a grouped query over patient_item joined to
    clinical_item, restricted to admit-diagnosis clinical items.
    """
    # Get list of all clinical item IDs matching admit diagnosis.
    # Get this list in advance to make subsequent query run a bit faster.
    admitDxClinicalItemIds = self._getAdmitDxClinicalItemIds()

    # Build query for # of unique patients.
    # SELECT
    #   ci.name AS icd_code,
    #   ci.description AS admit_dx,
    #   EXTRACT(YEAR FROM pi.item_date) AS admit_year,
    #   COUNT(DISTINCT pi.patient_id) AS num_unique_patients,
    # FROM
    #   patient_item AS pi
    # JOIN
    #   clinical_item AS ci
    # ON
    #   pi.clinical_item_id = ci.clinical_item_id
    # WHERE
    #   ci.clinical_item_id in (admitDxClinicalItemIds)
    # GROUP BY
    #   icd_code,
    #   admit_dx,
    #   admit_year
    #   num_unique_patients
    # ORDER BY
    #   admit_year,
    #   num_unique_patients DESC
    query = SQLQuery()
    query.addSelect("ci.name AS icd_code")
    query.addSelect("ci.description AS admit_dx")
    query.addSelect("EXTRACT(YEAR FROM pi.item_date) AS admit_year")
    query.addSelect("COUNT(DISTINCT pi.patient_id) AS num_unique_patients")
    query.addFrom("patient_item AS pi")
    query.addJoin("clinical_item AS ci",
                  "pi.clinical_item_id = ci.clinical_item_id")
    query.addWhereIn("ci.clinical_item_id", admitDxClinicalItemIds)
    query.addGroupBy("icd_code")
    query.addGroupBy("admit_dx")
    query.addGroupBy("admit_year")
    query.addGroupBy("num_unique_patients")
    query.addOrderBy("icd_code")
    query.addOrderBy("admit_year")
    query.addOrderBy("num_unique_patients DESC")

    # Execute query.
    # NOTE(review): results is not returned or otherwise used within this
    # method as visible here — confirm whether downstream processing was
    # intended to follow.
    results = DBUtil.execute(query)
def _get_components_in_lab_panel(self):
    """Return the component base_names for self._lab_var.

    If the lab variable is a single component (not a panel), it is its own
    component list. Otherwise, the base_names are looked up from the data
    source configured in LocalEnv.
    """
    if not self._isLabPanel:
        return [self._lab_var]

    # Initialize DB connection.
    cursor = self._connection.cursor()

    if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':
        # Doing a single query results in a sequential scan through
        # stride_order_results. To avoid this, break up the query in two.
        # First, get all the order_proc_ids for proc_code.
        query = SQLQuery()
        query.addSelect('order_proc_id')
        query.addFrom('stride_order_proc')
        query.addWhereIn('proc_code', [self._lab_var])
        query.addGroupBy('order_proc_id')
        log.debug('Querying order_proc_ids for %s...' % self._lab_var)
        results = DBUtil.execute(query)
        lab_order_ids = [row[0] for row in results]

        # Second, get all base_names from those orders.
        query = SQLQuery()
        query.addSelect('base_name')
        query.addFrom('stride_order_results')
        query.addWhereIn('order_proc_id', lab_order_ids)
        query.addGroupBy('base_name')
    else:  # elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':
        # UMich keeps codes and base_names in the single "labs" table.
        query = SQLQuery()
        query.addSelect('base_name')
        query.addFrom('labs')
        query.addWhereIn('proc_code', [self._lab_var])
        query.addGroupBy('base_name')

    log.debug('Querying base_names for order_proc_ids...')
    results = DBUtil.execute(query)
    components = [row[0] for row in results]
    return components
def _query_patient_episodes(self):
    """Query microbiology culture episodes for the random patient sample and
    report them through FeatureMatrix._query_patient_episodes.

    Each episode row carries indicator columns for the ten most commonly
    occurring organisms (plus a no-growth indicator).
    """
    log.info('Querying patient episodes...')
    # Initialize DB cursor.
    cursor = self._connection.cursor()

    # Build parameters for query.
    self._lab_components = self._get_components_in_lab_panel()
    random_patient_list = self._get_random_patient_list()
    #print("RANDOM PATIENT LIST", random_patient_list)

    # Build SQL query for list of patient episodes.
    # Note that for 2008-2014 data, result_flag can take on any of the
    # following values: High, Low, High Panic, Low Panic,
    # Low Off-Scale, Negative, Positive, Resistant, Susceptible, Abnormal, *
    # (NONE): Only 27 lab components can have this flag. None has this
    #   value for more than 5 results, so ignore it.
    # *: Only 10 lab components can have this flag. Only 6 have it for
    #   >10 tests, and each of them is a microbiology test for which
    #   a flag is less meaningful, e.g. Gram Stain, AFB culture.
    # Susceptible: Only 15 lab components can have this flag. All of those
    #   only have this value for 2 results, so ignore it.
    # Resistant: Only 1 lab component can have this flag. Only two results
    #   have this value, so ignore it.
    # Abnormal: 1462 lab components can have this flag. Many (e.g. UBLOOD)
    #   have this value for thousands of results, so include it.
    # Negative: Only 10 lab components can have this flag, and all for
    #   less than 5 results, so ignore it.
    # Positive: Only 3 lab components can have this flag, and all for
    #   only 1 result, so ignore it.
    # Low Off-Scale: Only 1 lab component can have this flag, and only for
    #   3 results, so ignore it.
    # Low Panic: 1401 lab components can have this flag, many core
    #   metabolic components. Include it.
    # High Panic: 8084 lab components can have this flag, many core
    #   metabolic components. Include it.
    query = SQLQuery()
    query.addSelect('CAST(pat_anon_id AS BIGINT) AS pat_id')
    query.addSelect('CAST(shifted_order_time AS TIMESTAMP)')
    query.addSelect('stride_culture_micro.proc_code')
    query.addSelect('organism_name')  # one for the result
    # Let us look at top 10 commonly occuring bacteria
    query.addSelect(
        "CASE WHEN organism_name IS NULL THEN 1 ELSE 0 END AS NO_BACTERIA")
    query.addSelect(
        "CASE WHEN organism_name = 'ESCHERICHIA COLI' THEN 1 ELSE 0 END AS ESCHERICHIA_COLI"
    )
    query.addSelect(
        "CASE WHEN organism_name = 'STAPHYLOCOCCUS AUREUS' THEN 1 ELSE 0 END AS STAPHYLOCOCCUS_AUREUS"
    )
    query.addSelect(
        "CASE WHEN organism_name = 'ENTEROCOCCUS SPECIES' THEN 1 ELSE 0 END AS ENTEROCOCCUS_SPECIES"
    )
    query.addSelect(
        "CASE WHEN organism_name = 'KLEBSIELLA PNEUMONIAE' THEN 1 ELSE 0 END AS KLEBSIELLA_PNEUMONIAE"
    )
    query.addSelect(
        "CASE WHEN organism_name = 'PSEUDOMONAS AERUGINOSA' THEN 1 ELSE 0 END AS PSEUDOMONAS_AERUGINOSA"
    )
    query.addSelect(
        "CASE WHEN organism_name = 'COAG NEGATIVE STAPHYLOCOCCUS' THEN 1 ELSE 0 END AS COAG_NEGATIVE_STAPHYLOCOCCUS"
    )
    query.addSelect(
        "CASE WHEN organism_name = 'ENTEROCOCCUS FAECALIS' THEN 1 ELSE 0 END AS ENTEROCOCCUS_FAECALIS"
    )
    query.addSelect(
        "CASE WHEN organism_name = 'PROTEUS MIRABILIS' THEN 1 ELSE 0 END AS PROTEUS_MIRABILIS"
    )
    query.addSelect(
        "CASE WHEN organism_name = 'CANDIDA ALBICANS' THEN 1 ELSE 0 END AS CANDIDA_ALBICANS"
    )
    query.addFrom('stride_culture_micro')
    query.addWhereIn("stride_culture_micro.proc_code", [self._lab_panel])
    query.addWhereIn("pat_id", random_patient_list)
    query.addGroupBy('pat_id')
    query.addGroupBy('shifted_order_time')
    query.addGroupBy('stride_culture_micro.proc_code')
    query.addGroupBy('organism_name')
    query.addOrderBy('pat_id')
    query.addOrderBy('shifted_order_time')
    query.addOrderBy('stride_culture_micro.proc_code')
    query.addOrderBy('organism_name')
    query.setLimit(self._num_requested_episodes)

    # Delegate execution/reporting to the base-class implementation.
    self._num_reported_episodes = FeatureMatrix._query_patient_episodes(
        self, query, pat_id_col='pat_id',
        index_time_col='shifted_order_time')
def _query_patient_episodes(self):
    """Query lab-order episodes for the random patient sample and report
    them through FeatureMatrix._query_patient_episodes.

    STRIDE uses the SQLQuery builder; other sources (e.g. UMich on sqlite3)
    use a raw SQL string to sidestep placeholder-count limits.
    """
    log.info('Querying patient episodes...')
    # Initialize DB cursor.
    cursor = self._connection.cursor()

    # Build parameters for query.
    self._lab_components = self._get_components_in_lab_panel()
    random_patient_list = self._get_random_patient_list()

    # Build SQL query for list of patient episodes.
    # Note that for 2008-2014 data, result_flag can take on any of the
    # following values: High, Low, High Panic, Low Panic,
    # Low Off-Scale, Negative, Positive, Resistant, Susceptible, Abnormal, *
    # (NONE): Only 27 lab components can have this flag. None has this
    #   value for more than 5 results, so ignore it.
    # *: Only 10 lab components can have this flag. Only 6 have it for
    #   >10 tests, and each of them is a microbiology test for which
    #   a flag is less meaningful, e.g. Gram Stain, AFB culture.
    # Susceptible: Only 15 lab components can have this flag. All of those
    #   only have this value for 2 results, so ignore it.
    # Resistant: Only 1 lab component can have this flag. Only two results
    #   have this value, so ignore it.
    # Abnormal: 1462 lab components can have this flag. Many (e.g. UBLOOD)
    #   have this value for thousands of results, so include it.
    # Negative: Only 10 lab components can have this flag, and all for
    #   less than 5 results, so ignore it.
    # Positive: Only 3 lab components can have this flag, and all for
    #   only 1 result, so ignore it.
    # Low Off-Scale: Only 1 lab component can have this flag, and only for
    #   3 results, so ignore it.
    # Low Panic: 1401 lab components can have this flag, many core
    #   metabolic components. Include it.
    # High Panic: 8084 lab components can have this flag, many core
    #   metabolic components. Include it.
    if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':
        query = SQLQuery()
        '''
        pat_id: hashed patient id
        '''
        query.addSelect('CAST(pat_id AS BIGINT) as pat_id')
        '''
        order_proc_id: unique identifier for an episode
        '''
        query.addSelect('sop.order_proc_id')
        '''
        self._varTypeInTable: usually proc_code or base_name, the column of
        the lab to be queried
        '''
        query.addSelect(self._varTypeInTable)
        '''
        order_time: The time of the order. Note that sor table does not have
        this info.
        '''
        query.addSelect('order_time')
        '''
        y-labels related columns, choose one to predict (for now, use
        all_components_normal to predict).
        '''
        if self._isLabPanel:
            query.addSelect(
                "CASE WHEN abnormal_yn = 'Y' THEN 1 ELSE 0 END AS abnormal_panel")  #
            query.addSelect(
                "SUM(CASE WHEN result_flag IN ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') OR result_flag IS NULL THEN 1 ELSE 0 END) AS num_components")  # sx
            query.addSelect(
                "SUM(CASE WHEN result_flag IS NULL THEN 1 ELSE 0 END) AS num_normal_components")  # sx
            query.addSelect(
                "CAST(SUM(CASE WHEN result_flag IN ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') THEN 1 ELSE 0 END) = 0 AS INT) AS all_components_normal")  # sx
        else:
            query.addSelect(
                "CASE WHEN result_flag IN ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') THEN 0 ELSE 1 END AS component_normal")
        '''
        Relevant tables. Note that sor table does not have patient_id info;
        need to join sop to obtain it.
        '''
        query.addFrom('stride_order_proc AS sop')
        query.addFrom('stride_order_results AS sor')
        query.addWhere('sop.order_proc_id = sor.order_proc_id')
        '''
        Condition: self._time_limit[0] < order_time < self._time_limit[1]
        '''
        if self._time_limit:
            if self._time_limit[0]:
                query.addWhere("sop.order_time > '%s'" % self._time_limit[0])
            if self._time_limit[1]:
                query.addWhere("sop.order_time < '%s'" % self._time_limit[1])
        query.addWhere(
            "(result_flag in ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') OR result_flag IS NULL)")
        query.addWhereIn(self._varTypeInTable, [self._lab_var])  # sx
        query.addWhereIn("pat_id", random_patient_list)
        query.addGroupBy('pat_id')
        query.addGroupBy('sop.order_proc_id')
        query.addGroupBy(self._varTypeInTable)
        query.addGroupBy('order_time')
        if not self._isLabPanel:
            query.addGroupBy('result_flag')
        query.addGroupBy('abnormal_yn')  #
        query.addOrderBy('pat_id')
        query.addOrderBy('sop.order_proc_id')
        query.addOrderBy(self._varTypeInTable)
        query.addOrderBy('order_time')
        query.setLimit(self._num_requested_episodes)
        self._num_reported_episodes = FeatureMatrix._query_patient_episodes(
            self, query, index_time_col='order_time')
    else:
        '''
        Sqlite3 has an interesting limit for the total number of
        place_holders in a query, and this limit varies across
        platforms/operating systems (500-99999 on mac, 999 by defaulty).
        To avoid this problem when querying 1000-10000 patient ids, use
        string queries instead of the default (convenient) routine in
        DBUtil.
        '''
        query_str = "SELECT CAST(pat_id AS BIGINT) AS pat_id, order_proc_id, %s, order_time, " % self._varTypeInTable
        if not self._isLabPanel:
            query_str += "CASE WHEN result_in_range_yn = 'Y' THEN 1 ELSE 0 END AS component_normal "
        else:
            query_str += "CAST(SUM(CASE WHEN result_in_range_yn != 'Y' THEN 1 ELSE 0 END) = 0 AS INT) AS all_components_normal "
        query_str += "FROM labs "
        query_str += "WHERE %s = '%s' " % (self._varTypeInTable, self._lab_var)
        query_str += "AND pat_id IN "
        # Inline the patient-id list literally to avoid placeholder limits.
        pat_list_str = "("
        for pat_id in random_patient_list:
            pat_list_str += str(pat_id) + ","
        pat_list_str = pat_list_str[:-1] + ") "
        query_str += pat_list_str
        query_str += "GROUP BY pat_id, order_proc_id, %s, order_time " % self._varTypeInTable
        query_str += "ORDER BY pat_id, order_proc_id, %s, order_time " % self._varTypeInTable
        query_str += "LIMIT %d" % self._num_requested_episodes
        self._num_reported_episodes = FeatureMatrix._query_patient_episodes(
            self, query_str, index_time_col='order_time')
def _get_random_patient_list(self):
    """Return a reproducible random sample of patient ids with orders for
    the current lab variable.

    Counts orders per patient in the configured data source, estimates how
    many patients are needed to reach the requested episode count, then
    samples row indices with a seeded numpy RNG (avoids SQL RANDOM()).
    Raises Exception when no qualifying orders exist.
    """
    # Initialize DB cursor.
    cursor = self._connection.cursor()

    query = SQLQuery()
    query.addSelect('CAST(pat_id AS BIGINT) AS pat_id')
    if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':
        if self._isLabPanel:
            query.addSelect('COUNT(sop.order_proc_id) AS num_orders')
            query.addFrom('stride_order_proc AS sop')
            query.addFrom('stride_order_results AS sor')
            if self._time_limit:
                if self._time_limit[0]:
                    query.addWhere("sop.order_time > '%s'" % self._time_limit[0])
                if self._time_limit[1]:
                    query.addWhere("sop.order_time < '%s'" % self._time_limit[1])
            query.addWhere('sop.order_proc_id = sor.order_proc_id')
            query.addWhereIn('proc_code', [self._lab_var])
            '''
            sbala: Technically it's possible for someone to get a lab
            ordered without getting results
            '''
            query.addWhereIn("base_name", self._lab_components)
        else:
            query.addSelect('COUNT(sor.order_proc_id) AS num_orders')
            query.addFrom('stride_order_proc AS sop')
            query.addFrom('stride_order_results AS sor')
            query.addWhere('sop.order_proc_id = sor.order_proc_id')
            ## query.addWhereIn("base_name", [self._lab_var])
    elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':
        # Raw SQL string to avoid sqlite3 placeholder-count limits.
        query_str = "SELECT CAST(pat_id AS BIGINT) AS pat_id , "
        query_str += "COUNT(order_proc_id) AS num_orders "
        query_str += "FROM labs "
        # query_str += " WHERE %s IN (%s) "%(self._varTypeInTable, self._lab_var)
        query_str += "WHERE %s = '%s' " % (self._varTypeInTable, self._lab_var)
        if self.notUsePatIds:
            query_str += "AND pat_id NOT IN ("
            for pat_id in self.notUsePatIds:
                query_str += "%s," % pat_id
            query_str = query_str[:-1] + ") "  # get rid of comma
        query_str += "GROUP BY pat_id"
        log.debug('Querying median orders per patient...')
        # TODO: best way to integrate UMich code
        results = DBUtil.execute(query_str)
        order_counts = [row[1] for row in results]
        if len(results) == 0:
            error_msg = '0 orders for order "%s."' % self._lab_var  # sx
            log.critical(error_msg)
            raise Exception(error_msg)
            # sys.exit('[ERROR] %s' % error_msg)  # sxu: sys.exit cannot be caught by Exception
        else:
            avg_orders_per_patient = numpy.median(order_counts)
            log.info('avg_orders_per_patient: %s' % avg_orders_per_patient)
            # Based on average # of results, figure out how many patients we'd
            # need to get for a feature matrix of requested size.
            self._num_patients = int(numpy.max([self._num_requested_episodes / \
                avg_orders_per_patient, 1]))
            # Some components may have fewer associated patients than the required sample size
            patient_number_chosen = min([len(results), self._num_patients])  #
            inds_random_patients = numpy.random.choice(
                len(results), size=patient_number_chosen, replace=False)
            # print 'inds_random_patients:', inds_random_patients
            pat_IDs_random_patients = []
            for ind in inds_random_patients:
                pat_IDs_random_patients.append(results[ind][0])
            return pat_IDs_random_patients
    else:
        # Other (non-STRIDE, non-UMich) sources also use a "labs" table.
        query.addSelect('COUNT(order_proc_id) AS num_orders')
        query.addFrom('labs')
        if self._isLabPanel:
            query.addWhereIn("proc_code", [self._lab_var])
            query.addWhereIn("base_name", self._lab_components)
        else:
            query.addWhereIn("base_name", [self._lab_var])

    '''
    Fo hold-out set, do not use the patients already used in
    training/validation.
    '''
    if self._notUsePatIds:
        query.addWhereNotIn('pat_id', self._notUsePatIds)
    query.addGroupBy('pat_id')
    log.debug('Querying the number of orders per patient...')
    results = DBUtil.execute(query)
    order_counts = [row[1] for row in results]
    if len(results) == 0:
        error_msg = '0 orders for component "%s."' % self._lab_var  # sx
        log.critical(error_msg)
        raise Exception(error_msg)
        # sys.exit('[ERROR] %s' % error_msg)  # sxu: sys.exit cannot be caught by Exception
    else:
        avg_orders_per_patient = numpy.median(order_counts)
        log.info('avg_orders_per_patient: %s' % avg_orders_per_patient)
        # Based on average # of results, figure out how many patients we'd
        # need to get for a feature matrix of requested size.
        self._num_patients = int(numpy.max([self._num_requested_episodes / \
            avg_orders_per_patient, 1]))
        # Some components may have fewer associated patients than the required sample size
        patient_number_chosen = min([len(results), self._num_patients])  #
        '''
        Set seed to ensure re-producibility of patient episodes.
        Recover int random_state here, since numpy requires int
        while sql requires [-1,1]
        '''
        numpy.random.seed(int(self._random_state * float(sys.maxsize)))
        inds_random_patients = numpy.random.choice(
            len(results), size=patient_number_chosen, replace=False)
        pat_IDs_random_patients = [
            results[ind][0] for ind in inds_random_patients
        ]
        return pat_IDs_random_patients
pat_id, medication_id, description """ DATA_QUERY = SQLQuery(); DATA_QUERY.addSelect("pat_id") DATA_QUERY.addSelect("medication_id") DATA_QUERY.addSelect("description") DATA_QUERY.addSelect("COUNT(*) as medication_co_occurrence") DATA_QUERY.addFrom("stride_order_med") DATA_QUERY.addWhere("pat_id = '{0}'".format(pat_id)) DATA_QUERY.addWhere("ordering_date >= '{0}'".format(window_lower)) DATA_QUERY.addWhere("ordering_date <= '{0}'".format(window_upper)) if (pat_id in orderset_order_med_ids.keys()): DATA_QUERY.addWhere("order_med_id NOT IN ({0})".format(order_med_id_str)) # do not count order instances ordered with an order set; we only want a la carte instances DATA_QUERY.addGroupBy("pat_id") DATA_QUERY.addGroupBy("medication_id") DATA_QUERY.addGroupBy("description") # print(DATA_QUERY) results = DBUtil.execute(DATA_QUERY); for line in results: medication = int(line[1]) description = line[2] count = int(line[3]) order_description_map[medication] = description.replace(",", ";") # consider only orders already in orderset if (orderset in orderset_orders and medication in orderset_orders[orderset].keys()): # medication is in order set if (medication not in co_occurrence_medication_counts):
def main(argv):
    """Compute Rx-Dx association statistics between opioid prescriptions and
    problem-list diagnosis groups, printing tab-delimited results to stdout.

    For each opioid class (keyed by name below), counts patients with any of the
    listed medication_ids, then cross-tabulates against per-diagnosis-group
    patient counts from stride_problem_list and emits contingency statistics.

    Relies on module-level globals: baseQuery, DX_COL, LIMIT_DATE, DOUBLE_DX,
    and the SQLQuery / DBUtil / ProgressDots / ContingencyStats utilities.
    NOTE(review): Python 2 code (print statements, dict.iteritems).
    """
    # Map from opioid class name to the tuple of medication_ids that represent it.
    medIdsByActiveRx = dict()
    medIdsByActiveRx['Buprenorphine'] = ('125498', '114474', '212560', '114475', '114467', '114468')
    medIdsByActiveRx['Fentanyl Patch'] = ('2680', '27908', '125379', '27905', '27906', '540107', '540638', '540101', '27907')
    medIdsByActiveRx['Methadone'] = ('540483', '4953', '4951', '10546', '214468', '15996', '41938', '4954', '4952')
    medIdsByActiveRx['Hydrocodone'] = ('3724', '4579', '8576', '8577', '8951', '10204', '12543', '13040', '14963', '14965', '14966', '17061', '17927', '19895', '20031', '28384', '29486', '29487', '34505', '34544', '35613', '117862', '204249', '206739')
    medIdsByActiveRx['Hydromorphone'] = ('2458', '2459', '2464', '2465', '3757', '3758', '3759', '3760', '3761', '10224', '10225', '10226', '10227', '200439', '201094', '201096', '201098', '540125', '540179', '540666')
    medIdsByActiveRx['Morphine'] = ( '5167', '5168', '5172', '5173', '5176', '5177', '5178', '5179', '5180', '5183', '6977', '10655', '15852', '20908', '20909', '20910', '20914', '20915', '20919', '20920', '20921', '20922', '29464', '30138', '31413', '36140', '36141', '79691', '87820', '89282', '91497', '95244', '96810', '112562', '112564', '115335', '115336', '126132', '198543', '198544', '198623', '201842', '201848', '205011', '206731', '207949', '208896', '540182', '540300')
    medIdsByActiveRx['Oxycodone'] = ('5940', '5941', '6122', '6981', '10812', '10813', '10814', '14919', '16121', '16123', '16129', '16130', '19187', '26637', '26638', '27920', '27921', '27922', '27923', '28897', '28899', '28900', '31851', '31852', '31863', '31864', '92248', '126939', '200451', '203690', '203691', '203692', '203705', '203706', '203707', '204020', '204021')

    # Denominator: total patient count from the base query (float so later
    # divisions produce rates rather than Python 2 integer division).
    query = baseQuery()
    totalPatients = float(DBUtil.execute(query)[0][0])
    # print"Total Patients\t%s" % totalPatients

    # Pre-compute patient counts per diagnosis group (optionally per diagnosis
    # PAIR when DOUBLE_DX is set, in which case the group key is a 2-tuple).
    # print"======= Dx Groups ===========";
    # print"Dx Group\tPt Count\tDx Rate";
    patientsPerDxGroup = dict()
    query = SQLQuery()
    query.addSelect("count(distinct prob.pat_id) as ptCount")
    query.addSelect("prob.%s" % DX_COL)
    query.addFrom("stride_problem_list as prob")
    query.addWhereOp("prob.noted_date", "<", LIMIT_DATE)
    query.addGroupBy("prob.%s" % DX_COL)
    if DOUBLE_DX:
        # Self-join to count patients having each PAIR of diagnoses.
        query.addSelect("prob2.%s" % DX_COL)
        query.addFrom("stride_problem_list as prob2")
        query.addWhere("prob.pat_id = prob2.pat_id")
        query.addWhereOp("prob2.noted_date", "<", LIMIT_DATE)
        query.addGroupBy("prob2.%s" % DX_COL)
    results = DBUtil.execute(query)
    for row in results:
        patientCount = row[0]
        dxGroup = row[1]
        if DOUBLE_DX:
            dxGroup = (dxGroup, row[2])  # Composite tuple including second diagnosis
        patientsPerDxGroup[dxGroup] = patientCount

    progress = ProgressDots()
    for activeRx, medIds in medIdsByActiveRx.iteritems():
        # Baseline prescription rates: patients on any medication in this class.
        query = baseQuery()
        query.addWhereIn("medication_id", medIds)
        rxPtCount = DBUtil.execute(query)[0][0]
        # print"====== Rx Counts ======";
        # print"Rx\tPt Count\tRx Rate";
        # print"%s\t%s\t%s" % (activeRx, rxPtCount, (rxPtCount/totalPatients));

        # Statistics to emit per (Rx, Dx) pair; keys into ContingencyStats.
        # print"======== Rx-Dx Association ========";
        statIds = ( "P-Fisher", "P-YatesChi2", "oddsRatio", "relativeRisk", "interest", "LR+", "LR-", "sensitivity", "specificity", "PPV", "NPV", )
        if progress.getCounts() == 0:
            # Emit the header row only once, before the first data row.
            headerCols = [ "Rx", "Dx", "RxDxCount", "RxCount", "DxCount", "Total" ]
            if DOUBLE_DX:
                headerCols.insert(2, "Dx2")
            headerCols.extend(statIds)
            headerStr = str.join("\t", headerCols)
            print headerStr

        # Query out per diagnosis group, but do as aggregate grouped query
        query.addSelect("prob.%s" % DX_COL)
        query.addFrom("stride_problem_list as prob")
        query.addWhere("med.pat_id = prob.pat_id")
        query.addWhereOp("prob.noted_date", "<", LIMIT_DATE)
        #query.addWhereIn("prob.%s" % DX_COL, dxKeys );
        query.addGroupBy("prob.%s" % DX_COL)
        if DOUBLE_DX:
            query.addSelect("prob2.%s" % DX_COL)
            query.addFrom("stride_problem_list as prob2")
            query.addWhere("prob.pat_id = prob2.pat_id")
            query.addWhereOp("prob2.noted_date", "<", LIMIT_DATE)
            query.addGroupBy("prob2.%s" % DX_COL)
        results = DBUtil.execute(query)
        for row in results:
            rxDxPtCount = row[0]
            dxGroup = row[1]
            if DOUBLE_DX:
                dxGroup = (dxGroup, row[2])  # Composite tuple including second diagnosis
            dxPtCount = patientsPerDxGroup[dxGroup]

            # 2x2 contingency statistics for this Rx vs. this Dx group.
            conStats = ContingencyStats(rxDxPtCount, rxPtCount, dxPtCount, totalPatients)

            dataCells = [ activeRx, dxGroup, rxDxPtCount, rxPtCount, dxPtCount, totalPatients ]
            if DOUBLE_DX:
                # Flatten the composite key into two separate output columns.
                dataCells[1] = dxGroup[0]
                dataCells.insert(2, dxGroup[1])
            for statId in statIds:
                try:
                    dataCells.append(conStats[statId])
                except ZeroDivisionError:
                    # Degenerate cell counts: emit None rather than abort the run.
                    dataCells.append(None)
            for i, value in enumerate(dataCells):
                dataCells[i] = str(value)  # String conversion to allow for concatenation below
            dataStr = str.join("\t", dataCells)
            print dataStr
        progress.update()
    progress.printStatus()
def _query_patient_episodes(self):
    """Query patient episodes for the current lab variable and report count.

    Branches on LocalEnv.DATASET_SOURCE_NAME:
      * 'STRIDE' — builds a SQLQuery joining stride_order_proc and
        stride_order_results, aggregating per-order component normality.
      * 'UMich' — builds a raw SQL string against the `labs` table (the
        SQLQuery builder could not express this form; see git history).

    Side effects: sets self._lab_components, self.usedPatIds, and
    self._num_reported_episodes (via the FeatureMatrix base helpers).
    """
    log.info('Querying patient episodes...')

    # Initialize DB cursor.
    cursor = self._connection.cursor()

    # Build parameters for query.
    self._lab_components = self._get_components_in_lab_panel()
    random_patient_list = self._get_random_patient_list()
    # Remember the sampled patients so a later hold-out query can exclude them.
    self.usedPatIds = random_patient_list[:]

    # Build SQL query for list of patient episodes.
    # result_flag values observed in 2008-2014 data and how they are handled:
    # include High, Low, High Panic, Low Panic, Abnormal, and '*' as abnormal;
    # NULL result_flag is treated as normal; rarer flags (Negative, Positive,
    # Susceptible, Resistant, Low Off-Scale, blank) are too infrequent or too
    # ambiguous to be meaningful and are excluded by the WHERE clause below.
    if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':
        query = SQLQuery()
        query.addSelect('CAST(pat_id AS BIGINT)')
        query.addSelect('sop.order_proc_id AS order_proc_id')
        query.addSelect('proc_code')
        query.addSelect('order_time')
        # Panel-level abnormality flag straight from the order record.
        query.addSelect(
            "CASE WHEN abnormal_yn = 'Y' THEN 1 ELSE 0 END AS abnormal_panel"
        )
        # Component-level aggregates: total components considered, how many
        # were normal (NULL flag), and whether ALL components were normal.
        query.addSelect(
            "SUM(CASE WHEN result_flag IN ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') OR result_flag IS NULL THEN 1 ELSE 0 END) AS num_components"
        )
        query.addSelect(
            "SUM(CASE WHEN result_flag IS NULL THEN 1 ELSE 0 END) AS num_normal_components"
        )
        query.addSelect(
            "CAST(SUM(CASE WHEN result_flag IN ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') THEN 1 ELSE 0 END) = 0 AS INT) AS all_components_normal"
        )
        query.addFrom('stride_order_proc AS sop')
        query.addFrom('stride_order_results AS sor')
        query.addWhere('sop.order_proc_id = sor.order_proc_id')
        query.addWhere(
            "(result_flag in ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') OR result_flag IS NULL)"
        )
        query.addWhereIn("proc_code", [self._lab_var])
        query.addWhereIn("pat_id", random_patient_list)
        query.addGroupBy('pat_id')
        query.addGroupBy('sop.order_proc_id')
        query.addGroupBy('proc_code')
        query.addGroupBy('order_time')
        query.addGroupBy('abnormal_yn')
        query.addOrderBy('pat_id')
        query.addOrderBy('sop.order_proc_id')
        query.addOrderBy('proc_code')
        query.addOrderBy('order_time')
        query.setLimit(self._num_requested_episodes)

        self._num_reported_episodes = FeatureMatrix._query_patient_episodes(
            self, query, index_time_col='order_time')
    elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':
        # NOTE(review): raw string interpolation of self._varTypeInTable and
        # self._lab_var into SQL — acceptable only because both come from
        # internal configuration, never from user input.
        query_str = "SELECT CAST(pat_id AS BIGINT) AS pat_id, order_proc_id, %s, order_time, " % self._varTypeInTable
        if not self._isLabPanel:
            query_str += "CASE WHEN result_in_range_yn = 'Y' THEN 1 ELSE 0 END AS component_normal "
        else:
            query_str += "CAST(SUM(CASE WHEN result_in_range_yn != 'Y' THEN 1 ELSE 0 END) = 0 AS INT) AS all_components_normal "
        query_str += "FROM labs "
        query_str += "WHERE %s = '%s' " % (self._varTypeInTable, self._lab_var)
        query_str += "AND pat_id IN "
        # FIX: the original built this with a quadratic `+=` loop and then
        # stripped the last character with [:-1]; with an empty patient list
        # that stripped the opening '(' and produced malformed SQL. join()
        # handles both cases and is linear.
        query_str += "(%s) " % ",".join(str(pat_id) for pat_id in random_patient_list)
        query_str += "GROUP BY pat_id, order_proc_id, %s, order_time " % self._varTypeInTable
        query_str += "ORDER BY pat_id, order_proc_id, %s, order_time " % self._varTypeInTable
        query_str += "LIMIT %d" % self._num_requested_episodes

        self._num_reported_episodes = FeatureMatrix._querystr_patient_episodes(
            self, query_str, index_time_col='order_time')
def _query_patient_episodes(self, pat_id_col="pat_deid"):
    """Query microbiology patient episodes from the `micro` table.

    Selects one row per (patient, proc_code, order_time, organism) for the
    randomly sampled patients, limited to the requested episode count, and
    delegates materialization to FeatureMatrix._query_patient_episodes.
    Sets self._lab_components and self._num_reported_episodes.
    """
    log.info('Querying patient episodes...')

    # Initialize DB cursor.
    cursor = self._connection.cursor()

    # Build parameters for query.
    self._lab_components = self._get_components_in_lab_panel()
    random_patient_list = self._get_random_patient_list()
    print("RANDOM PATIENT LIST", random_patient_list)

    # Assemble the episode query; organism_name carries the result itself.
    query = SQLQuery()
    select_cols = (
        'CAST(micro.order_time AS TIMESTAMP)',
        'proc_code',
        'order_time',
        'organism_name',
    )
    for col in select_cols:
        query.addSelect(col)

    query.addFrom('micro')
    query.addWhereIn("micro.pat_deid", random_patient_list)

    # Group to collapse duplicate rows per episode/organism combination.
    for col in ('micro.pat_deid', 'proc_code', 'order_time', 'organism_name'):
        query.addGroupBy(col)
    for col in ('micro.pat_deid', 'proc_code', 'micro.order_time'):
        query.addOrderBy(col)
    query.setLimit(self._num_requested_episodes)

    self._num_reported_episodes = FeatureMatrix._query_patient_episodes(
        self, query, index_time_col='order_time')
def grade_cases(self, sim_patient_ids, sim_grader_id, conn=None):
    """Given the identifiers for a bunch of simulated physician-patient case
    records, and the identifier for a particular grading key to use,
    calculate what grade each case would get based on the choices made and
    return a list of per-case grade records (each including the total score,
    the most-graded user, and the most-active user for the case).

    Args:
        sim_patient_ids: iterable of sim_patient_id values to grade.
        sim_grader_id: which grading key (sim_grading_key.sim_grader_id) to apply.
        conn: optional existing DB connection; if None, one is created and
            closed before returning.
    """
    ext_conn = True
    if conn is None:
        conn = self.connFactory.connection()
        ext_conn = False
    try:
        # Inner query retrieves physician-patient cases with ranking group_names
        # (to later select first) per case for specified cases. Each NULL
        # group_name is treated as a separate group by assigning it
        # sim_patient_order_id. It also omits Default user (sim_user_id = 0)
        # from grading.
        inner_query = SQLQuery()
        inner_query.addSelect("score")
        inner_query.addSelect(
            "rank() over ("  # ranks rows incrementally in the same group
            " partition by coalesce(group_name, sim_patient_order_id::text), sim_patient_id"
            " order by sim_patient_order_id"
            ")")
        inner_query.addSelect("sim_user_id")
        inner_query.addSelect("sim_patient_id")
        inner_query.addSelect("sim_grader_id")
        inner_query.addFrom("sim_patient_order spo")
        inner_query.addJoin(
            "sim_grading_key sgk",
            "sgk.clinical_item_id = spo.clinical_item_id"
            " and sgk.sim_state_id = spo.sim_state_id")
        inner_query.addWhereEqual("sgk.sim_grader_id", sim_grader_id)
        inner_query.addWhereNotEqual(
            "spo.sim_user_id",
            0)  # 0 = ignore 'Default user', sets up initial cases
        inner_query.addWhereIn("spo.sim_patient_id", sim_patient_ids)
        inner_query.addOrderBy("relative_time_start")
        inner_query.addOrderBy("sim_patient_order_id")

        # Outer query sums the score per patient case and selects most graded
        # physician for the case. Theoretically, it isn't necessarily the most
        # active physician for the case since his orders might have been
        # dropped by selecting only the first record within group_name group.
        query = SQLQuery()
        query.addSelect("sim_patient_id")
        query.addSelect("sim_grader_id")
        query.addSelect("sum(score) as total_score")
        query.addSelect(
            "mode() within group ("  # mode() selects most frequent value within group
            " order by sim_user_id"
            ") as most_graded_user_id")
        query.addFrom("(" + str(inner_query) + ") as ranked_groups")
        query.addWhereEqual("ranked_groups.rank",
                            1)  # count only first order in the same group
        query.addGroupBy("sim_patient_id")
        query.addGroupBy("sim_grader_id")

        query_params = inner_query.getParams() + query.getParams()
        grades_table = DBUtil.execute(query,
                                      query_params,
                                      includeColumnNames=True,
                                      conn=conn)
        grades_model = modelListFromTable(grades_table)

        # get most active users for the cases
        most_active_user_query = SQLQuery()
        most_active_user_query.addSelect("sim_patient_id")
        most_active_user_query.addSelect("mode() within group ("
                                         " order by sim_user_id"
                                         ") as most_active_user_id")
        most_active_user_query.addFrom("sim_patient_order")
        most_active_user_query.addWhereNotEqual("sim_user_id",
                                                0)  # ignore Default user
        most_active_user_query.addWhereIn("sim_patient_id", sim_patient_ids)
        most_active_user_query.addGroupBy("sim_patient_id")
        most_active_user_query.addOrderBy("sim_patient_id")
        most_active_user_table = DBUtil.execute(most_active_user_query,
                                                includeColumnNames=True,
                                                conn=conn)
        most_active_user_model = modelListFromTable(most_active_user_table)

        # make a dict by sim_patient_id out of results - will be used for combining
        most_active_user_dict = {
            most_active_user["sim_patient_id"]: most_active_user
            for most_active_user in most_active_user_model
        }

        # Combine results. FIX: the original used a list comprehension over
        # grade.update(...), but dict.update() returns None, so the method
        # returned a list of Nones instead of the merged grade records.
        complete_grades = []
        for grade in grades_model:
            grade.update(most_active_user_dict[grade["sim_patient_id"]])
            complete_grades.append(grade)

        return complete_grades
    finally:
        # Only close connections we opened ourselves.
        if not ext_conn:
            conn.close()
def _query_patient_episodes(self):
    """Query culture episodes from stride_culture_micro with per-antibiotic
    susceptibility indicators and per-organism presence indicators.

    For each sampled patient order this emits: MAX()-aggregated 0/1 columns
    for whether each listed antibiotic was tested and found susceptible,
    plus 0/1 columns for the most common organisms. Sets
    self._lab_components and self._num_reported_episodes.
    """
    log.info('Querying patient episodes...')

    # Initialize DB cursor.
    cursor = self._connection.cursor()

    # Build parameters for query.
    self._lab_components = self._get_components_in_lab_panel()
    random_patient_list = self._get_random_patient_list()

    query = SQLQuery()
    query.addSelect('CAST(pat_anon_id AS BIGINT) AS pat_id')
    query.addSelect('CAST(shifted_order_time AS TIMESTAMP)')
    query.addSelect('stride_culture_micro.proc_code')
    query.addSelect('organism_name')  # one column for the result

    # Antibiotics to expand into paired <med>_Susc / <med>_tested columns.
    # (Column names strip '/' and '.' from the drug name; 'suseptibility'
    # matches the misspelled column in the source table.)
    susceptibility_flags = [
        'Trimethoprim/Sulfamethoxazole', 'Vancomycin', 'Penicillin',
        'Levofloxacin', 'Clindamycin', 'Ceftriaxone', 'Erythromycin',
        'Ampicillin', 'Meropenem', 'Ciprofloxacin', 'Cefepime', 'Aztreonam.',
        'Ampicillin/Sulbactam', 'Piperacillin/Tazobactam', 'Linezolid',
        'Oxacillin.', 'Cefazolin', 'Daptomycin'
    ]
    for med in susceptibility_flags:
        med_col = med.replace('/', '_').replace('.', '')
        query.addSelect(
            "MAX(CASE WHEN antibiotic_name = '%s' AND (suseptibility = 'Susceptible' OR suseptibility = 'Positive') THEN 1 ELSE 0 END) AS %s_Susc"
            % (med, med_col))
        query.addSelect(
            "MAX(CASE WHEN antibiotic_name = '%s' THEN 1 ELSE 0 END) as %s_tested"
            % (med, med_col))

    # Overall growth indicator plus one 0/1 column per common organism.
    query.addSelect(
        "CASE WHEN organism_name IS NULL THEN 0 ELSE 1 END AS BACTERIA_PRESENT"
    )
    common_organisms = (
        'ESCHERICHIA COLI',
        'STAPHYLOCOCCUS AUREUS',
        'ENTEROCOCCUS SPECIES',
        'KLEBSIELLA PNEUMONIAE',
        'PSEUDOMONAS AERUGINOSA',
        'COAG NEGATIVE STAPHYLOCOCCUS',
        'ENTEROCOCCUS FAECALIS',
        'PROTEUS MIRABILIS',
        'CANDIDA ALBICANS',
    )
    for organism in common_organisms:
        query.addSelect(
            "CASE WHEN organism_name = '%s' THEN 1 ELSE 0 END AS %s"
            % (organism, organism.replace(' ', '_')))

    query.addFrom('stride_culture_micro')
    query.addWhereIn("stride_culture_micro.proc_code", self._lab_panel)
    query.addWhereIn("pat_anon_id", random_patient_list)

    grouping_cols = ('pat_anon_id', 'shifted_order_time',
                     'stride_culture_micro.proc_code', 'organism_name')
    for col in grouping_cols:
        query.addGroupBy(col)
    for col in grouping_cols:
        query.addOrderBy(col)
    query.setLimit(self._num_requested_episodes)

    self._num_reported_episodes = FeatureMatrix._query_patient_episodes(
        self, query, pat_id_col='pat_id',
        index_time_col='shifted_order_time')
def test_processPatientListInput(self):
    """Test processPatientListInput()."""
    # The factory must refuse to process before any patient list is set.
    with self.assertRaises(ValueError):
        self.factory.processPatientListInput()

    # Initialize DB cursor.
    cursor = self.connection.cursor()

    # Query the distinct patients who have a LABMETB order.
    patientListQuery = SQLQuery()
    patientListQuery.addSelect("CAST(pat_id AS bigint)")
    patientListQuery.addFrom("stride_order_proc")
    patientListQuery.addWhere("proc_code = 'LABMETB'")
    patientListQuery.addGroupBy("pat_id")
    patientListQuery.addOrderBy("1 ASC")
    cursor.execute(str(patientListQuery), patientListQuery.params)

    # Feed the cursor to the factory and iterate the processed list.
    self.factory.setPatientListInput(cursor, "pat_id")
    self.factory.processPatientListInput()
    resultPatientIterator = self.factory.getPatientListIterator()

    # Verify results.
    for expectedPatientId in ("-789", "-456", "-123"):
        self.assertEqual(resultPatientIterator.next()['pat_id'],
                         expectedPatientId)
    resultPatientIterator.close()

    # Build a TSV file containing a patient list with duplicate patients.
    patientListText = (
        "patient_item_id\tpatient_id\tclinical_item_id\titem_date\n"
        "-1000\t-123\t-100\t10/6/2113 10:20\n"
        "-2000\t-123\t-200\t10/6/2113 11:20\n"
        "-2500\t-123\t-100\t10/7/2113 11:20\n"
        "-3000\t-456\t-100\t11/6/2113 10:20\n"
        "-6000\t-789\t-200\t12/6/2113 11:20\n"
    )
    with open("patient_list.tsv", "w") as tsvOut:
        tsvOut.write(patientListText)

    # Fresh factory, fed from the TSV file instead of a DB cursor.
    self.factory = FeatureMatrixFactory()
    patientListTsv = open("patient_list.tsv", "r")
    self.factory.setPatientListInput(patientListTsv, "patient_id")
    self.factory.processPatientListInput()
    resultPatientIterator = self.factory.getPatientListIterator()

    # Verify results (duplicates preserved, file order).
    for expectedPatientId in ("-123", "-123", "-123", "-456", "-789"):
        self.assertEqual(resultPatientIterator.next()['patient_id'],
                         expectedPatientId)
    patientListTsv.close()
    resultPatientIterator.close()

    # Clean up temp files; ignore those that were never created.
    try:
        os.remove("patient_list.tsv")
        os.remove("fmf.patient_list.tsv")
    except OSError:
        pass
def test_buildFeatureMatrix_multiFlowsheet(self):
    """Test buildFeatureMatrix with multiple flowsheet features added."""
    # The factory must refuse to process before episode input is set.
    with self.assertRaises(ValueError):
        self.factory.processPatientEpisodeInput()

    # Initialize DB cursor.
    cursor = self.connection.cursor()

    # One episode row per LABMETB order, with a normal-result count.
    episodeQuery = SQLQuery()
    for selectCol in ("CAST(pat_id AS bigint)",
                      "sop.order_proc_id AS order_proc_id",
                      "proc_code",
                      "order_time",
                      "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results"):
        episodeQuery.addSelect(selectCol)
    episodeQuery.addFrom("stride_order_proc AS sop")
    episodeQuery.addFrom("stride_order_results AS sor")
    episodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id")
    episodeQuery.addWhereEqual("proc_code", "LABMETB")
    episodeQuery.addGroupBy("pat_id, sop.order_proc_id, proc_code, order_time")
    episodeQuery.addOrderBy("pat_id, sop.order_proc_id, proc_code, order_time")
    cursor.execute(str(episodeQuery), episodeQuery.params)

    # Set and process patientEpisodeInput.
    self.factory.setPatientEpisodeInput(cursor, "pat_id", "order_time")
    self.factory.processPatientEpisodeInput()

    # Normalize types on the iterated episodes for comparison.
    resultPatientEpisodes = []
    for episode in self.factory.getPatientEpisodeIterator():
        episode["pat_id"] = int(episode["pat_id"])
        episode["order_time"] = DBUtil.parseDateValue(episode["order_time"])
        resultPatientEpisodes.append(episode)

    # Verify results (note sort order).
    expectedPatientEpisodes = FM_TEST_OUTPUT["test_processPatientEpisodeInput"]
    self.assertEqualList(resultPatientEpisodes, expectedPatientEpisodes)

    # Add flowsheet features: look for data in the 90 days BEFORE each
    # episode, never after.
    flowsheetNames = ["Resp", "FiO2", "Glasgow Coma Scale Score"]
    self.factory.addFlowsheetFeatures(flowsheetNames,
                                      datetime.timedelta(-90),
                                      datetime.timedelta(0))

    self.factory.buildFeatureMatrix()
    resultMatrix = self.factory.readFeatureMatrixFile()

    # Verify results against the golden matrix (skip the 2 header rows).
    expectedMatrix = FM_TEST_OUTPUT[
        "test_buildFeatureMatrix_multiFlowsheet"]["expectedMatrix"]
    self.assertEqualTable(expectedMatrix, resultMatrix[2:], precision=5)

    # Clean up the generated matrix file if it exists.
    try:
        os.remove(self.factory.getMatrixFileName())
    except OSError:
        pass
def test_performance(self):
    """Benchmark FeatureMatrixFactory against the legacy DataExtractor.

    Builds the same feature set (clinical items PerfItem300/400/500 and
    labs Foo/Bar/Baz/Qux) through both pipelines, times each, and
    asserts the factory pipeline is faster than the extractor pipeline.

    Fixes a copy/paste defect in the original: the PerfItem400 and
    PerfItem500 extractions were labeled "PerfItem300" (see
    _addExtractorFeatures).
    """
    # Initialize DB cursor.
    cursor = self.connection.cursor()

    # --- FeatureMatrixFactory pipeline (timed) ---
    factoryStart = time.time()
    self.factory = FeatureMatrixFactory()

    # Query the list of patient episodes and feed it to the factory.
    patientEpisodeQuery = self._buildPatientEpisodeQuery()
    cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params)
    self.factory.setPatientEpisodeInput(cursor, "pat_id", "order_time")
    self.factory.processPatientEpisodeInput()

    # Look for lab data 90 days before each episode, but never after it.
    preTimeDelta = datetime.timedelta(-90)
    postTimeDelta = datetime.timedelta(0)

    # Add clinical item features.
    self.factory.addClinicalItemFeatures(["PerfItem300"])
    self.factory.addClinicalItemFeatures(["PerfItem400"])
    self.factory.addClinicalItemFeatures(["PerfItem500"])

    # Add lab result features.
    for labBaseName in ["Foo", "Bar", "Baz", "Qux"]:
        self.factory.addLabResultFeatures(
            [labBaseName], False, preTimeDelta, postTimeDelta)

    # Add flowsheet features.
    self.factory.addFlowsheetFeatures(
        ["Perflow"], preTimeDelta, postTimeDelta)

    # Build matrix.
    self.factory.buildFeatureMatrix()
    factoryStop = time.time()

    # --- DataExtractor pipeline (timed) ---
    extractorStart = time.time()
    extractor = DataExtractor()
    extractor.dataCache = dict()

    # NOTE(review): opened in text mode despite the .gz suffix, so the
    # output is not actually gzip-compressed — preserved from the
    # original; confirm whether compression was intended.
    outFile = open("extractor.feature_matrix.tab.gz", "w")
    formatter = TextResultsFormatter(outFile)

    # Re-run the same episode query for the extractor pass.
    patientEpisodeQuery = self._buildPatientEpisodeQuery()
    cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params)

    # Materialize episode rows as RowItemModels.
    patientEpisodes = list()
    row = cursor.fetchone()
    while row is not None:
        (pat_id, order_proc_id, proc_code, order_time, normal_results) = row
        patientEpisode = RowItemModel({
            "patient_id": pat_id,
            "order_proc_id": order_proc_id,
            "proc_code": proc_code,
            "order_time": order_time,
            "result_normal_count": normal_results,
        })
        patientEpisodes.append(patientEpisode)
        row = cursor.fetchone()

    # Per-patient accumulation state.
    lastPatientId = None
    colNames = None
    patientEpisodeByIndexTime = None

    # NOTE(review): tempColNames keeps growing across patients; only the
    # snapshot taken after the first patient becomes the header row
    # (colNames). Preserved as-is from the original implementation.
    tempColNames = [
        "patient_id", "order_proc_id", "proc_code", "order_time",
        "result_normal_count",
    ]

    for patientEpisode in patientEpisodes:
        patientId = patientEpisode["patient_id"]
        if lastPatientId is not None and lastPatientId != patientId:
            # Patient boundary: extract features for the accumulated
            # episodes, then write them out.
            # NOTE(review): the original queried with the NEW patient's
            # id here (not lastPatientId, whose episodes are in
            # patientEpisodeByIndexTime) — preserved; verify intent.
            self._addExtractorFeatures(
                extractor, patientEpisodeByIndexTime, patientId,
                tempColNames, preTimeDelta, postTimeDelta)

            if colNames is None:
                # First boundary: snapshot and print the header row.
                colNames = tempColNames
                formatter.formatTuple(colNames)
            # One output row per patient episode.
            formatter.formatResultDicts(
                patientEpisodeByIndexTime.values(), colNames)

        if lastPatientId is None or lastPatientId != patientId:
            # Start aggregating a fresh patient's episodes.
            patientEpisodeByIndexTime = dict()
        patientEpisodeByIndexTime[
            patientEpisode["order_time"]] = patientEpisode
        lastPatientId = patientId
        outFile.flush()

    # Final patient: the loop only flushes on a patient boundary, so the
    # last accumulated patient is handled here (flowsheet extraction was
    # disabled in the original and is omitted for both paths).
    patientId = lastPatientId
    self._addExtractorFeatures(
        extractor, patientEpisodeByIndexTime, patientId,
        tempColNames, preTimeDelta, postTimeDelta)
    formatter.formatResultDicts(
        patientEpisodeByIndexTime.values(), colNames)

    # Close file.
    outFile.close()
    extractorStop = time.time()

    # Compare results: the factory should beat the extractor.
    factoryTime = factoryStop - factoryStart
    extractorTime = extractorStop - extractorStart
    self.assertTrue(extractorTime > factoryTime)

    # Clean up feature matrix files (best effort).
    for matrixFileName in ("extractor.feature_matrix.tab.gz",
                           self.factory.getMatrixFileName()):
        try:
            os.remove(matrixFileName)
        except OSError:
            pass

def _buildPatientEpisodeQuery(self):
    """Build the patient episode SQL query used by both benchmark passes."""
    patientEpisodeQuery = SQLQuery()
    patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)")
    patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id")
    patientEpisodeQuery.addSelect("proc_code")
    patientEpisodeQuery.addSelect("order_time")
    patientEpisodeQuery.addSelect(
        "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results"
    )
    patientEpisodeQuery.addFrom("stride_order_proc AS sop")
    patientEpisodeQuery.addFrom("stride_order_results AS sor")
    patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id")
    patientEpisodeQuery.addWhereIn("proc_code", ["Foo", "Bar", "Baz", "Qux"])
    patientEpisodeQuery.addGroupBy(
        "pat_id, sop.order_proc_id, proc_code, order_time")
    patientEpisodeQuery.addOrderBy(
        "pat_id, sop.order_proc_id, proc_code, order_time")
    return patientEpisodeQuery

def _addExtractorFeatures(self, extractor, patientEpisodeByIndexTime,
                          patientId, tempColNames, preTimeDelta,
                          postTimeDelta):
    """Add clinical item and lab features for one patient via DataExtractor.

    Replaces the original's duplicated per-item blocks and fixes the
    copy/paste bug where PerfItem400/PerfItem500 event times were
    registered under the label "PerfItem300". Appends the generated
    column names to tempColNames (mutated in place).
    """
    # Clinical item features.
    for clinicalItemName in ("PerfItem300", "PerfItem400", "PerfItem500"):
        eventTimes = extractor.parseClinicalItemData_singlePatient(
            modelListFromTable(extractor.queryClinicalItemsByName(
                (clinicalItemName,), [patientId])))
        tempColNames.extend(
            extractor.addClinicalItemFeatures_singlePatient(
                eventTimes, patientEpisodeByIndexTime, clinicalItemName,
                daysBins=[]))

    # Lab result features.
    for labBaseName in ("Foo", "Bar", "Baz", "Qux"):
        labResultTable = extractor.queryLabResults([labBaseName], [patientId])
        labsByBaseName = extractor.parseLabResultsData_singlePatient(
            modelListFromTable(labResultTable))
        tempColNames.extend(extractor.addLabFeatures_singlePatient(
            patientEpisodeByIndexTime, labsByBaseName, [labBaseName],
            preTimeDelta, postTimeDelta))