def _get_random_patient_list(self): # Initialize DB cursor. cursor = self._connection.cursor() # Get average number of results for this lab test per patient. avg_orders_per_patient = self._get_average_orders_per_patient() log.info('avg_orders_per_patient: %s' % avg_orders_per_patient) # Based on average # of results, figure out how many patients we'd # need to get for a feature matrix of requested size. self._num_patients = int(numpy.max([self._num_requested_episodes / \ avg_orders_per_patient, 1])) # Get numPatientsToQuery random patients who have gotten test. # TODO(sbala): Have option to feed in a seed for the randomness. query = SQLQuery() query.addSelect('pat_id') query.addFrom('stride_order_proc AS sop') query.addWhereIn('proc_code', [self._lab_panel]) query.addOrderBy('RANDOM()') query.setLimit(self._num_patients) log.debug('Querying random patient list...') results = DBUtil.execute(query) # Get patient list. random_patient_list = [ row[0] for row in results ] return random_patient_list
def querySourceItems(self, userSIDs, limit=None, offset=None, progress=None, conn=None): """Query the database for list of all AccessLogs and yield the results one at a time. If userSIDs provided, only return items matching those IDs. """ extConn = conn is not None if not extConn: conn = self.connFactory.connection() # Column headers to query for that map to respective fields in analysis table headers = [ "user_id", "user_name", "de_pat_id", "access_datetime", "metric_id", "metric_name", "line_count", "description", "metric_group_num", "metric_group_name" ] query = SQLQuery() for header in headers: query.addSelect(header) query.addFrom(self.sourceTableName) if userSIDs is not None: query.addWhereIn("user_id", userSIDs) query.setLimit(limit) query.setOffset(offset) # Query to get an estimate of how long the process will be if progress is not None: progress.total = DBUtil.execute(query.totalQuery(), conn=conn)[0][0] cursor = conn.cursor() # Do one massive query, but yield data for one item at a time. cursor.execute(str(query), tuple(query.params)) row = cursor.fetchone() while row is not None: rowModel = RowItemModel(row, headers) yield rowModel row = cursor.fetchone() # Slight risk here. Normally DB connection closing should be in finally of a try block, # but using the "yield" generator construct forbids us from using a try, finally construct. cursor.close() if not extConn: conn.close()
def _query_patient_episodes(self): log.info('Querying patient episodes...') # Initialize DB cursor. cursor = self._connection.cursor() # Build parameters for query. self._lab_components = self._get_components_in_lab_panel() random_patient_list = self._get_random_patient_list() # Build SQL query for list of patient episodes. # Note that for 2008-2014 data, result_flag can take on any of the # following values: High, Low, High Panic, Low Panic, # Low Off-Scale, Negative, Positive, Resistant, Susceptible, Abnormal, * # (NONE): Only 27 lab components can have this flag. None has this # value for more than 5 results, so ignore it. # *: Only 10 lab components can have this flag. Only 6 have it for # >10 tests, and each of them is a microbiology test for which # a flag is less meaningful, e.g. Gram Stain, AFB culture. # Susceptible: Only 15 lab components can have this flag. All of those # only have this value for 2 results, so ignore it. # Resistant: Only 1 lab component can have this flag. Only two results # have this value, so ignore it. # Abnormal: 1462 lab components can have this flag. Many (e.g. UBLOOD) # have this value for thousands of results, so include it. # Negative: Only 10 lab components can have this flag, and all for # less than 5 results, so ignore it. # Positive: Only 3 lab components can have this flag, and all for # only 1 result, so ignore it. # Low Off-Scale: Only 1 lab component can have this flag, and only for # 3 results, so ignore it. # Low Panic: 1401 lab components can have this flag, many core # metabolic components. Include it. # High Panic: 8084 lab components can have this flag, many core # metabolic components. Include it. if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE': query = SQLQuery() ''' pat_id: hashed patient id ''' query.addSelect('CAST(pat_id AS BIGINT) as pat_id') ''' order_proc_id: unique identifier for an episode ''' query.addSelect('sop.order_proc_id') ''' self._varTypeInTable: usually proc_code or base_name, the column of the lab to be queried ''' query.addSelect(self._varTypeInTable) ''' order_time: The time of the order. Note that sor table does not have this info. ''' query.addSelect('order_time') ''' y-labels related columns, choose one to predict (for now, use all_components_normal to predict). ''' if self._isLabPanel: query.addSelect( "CASE WHEN abnormal_yn = 'Y' THEN 1 ELSE 0 END AS abnormal_panel" ) # query.addSelect( "SUM(CASE WHEN result_flag IN ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') OR result_flag IS NULL THEN 1 ELSE 0 END) AS num_components" ) # sx query.addSelect( "SUM(CASE WHEN result_flag IS NULL THEN 1 ELSE 0 END) AS num_normal_components" ) # sx query.addSelect( "CAST(SUM(CASE WHEN result_flag IN ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') THEN 1 ELSE 0 END) = 0 AS INT) AS all_components_normal" ) # sx else: query.addSelect( "CASE WHEN result_flag IN ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') THEN 0 ELSE 1 END AS component_normal" ) ''' Relevant tables. Note that sor table does not have patient_id info; need to join sop to obtain it. ''' query.addFrom('stride_order_proc AS sop') query.addFrom('stride_order_results AS sor') query.addWhere('sop.order_proc_id = sor.order_proc_id') ''' Condition: self._time_limit[0] < order_time < self._time_limit[1] ''' if self._time_limit: if self._time_limit[0]: query.addWhere("sop.order_time > '%s'" % self._time_limit[0]) if self._time_limit[1]: query.addWhere("sop.order_time < '%s'" % self._time_limit[1]) query.addWhere( "(result_flag in ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') OR result_flag IS NULL)" ) query.addWhereIn(self._varTypeInTable, [self._lab_var]) # sx query.addWhereIn("pat_id", random_patient_list) query.addGroupBy('pat_id') query.addGroupBy('sop.order_proc_id') query.addGroupBy(self._varTypeInTable) query.addGroupBy('order_time') if not self._isLabPanel: query.addGroupBy('result_flag') query.addGroupBy('abnormal_yn') # query.addOrderBy('pat_id') query.addOrderBy('sop.order_proc_id') query.addOrderBy(self._varTypeInTable) query.addOrderBy('order_time') query.setLimit(self._num_requested_episodes) self._num_reported_episodes = FeatureMatrix._query_patient_episodes( self, query, index_time_col='order_time') else: ''' Sqlite3 has an interesting limit for the total number of place_holders in a query, and this limit varies across platforms/operating systems (500-99999 on mac, 999 by defaulty). To avoid this problem when querying 1000-10000 patient ids, use string queries instead of the default (convenient) routine in DBUtil. ''' query_str = "SELECT CAST(pat_id AS BIGINT) AS pat_id, order_proc_id, %s, order_time, " % self._varTypeInTable if not self._isLabPanel: query_str += "CASE WHEN result_in_range_yn = 'Y' THEN 1 ELSE 0 END AS component_normal " else: query_str += "CAST(SUM(CASE WHEN result_in_range_yn != 'Y' THEN 1 ELSE 0 END) = 0 AS INT) AS all_components_normal " query_str += "FROM labs " query_str += "WHERE %s = '%s' " % (self._varTypeInTable, self._lab_var) query_str += "AND pat_id IN " pat_list_str = "(" for pat_id in random_patient_list: pat_list_str += str(pat_id) + "," pat_list_str = pat_list_str[:-1] + ") " query_str += pat_list_str query_str += "GROUP BY pat_id, order_proc_id, %s, order_time " % self._varTypeInTable query_str += "ORDER BY pat_id, order_proc_id, %s, order_time " % self._varTypeInTable query_str += "LIMIT %d" % self._num_requested_episodes self._num_reported_episodes = FeatureMatrix._query_patient_episodes( self, query_str, index_time_col='order_time')
def _query_patient_episodes(self, pat_id_col="pat_deid"): log.info('Querying patient episodes...') # Initialize DB cursor. cursor = self._connection.cursor() # Build parameters for query. self._lab_components = self._get_components_in_lab_panel() random_patient_list = self._get_random_patient_list() print("RANDOM PATIENT LIST", random_patient_list) # Build SQL query for list of patient episodes. # Note that for 2008-2014 data, result_flag can take on any of the # following values: High, Low, High Panic, Low Panic, # Low Off-Scale, Negative, Positive, Resistant, Susceptible, Abnormal, * # (NONE): Only 27 lab components can have this flag. None has this # value for more than 5 results, so ignore it. # *: Only 10 lab components can have this flag. Only 6 have it for # >10 tests, and each of them is a microbiology test for which # a flag is less meaningful, e.g. Gram Stain, AFB culture. # Susceptible: Only 15 lab components can have this flag. All of those # only have this value for 2 results, so ignore it. # Resistant: Only 1 lab component can have this flag. Only two results # have this value, so ignore it. # Abnormal: 1462 lab components can have this flag. Many (e.g. UBLOOD) # have this value for thousands of results, so include it. # Negative: Only 10 lab components can have this flag, and all for # less than 5 results, so ignore it. # Positive: Only 3 lab components can have this flag, and all for # only 1 result, so ignore it. # Low Off-Scale: Only 1 lab component can have this flag, and only for # 3 results, so ignore it. # Low Panic: 1401 lab components can have this flag, many core # metabolic components. Include it. # High Panic: 8084 lab components can have this flag, many core # metabolic components. Include it. query = SQLQuery() #query.addSelect('CAST(micro.pat_deid AS BIGINT)') query.addSelect('CAST(micro.order_time AS TIMESTAMP)') #query.addSelect('sop.order_proc_id AS order_proc_id') query.addSelect('proc_code') query.addSelect('order_time') query.addSelect('organism_name') #one for the result # query.addSelect("CASE WHEN abnormal_yn = 'Y' THEN 1 ELSE 0 END AS abnormal_panel") # query.addSelect("SUM(CASE WHEN result_flag IN ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') OR result_flag IS NULL THEN 1 ELSE 0 END) AS num_components") # query.addSelect("SUM(CASE WHEN result_flag IS NULL THEN 1 ELSE 0 END) AS num_normal_components") # query.addSelect("CAST(SUM(CASE WHEN result_flag IN ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') THEN 1 ELSE 0 END) = 0 AS INT) AS all_components_normal") # query.addFrom('stride_order_proc AS sop') # query.addFrom('stride_order_results AS sor') query.addFrom('micro') #query.addWhere('sop.order_proc_id = sor.order_proc_id') #query.addWhere("(result_flag in ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') OR result_flag IS NULL)") #query.addWhereIn("micro.proc_code", [self._lab_panel]) query.addWhereIn("micro.pat_deid", random_patient_list) query.addGroupBy('micro.pat_deid') #query.addGroupBy('sop.order_proc_id') query.addGroupBy('proc_code') query.addGroupBy('order_time') query.addGroupBy('organism_name') #query.addGroupBy('abnormal_yn') query.addOrderBy('micro.pat_deid') #query.addOrderBy('sop.order_proc_id') query.addOrderBy('proc_code') query.addOrderBy('micro.order_time') query.setLimit(self._num_requested_episodes) self._num_reported_episodes = FeatureMatrix._query_patient_episodes(self, query, index_time_col='order_time')
def _query_patient_episodes(self): log.info('Querying patient episodes...') # Initialize DB cursor. cursor = self._connection.cursor() # Build parameters for query. self._lab_components = self._get_components_in_lab_panel() random_patient_list = self._get_random_patient_list() #print("RANDOM PATIENT LIST", random_patient_list) # Build SQL query for list of patient episodes. # Note that for 2008-2014 data, result_flag can take on any of the # following values: High, Low, High Panic, Low Panic, # Low Off-Scale, Negative, Positive, Resistant, Susceptible, Abnormal, * # (NONE): Only 27 lab components can have this flag. None has this # value for more than 5 results, so ignore it. # *: Only 10 lab components can have this flag. Only 6 have it for # >10 tests, and each of them is a microbiology test for which # a flag is less meaningful, e.g. Gram Stain, AFB culture. # Susceptible: Only 15 lab components can have this flag. All of those # only have this value for 2 results, so ignore it. # Resistant: Only 1 lab component can have this flag. Only two results # have this value, so ignore it. # Abnormal: 1462 lab components can have this flag. Many (e.g. UBLOOD) # have this value for thousands of results, so include it. # Negative: Only 10 lab components can have this flag, and all for # less than 5 results, so ignore it. # Positive: Only 3 lab components can have this flag, and all for # only 1 result, so ignore it. # Low Off-Scale: Only 1 lab component can have this flag, and only for # 3 results, so ignore it. # Low Panic: 1401 lab components can have this flag, many core # metabolic components. Include it. # High Panic: 8084 lab components can have this flag, many core # metabolic components. Include it. query = SQLQuery() query.addSelect('CAST(pat_anon_id AS BIGINT) AS pat_id') query.addSelect('CAST(shifted_order_time AS TIMESTAMP)') query.addSelect('stride_culture_micro.proc_code') query.addSelect('organism_name') #one for the result # Experimenting susceptibility_flags = [ 'Trimethoprim/Sulfamethoxazole', 'Vancomycin', 'Penicillin', 'Levofloxacin', 'Clindamycin', 'Ceftriaxone', 'Erythromycin', 'Ampicillin', 'Meropenem', 'Ciprofloxacin', 'Cefepime', 'Aztreonam.', 'Ampicillin/Sulbactam', 'Piperacillin/Tazobactam', 'Linezolid', 'Oxacillin.', 'Cefazolin', 'Daptomycin' ] for med in susceptibility_flags: med_col = med.replace('/', '_').replace('.', '') query.addSelect( "MAX(CASE WHEN antibiotic_name = '%s' AND (suseptibility = 'Susceptible' OR suseptibility = 'Positive') THEN 1 ELSE 0 END) AS %s_Susc" % (med, med_col)) query.addSelect( "MAX(CASE WHEN antibiotic_name = '%s' THEN 1 ELSE 0 END) as %s_tested" % (med, med_col)) # Let us look at top 10 commonly occuring bacteria query.addSelect( "CASE WHEN organism_name IS NULL THEN 0 ELSE 1 END AS BACTERIA_PRESENT" ) query.addSelect( "CASE WHEN organism_name = 'ESCHERICHIA COLI' THEN 1 ELSE 0 END AS ESCHERICHIA_COLI" ) query.addSelect( "CASE WHEN organism_name = 'STAPHYLOCOCCUS AUREUS' THEN 1 ELSE 0 END AS STAPHYLOCOCCUS_AUREUS" ) query.addSelect( "CASE WHEN organism_name = 'ENTEROCOCCUS SPECIES' THEN 1 ELSE 0 END AS ENTEROCOCCUS_SPECIES" ) query.addSelect( "CASE WHEN organism_name = 'KLEBSIELLA PNEUMONIAE' THEN 1 ELSE 0 END AS KLEBSIELLA_PNEUMONIAE" ) query.addSelect( "CASE WHEN organism_name = 'PSEUDOMONAS AERUGINOSA' THEN 1 ELSE 0 END AS PSEUDOMONAS_AERUGINOSA" ) query.addSelect( "CASE WHEN organism_name = 'COAG NEGATIVE STAPHYLOCOCCUS' THEN 1 ELSE 0 END AS COAG_NEGATIVE_STAPHYLOCOCCUS" ) query.addSelect( "CASE WHEN organism_name = 'ENTEROCOCCUS FAECALIS' THEN 1 ELSE 0 END AS ENTEROCOCCUS_FAECALIS" ) query.addSelect( "CASE WHEN organism_name = 'PROTEUS MIRABILIS' THEN 1 ELSE 0 END AS PROTEUS_MIRABILIS" ) query.addSelect( "CASE WHEN organism_name = 'CANDIDA ALBICANS' THEN 1 ELSE 0 END AS CANDIDA_ALBICANS" ) query.addFrom('stride_culture_micro') query.addWhereIn("stride_culture_micro.proc_code", self._lab_panel) query.addWhereIn("pat_anon_id", random_patient_list) query.addGroupBy('pat_anon_id') query.addGroupBy('shifted_order_time') query.addGroupBy('stride_culture_micro.proc_code') query.addGroupBy('organism_name') query.addOrderBy('pat_anon_id') query.addOrderBy('shifted_order_time') query.addOrderBy('stride_culture_micro.proc_code') query.addOrderBy('organism_name') query.setLimit(self._num_requested_episodes) self._num_reported_episodes = FeatureMatrix._query_patient_episodes( self, query, pat_id_col='pat_id', index_time_col='shifted_order_time')
def _query_patient_episodes(self): log.info('Querying patient episodes...') # Initialize DB cursor. cursor = self._connection.cursor() # Build parameters for query. self._lab_components = self._get_components_in_lab_panel() random_patient_list = self._get_random_patient_list() #print("RANDOM PATIENT LIST", random_patient_list) # Build SQL query for list of patient episodes. # Note that for 2008-2014 data, result_flag can take on any of the # following values: High, Low, High Panic, Low Panic, # Low Off-Scale, Negative, Positive, Resistant, Susceptible, Abnormal, * # (NONE): Only 27 lab components can have this flag. None has this # value for more than 5 results, so ignore it. # *: Only 10 lab components can have this flag. Only 6 have it for # >10 tests, and each of them is a microbiology test for which # a flag is less meaningful, e.g. Gram Stain, AFB culture. # Susceptible: Only 15 lab components can have this flag. All of those # only have this value for 2 results, so ignore it. # Resistant: Only 1 lab component can have this flag. Only two results # have this value, so ignore it. # Abnormal: 1462 lab components can have this flag. Many (e.g. UBLOOD) # have this value for thousands of results, so include it. # Negative: Only 10 lab components can have this flag, and all for # less than 5 results, so ignore it. # Positive: Only 3 lab components can have this flag, and all for # only 1 result, so ignore it. # Low Off-Scale: Only 1 lab component can have this flag, and only for # 3 results, so ignore it. # Low Panic: 1401 lab components can have this flag, many core # metabolic components. Include it. # High Panic: 8084 lab components can have this flag, many core # metabolic components. Include it. query = SQLQuery() query.addSelect('CAST(pat_anon_id AS BIGINT) AS pat_id') query.addSelect('CAST(shifted_order_time AS TIMESTAMP)') query.addSelect('stride_culture_micro.proc_code') query.addSelect('organism_name') #one for the result # Let us look at top 10 commonly occuring bacteria query.addSelect( "CASE WHEN organism_name IS NULL THEN 1 ELSE 0 END AS NO_BACTERIA") query.addSelect( "CASE WHEN organism_name = 'ESCHERICHIA COLI' THEN 1 ELSE 0 END AS ESCHERICHIA_COLI" ) query.addSelect( "CASE WHEN organism_name = 'STAPHYLOCOCCUS AUREUS' THEN 1 ELSE 0 END AS STAPHYLOCOCCUS_AUREUS" ) query.addSelect( "CASE WHEN organism_name = 'ENTEROCOCCUS SPECIES' THEN 1 ELSE 0 END AS ENTEROCOCCUS_SPECIES" ) query.addSelect( "CASE WHEN organism_name = 'KLEBSIELLA PNEUMONIAE' THEN 1 ELSE 0 END AS KLEBSIELLA_PNEUMONIAE" ) query.addSelect( "CASE WHEN organism_name = 'PSEUDOMONAS AERUGINOSA' THEN 1 ELSE 0 END AS PSEUDOMONAS_AERUGINOSA" ) query.addSelect( "CASE WHEN organism_name = 'COAG NEGATIVE STAPHYLOCOCCUS' THEN 1 ELSE 0 END AS COAG_NEGATIVE_STAPHYLOCOCCUS" ) query.addSelect( "CASE WHEN organism_name = 'ENTEROCOCCUS FAECALIS' THEN 1 ELSE 0 END AS ENTEROCOCCUS_FAECALIS" ) query.addSelect( "CASE WHEN organism_name = 'PROTEUS MIRABILIS' THEN 1 ELSE 0 END AS PROTEUS_MIRABILIS" ) query.addSelect( "CASE WHEN organism_name = 'CANDIDA ALBICANS' THEN 1 ELSE 0 END AS CANDIDA_ALBICANS" ) query.addFrom('stride_culture_micro') query.addWhereIn("stride_culture_micro.proc_code", [self._lab_panel]) query.addWhereIn("pat_id", random_patient_list) query.addGroupBy('pat_id') query.addGroupBy('shifted_order_time') query.addGroupBy('stride_culture_micro.proc_code') query.addGroupBy('organism_name') query.addOrderBy('pat_id') query.addOrderBy('shifted_order_time') query.addOrderBy('stride_culture_micro.proc_code') query.addOrderBy('organism_name') query.setLimit(self._num_requested_episodes) self._num_reported_episodes = FeatureMatrix._query_patient_episodes( self, query, pat_id_col='pat_id', index_time_col='shifted_order_time')
def _query_patient_episodes(self): log.info('Querying patient episodes...') # Initialize DB cursor. cursor = self._connection.cursor() # Build parameters for query. self._lab_components = self._get_components_in_lab_panel() random_patient_list = self._get_random_patient_list() self.usedPatIds = random_patient_list[:] # Build SQL query for list of patient episodes. # Note that for 2008-2014 data, result_flag can take on any of the # following values: High, Low, High Panic, Low Panic, # Low Off-Scale, Negative, Positive, Resistant, Susceptible, Abnormal, * # (NONE): Only 27 lab components can have this flag. None has this # value for more than 5 results, so ignore it. # *: Only 10 lab components can have this flag. Only 6 have it for # >10 tests, and each of them is a microbiology test for which # a flag is less meaningful, e.g. Gram Stain, AFB culture. # Susceptible: Only 15 lab components can have this flag. All of those # only have this value for 2 results, so ignore it. # Resistant: Only 1 lab component can have this flag. Only two results # have this value, so ignore it. # Abnormal: 1462 lab components can have this flag. Many (e.g. UBLOOD) # have this value for thousands of results, so include it. # Negative: Only 10 lab components can have this flag, and all for # less than 5 results, so ignore it. # Positive: Only 3 lab components can have this flag, and all for # only 1 result, so ignore it. # Low Off-Scale: Only 1 lab component can have this flag, and only for # 3 results, so ignore it. # Low Panic: 1401 lab components can have this flag, many core # metabolic components. Include it. # High Panic: 8084 lab components can have this flag, many core # metabolic components. Include it. if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE': # TODO: component query = SQLQuery() query.addSelect('CAST(pat_id AS BIGINT)') query.addSelect('sop.order_proc_id AS order_proc_id') query.addSelect('proc_code') #TODO:sx query.addSelect('order_time') query.addSelect( "CASE WHEN abnormal_yn = 'Y' THEN 1 ELSE 0 END AS abnormal_panel" ) #sx query.addSelect( "SUM(CASE WHEN result_flag IN ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') OR result_flag IS NULL THEN 1 ELSE 0 END) AS num_components" ) #sx query.addSelect( "SUM(CASE WHEN result_flag IS NULL THEN 1 ELSE 0 END) AS num_normal_components" ) #sx query.addSelect( "CAST(SUM(CASE WHEN result_flag IN ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') THEN 1 ELSE 0 END) = 0 AS INT) AS all_components_normal" ) #sx query.addFrom('stride_order_proc AS sop') #sx query.addFrom('stride_order_results AS sor') #sx query.addWhere('sop.order_proc_id = sor.order_proc_id') #sx query.addWhere( "(result_flag in ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') OR result_flag IS NULL)" ) query.addWhereIn("proc_code", [self._lab_var]) #sx query.addWhereIn("pat_id", random_patient_list) query.addGroupBy('pat_id') query.addGroupBy('sop.order_proc_id') #sx query.addGroupBy('proc_code') #sx query.addGroupBy('order_time') query.addGroupBy('abnormal_yn') #sx query.addOrderBy('pat_id') query.addOrderBy('sop.order_proc_id') #sx query.addOrderBy('proc_code') #sx query.addOrderBy('order_time') query.setLimit(self._num_requested_episodes) self._num_reported_episodes = FeatureMatrix._query_patient_episodes( self, query, index_time_col='order_time') elif LocalEnv.DATASET_SOURCE_NAME == 'UMich': # query = SQLQuery() # query.addSelect('CAST(pat_id AS BIGINT) AS pat_id') # query.addSelect('order_proc_id') # query.addSelect(self._varTypeInTable) # query.addSelect('order_time') # # query.addWhereIn(self._varTypeInTable, [self._lab_var]) # # if self._isLabPanel: # # query.addSelect("SUM(CASE WHEN result_in_range_yn IN ('N', 'Y') THEN 1 ELSE 0 END) AS num_components") # query.addSelect("SUM(CASE WHEN result_in_range_yn = 'Y' THEN 1 ELSE 0 END) AS num_normal_components") # query.addSelect("CAST(SUM(CASE WHEN result_in_range_yn = 'N' THEN 1 ELSE 0 END) = 0 AS INT) AS all_components_normal") #TODO # else: # query.addSelect("CASE WHEN result_in_range_yn = 'Y' THEN 1 ELSE 0 END AS component_normal") # # query.addFrom('labs') # query.addWhereIn("pat_id", random_patient_list) # # query.addGroupBy('pat_id') # query.addGroupBy('order_proc_id') # query.addGroupBy(self._varTypeInTable) # query.addGroupBy('order_time') # # query.addOrderBy('pat_id') # query.addOrderBy('order_proc_id') # query.addOrderBy(self._varTypeInTable) # query.addOrderBy('order_time') # # query.setLimit(self._num_requested_episodes) # # print query query_str = "SELECT CAST(pat_id AS BIGINT) AS pat_id, order_proc_id, %s, order_time, " % self._varTypeInTable if not self._isLabPanel: query_str += "CASE WHEN result_in_range_yn = 'Y' THEN 1 ELSE 0 END AS component_normal " else: query_str += "CAST(SUM(CASE WHEN result_in_range_yn != 'Y' THEN 1 ELSE 0 END) = 0 AS INT) AS all_components_normal " # query_str += "CASE WHEN result_in_range_yn = 'Y' THEN 1 ELSE 0 END AS component_normal " query_str += "FROM labs " query_str += "WHERE %s = '%s' " % (self._varTypeInTable, self._lab_var) query_str += "AND pat_id IN " pat_list_str = "(" for pat_id in random_patient_list: pat_list_str += str(pat_id) + "," pat_list_str = pat_list_str[:-1] + ") " query_str += pat_list_str query_str += "GROUP BY pat_id, order_proc_id, %s, order_time " % self._varTypeInTable query_str += "ORDER BY pat_id, order_proc_id, %s, order_time " % self._varTypeInTable query_str += "LIMIT %d" % self._num_requested_episodes self._num_reported_episodes = FeatureMatrix._querystr_patient_episodes( self, query_str, index_time_col='order_time')
def _get_random_patient_list(self): # Initialize DB cursor. cursor = self._connection.cursor() if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE': # Get average number of results for this lab test per patient. avg_orders_per_patient = self._get_average_orders_per_patient() log.info('avg_orders_per_patient: %s' % avg_orders_per_patient) # Based on average # of results, figure out how many patients we'd # need to get for a feature matrix of requested size. self._num_patients = int(numpy.max([self._num_requested_episodes / \ avg_orders_per_patient, 1])) # Get numPatientsToQuery random patients who have gotten test. # TODO(sbala): Have option to feed in a seed for the randomness. query = SQLQuery() query.addSelect('CAST(pat_id AS BIGINT) AS pat_id') query.addFrom('stride_order_proc AS sop') query.addWhereIn('proc_code', [self._lab_var]) # TODO: components query.addOrderBy('RANDOM()') query.setLimit(self._num_patients) log.debug('Querying random patient list...') results = DBUtil.execute(query) # Get patient list. random_patient_list = [row[0] for row in results] return random_patient_list elif LocalEnv.DATASET_SOURCE_NAME == 'UMich': # Get average number of results for this lab test per patient. # query = SQLQuery() # query.addSelect('CAST(pat_id AS BIGINT) AS pat_id') # query.addSelect('COUNT(order_proc_id) AS num_orders') # query.addFrom('labs') # query.addWhereIn(self._varTypeInTable, [self._lab_var]) # components = self._get_components_in_lab_panel() # query.addWhereIn("base_name", components) # # if self.notUsePatIds: # query.addWhereNotIn("pat_id", self.notUsePatIds) # # query.addGroupBy('pat_id') query_str = "SELECT CAST(pat_id AS BIGINT) AS pat_id , " query_str += "COUNT(order_proc_id) AS num_orders " query_str += "FROM labs " #query_str += " WHERE %s IN (%s) "%(self._varTypeInTable, self._lab_var) query_str += "WHERE %s = '%s' " % (self._varTypeInTable, self._lab_var) if self.notUsePatIds: query_str += "AND pat_id NOT IN (" for pat_id in self.notUsePatIds: query_str += "%s," % pat_id query_str = query_str[:-1] + ") " # get rid of comma query_str += "GROUP BY pat_id" log.debug('Querying median orders per patient...') results = DBUtil.execute(query_str) order_counts = [row[1] for row in results] if len(results) == 0: error_msg = '0 orders for order "%s."' % self._lab_var #sx log.critical(error_msg) raise Exception(error_msg) # sys.exit('[ERROR] %s' % error_msg) # sxu: sys.exit cannot be caught by Exception else: avg_orders_per_patient = numpy.median(order_counts) log.info('avg_orders_per_patient: %s' % avg_orders_per_patient) # Based on average # of results, figure out how many patients we'd # need to get for a feature matrix of requested size. self._num_patients = int(numpy.max([self._num_requested_episodes / \ avg_orders_per_patient, 1])) # Some components may have fewer associated patients than the required sample size patient_number_chosen = min([len(results), self._num_patients]) # inds_random_patients = numpy.random.choice( len(results), size=patient_number_chosen, replace=False) # print 'inds_random_patients:', inds_random_patients pat_IDs_random_patients = [] for ind in inds_random_patients: pat_IDs_random_patients.append(results[ind][0]) return pat_IDs_random_patients