Esempio n. 1
0
    def _get_random_patient_list(self):
        # Initialize DB cursor.
        cursor = self._connection.cursor()

        # Get average number of results for this lab test per patient.
        avg_orders_per_patient = self._get_average_orders_per_patient()
        log.info('avg_orders_per_patient: %s' % avg_orders_per_patient)
        # Based on average # of results, figure out how many patients we'd
        # need to get for a feature matrix of requested size.
        self._num_patients = int(numpy.max([self._num_requested_episodes / \
            avg_orders_per_patient, 1]))

        # Get numPatientsToQuery random patients who have gotten test.
        # TODO(sbala): Have option to feed in a seed for the randomness.
        query = SQLQuery()
        query.addSelect('pat_id')
        query.addFrom('stride_order_proc AS sop')
        query.addWhereIn('proc_code', [self._lab_panel])
        query.addOrderBy('RANDOM()')
        query.setLimit(self._num_patients)
        log.debug('Querying random patient list...')
        results = DBUtil.execute(query)

        # Get patient list.
        random_patient_list = [ row[0] for row in results ]

        return random_patient_list
Esempio n. 2
0
    def querySourceItems(self,
                         userSIDs,
                         limit=None,
                         offset=None,
                         progress=None,
                         conn=None):
        """Query the database for list of all AccessLogs
        and yield the results one at a time.  If userSIDs provided, only return items matching those IDs.
        """
        extConn = conn is not None
        if not extConn:
            conn = self.connFactory.connection()

        # Column headers to query for that map to respective fields in analysis table
        headers = [
            "user_id", "user_name", "de_pat_id", "access_datetime",
            "metric_id", "metric_name", "line_count", "description",
            "metric_group_num", "metric_group_name"
        ]

        query = SQLQuery()
        for header in headers:
            query.addSelect(header)
        query.addFrom(self.sourceTableName)
        if userSIDs is not None:
            query.addWhereIn("user_id", userSIDs)
        query.setLimit(limit)
        query.setOffset(offset)

        # Query to get an estimate of how long the process will be
        if progress is not None:
            progress.total = DBUtil.execute(query.totalQuery(),
                                            conn=conn)[0][0]

        cursor = conn.cursor()
        # Do one massive query, but yield data for one item at a time.
        cursor.execute(str(query), tuple(query.params))

        row = cursor.fetchone()
        while row is not None:
            rowModel = RowItemModel(row, headers)
            yield rowModel
            row = cursor.fetchone()

        # Slight risk here.  Normally DB connection closing should be in finally of a try block,
        #   but using the "yield" generator construct forbids us from using a try, finally construct.
        cursor.close()

        if not extConn:
            conn.close()
Esempio n. 3
0
    def _query_patient_episodes(self):
        log.info('Querying patient episodes...')
        # Initialize DB cursor.
        cursor = self._connection.cursor()

        # Build parameters for query.
        self._lab_components = self._get_components_in_lab_panel()
        random_patient_list = self._get_random_patient_list()

        # Build SQL query for list of patient episodes.
        # Note that for 2008-2014 data, result_flag can take on any of the
        # following values:  High, Low, High Panic, Low Panic,
        # Low Off-Scale, Negative, Positive, Resistant, Susceptible, Abnormal, *
        # (NONE): Only 27 lab components can have this flag. None has this
        #           value for more than 5 results, so ignore it.
        # *: Only 10 lab components can have this flag. Only 6 have it for
        #           >10 tests, and each of them is a microbiology test for which
        #           a flag is less meaningful, e.g. Gram Stain, AFB culture.
        # Susceptible: Only 15 lab components can have this flag. All of those
        #           only have this value for 2 results, so ignore it.
        # Resistant: Only 1 lab component can have this flag. Only two results
        #           have this value, so ignore it.
        # Abnormal: 1462 lab components can have this flag. Many (e.g. UBLOOD)
        #           have this value for thousands of results, so include it.
        # Negative: Only 10 lab components can have this flag, and all for
        #           less than 5 results, so ignore it.
        # Positive: Only 3 lab components can have this flag, and all for
        #           only 1 result, so ignore it.
        # Low Off-Scale: Only 1 lab component can have this flag, and only for
        #           3 results, so ignore it.
        # Low Panic: 1401 lab components can have this flag, many core
        #           metabolic components. Include it.
        # High Panic: 8084 lab components can have this flag, many core
        #           metabolic components. Include it.

        if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':

            query = SQLQuery()
            '''
            pat_id: hashed patient id
            '''
            query.addSelect('CAST(pat_id AS BIGINT) as pat_id')
            '''
            order_proc_id: unique identifier for an episode
            '''
            query.addSelect('sop.order_proc_id')
            '''
            self._varTypeInTable: usually proc_code or base_name, the column of the lab to be queried
            '''
            query.addSelect(self._varTypeInTable)
            '''
            order_time: The time of the order. Note that sor table does not have this info. 
            '''
            query.addSelect('order_time')
            '''
            y-labels related columns, choose one to predict (for now, use all_components_normal to predict). 
            '''
            if self._isLabPanel:
                query.addSelect(
                    "CASE WHEN abnormal_yn = 'Y' THEN 1 ELSE 0 END AS abnormal_panel"
                )  #
                query.addSelect(
                    "SUM(CASE WHEN result_flag IN ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') OR result_flag IS NULL THEN 1 ELSE 0 END) AS num_components"
                )  # sx
                query.addSelect(
                    "SUM(CASE WHEN result_flag IS NULL THEN 1 ELSE 0 END) AS num_normal_components"
                )  # sx
                query.addSelect(
                    "CAST(SUM(CASE WHEN result_flag IN ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') THEN 1 ELSE 0 END) = 0 AS INT) AS all_components_normal"
                )  # sx
            else:
                query.addSelect(
                    "CASE WHEN result_flag IN ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') THEN 0 ELSE 1 END AS component_normal"
                )
            '''
            Relevant tables. Note that sor table does not have patient_id info; need to join sop to obtain it.  
            '''
            query.addFrom('stride_order_proc AS sop')
            query.addFrom('stride_order_results AS sor')
            query.addWhere('sop.order_proc_id = sor.order_proc_id')
            '''
            Condition: self._time_limit[0] < order_time < self._time_limit[1]
            '''
            if self._time_limit:
                if self._time_limit[0]:
                    query.addWhere("sop.order_time > '%s'" %
                                   self._time_limit[0])
                if self._time_limit[1]:
                    query.addWhere("sop.order_time < '%s'" %
                                   self._time_limit[1])

            query.addWhere(
                "(result_flag in ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') OR result_flag IS NULL)"
            )
            query.addWhereIn(self._varTypeInTable, [self._lab_var])  # sx
            query.addWhereIn("pat_id", random_patient_list)

            query.addGroupBy('pat_id')
            query.addGroupBy('sop.order_proc_id')
            query.addGroupBy(self._varTypeInTable)
            query.addGroupBy('order_time')
            if not self._isLabPanel:
                query.addGroupBy('result_flag')
            query.addGroupBy('abnormal_yn')  #

            query.addOrderBy('pat_id')
            query.addOrderBy('sop.order_proc_id')
            query.addOrderBy(self._varTypeInTable)
            query.addOrderBy('order_time')

            query.setLimit(self._num_requested_episodes)

            self._num_reported_episodes = FeatureMatrix._query_patient_episodes(
                self, query, index_time_col='order_time')

        else:
            '''
            Sqlite3 has an interesting limit for the total number of place_holders in a query, 
            and this limit varies across platforms/operating systems (500-99999 on mac, 999 by defaulty). 
            
            To avoid this problem when querying 1000-10000 patient ids, use string queries instead of the
            default (convenient) routine in DBUtil.  
            '''

            query_str = "SELECT CAST(pat_id AS BIGINT) AS pat_id, order_proc_id, %s, order_time, " % self._varTypeInTable
            if not self._isLabPanel:
                query_str += "CASE WHEN result_in_range_yn = 'Y' THEN 1 ELSE 0 END AS component_normal "
            else:
                query_str += "CAST(SUM(CASE WHEN result_in_range_yn != 'Y' THEN 1 ELSE 0 END) = 0 AS INT) AS all_components_normal "
            query_str += "FROM labs "
            query_str += "WHERE %s = '%s' " % (self._varTypeInTable,
                                               self._lab_var)
            query_str += "AND pat_id IN "
            pat_list_str = "("
            for pat_id in random_patient_list:
                pat_list_str += str(pat_id) + ","
            pat_list_str = pat_list_str[:-1] + ") "
            query_str += pat_list_str
            query_str += "GROUP BY pat_id, order_proc_id, %s, order_time " % self._varTypeInTable
            query_str += "ORDER BY pat_id, order_proc_id, %s, order_time " % self._varTypeInTable
            query_str += "LIMIT %d" % self._num_requested_episodes

            self._num_reported_episodes = FeatureMatrix._query_patient_episodes(
                self, query_str, index_time_col='order_time')
Esempio n. 4
0
    def _query_patient_episodes(self, pat_id_col="pat_deid"):
        log.info('Querying patient episodes...')
        # Initialize DB cursor.
        cursor = self._connection.cursor()

        # Build parameters for query.
        self._lab_components = self._get_components_in_lab_panel()
        random_patient_list = self._get_random_patient_list()

        print("RANDOM PATIENT LIST", random_patient_list)

        # Build SQL query for list of patient episodes.
        # Note that for 2008-2014 data, result_flag can take on any of the
        # following values:  High, Low, High Panic, Low Panic,
        # Low Off-Scale, Negative, Positive, Resistant, Susceptible, Abnormal, *
        # (NONE): Only 27 lab components can have this flag. None has this
        #           value for more than 5 results, so ignore it.
        # *: Only 10 lab components can have this flag. Only 6 have it for
        #           >10 tests, and each of them is a microbiology test for which
        #           a flag is less meaningful, e.g. Gram Stain, AFB culture.
        # Susceptible: Only 15 lab components can have this flag. All of those
        #           only have this value for 2 results, so ignore it.
        # Resistant: Only 1 lab component can have this flag. Only two results
        #           have this value, so ignore it.
        # Abnormal: 1462 lab components can have this flag. Many (e.g. UBLOOD)
        #           have this value for thousands of results, so include it.
        # Negative: Only 10 lab components can have this flag, and all for
        #           less than 5 results, so ignore it.
        # Positive: Only 3 lab components can have this flag, and all for
        #           only 1 result, so ignore it.
        # Low Off-Scale: Only 1 lab component can have this flag, and only for
        #           3 results, so ignore it.
        # Low Panic: 1401 lab components can have this flag, many core
        #           metabolic components. Include it.
        # High Panic: 8084 lab components can have this flag, many core
        #           metabolic components. Include it.
        query = SQLQuery()
        #query.addSelect('CAST(micro.pat_deid AS BIGINT)')
        query.addSelect('CAST(micro.order_time AS TIMESTAMP)')
        #query.addSelect('sop.order_proc_id AS order_proc_id')
        query.addSelect('proc_code')
        query.addSelect('order_time')
        query.addSelect('organism_name') #one for the result
        # query.addSelect("CASE WHEN abnormal_yn = 'Y' THEN 1 ELSE 0 END AS abnormal_panel")
        # query.addSelect("SUM(CASE WHEN result_flag IN ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') OR result_flag IS NULL THEN 1 ELSE 0 END) AS num_components")
        # query.addSelect("SUM(CASE WHEN result_flag IS NULL THEN 1 ELSE 0 END) AS num_normal_components")
        # query.addSelect("CAST(SUM(CASE WHEN result_flag IN ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') THEN 1 ELSE 0 END) = 0 AS INT) AS all_components_normal")
        # query.addFrom('stride_order_proc AS sop')
        # query.addFrom('stride_order_results AS sor')
        query.addFrom('micro')
        #query.addWhere('sop.order_proc_id = sor.order_proc_id')
        #query.addWhere("(result_flag in ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') OR result_flag IS NULL)")
        #query.addWhereIn("micro.proc_code", [self._lab_panel])
        query.addWhereIn("micro.pat_deid", random_patient_list)
        query.addGroupBy('micro.pat_deid')
        #query.addGroupBy('sop.order_proc_id')
        query.addGroupBy('proc_code')
        query.addGroupBy('order_time')
        query.addGroupBy('organism_name')
        #query.addGroupBy('abnormal_yn')
        query.addOrderBy('micro.pat_deid')
        #query.addOrderBy('sop.order_proc_id')
        query.addOrderBy('proc_code')
        query.addOrderBy('micro.order_time')
        query.setLimit(self._num_requested_episodes)

        self._num_reported_episodes = FeatureMatrix._query_patient_episodes(self, query, index_time_col='order_time')
Esempio n. 5
0
    def _query_patient_episodes(self):
        log.info('Querying patient episodes...')
        # Initialize DB cursor.
        cursor = self._connection.cursor()

        # Build parameters for query.
        self._lab_components = self._get_components_in_lab_panel()
        random_patient_list = self._get_random_patient_list()

        #print("RANDOM PATIENT LIST", random_patient_list)

        # Build SQL query for list of patient episodes.
        # Note that for 2008-2014 data, result_flag can take on any of the
        # following values:  High, Low, High Panic, Low Panic,
        # Low Off-Scale, Negative, Positive, Resistant, Susceptible, Abnormal, *
        # (NONE): Only 27 lab components can have this flag. None has this
        #           value for more than 5 results, so ignore it.
        # *: Only 10 lab components can have this flag. Only 6 have it for
        #           >10 tests, and each of them is a microbiology test for which
        #           a flag is less meaningful, e.g. Gram Stain, AFB culture.
        # Susceptible: Only 15 lab components can have this flag. All of those
        #           only have this value for 2 results, so ignore it.
        # Resistant: Only 1 lab component can have this flag. Only two results
        #           have this value, so ignore it.
        # Abnormal: 1462 lab components can have this flag. Many (e.g. UBLOOD)
        #           have this value for thousands of results, so include it.
        # Negative: Only 10 lab components can have this flag, and all for
        #           less than 5 results, so ignore it.
        # Positive: Only 3 lab components can have this flag, and all for
        #           only 1 result, so ignore it.
        # Low Off-Scale: Only 1 lab component can have this flag, and only for
        #           3 results, so ignore it.
        # Low Panic: 1401 lab components can have this flag, many core
        #           metabolic components. Include it.
        # High Panic: 8084 lab components can have this flag, many core
        #           metabolic components. Include it.
        query = SQLQuery()
        query.addSelect('CAST(pat_anon_id AS BIGINT) AS pat_id')
        query.addSelect('CAST(shifted_order_time AS TIMESTAMP)')
        query.addSelect('stride_culture_micro.proc_code')
        query.addSelect('organism_name')  #one for the result

        # Experimenting
        susceptibility_flags = [
            'Trimethoprim/Sulfamethoxazole', 'Vancomycin', 'Penicillin',
            'Levofloxacin', 'Clindamycin', 'Ceftriaxone', 'Erythromycin',
            'Ampicillin', 'Meropenem', 'Ciprofloxacin', 'Cefepime',
            'Aztreonam.', 'Ampicillin/Sulbactam', 'Piperacillin/Tazobactam',
            'Linezolid', 'Oxacillin.', 'Cefazolin', 'Daptomycin'
        ]

        for med in susceptibility_flags:
            med_col = med.replace('/', '_').replace('.', '')
            query.addSelect(
                "MAX(CASE WHEN antibiotic_name = '%s' AND (suseptibility = 'Susceptible' OR suseptibility = 'Positive') THEN 1 ELSE 0 END) AS %s_Susc"
                % (med, med_col))
            query.addSelect(
                "MAX(CASE WHEN antibiotic_name = '%s' THEN 1 ELSE 0 END) as %s_tested"
                % (med, med_col))

        # Let us look at top 10 commonly occuring bacteria
        query.addSelect(
            "CASE WHEN organism_name IS NULL THEN 0 ELSE 1 END AS BACTERIA_PRESENT"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'ESCHERICHIA COLI' THEN 1 ELSE 0 END AS ESCHERICHIA_COLI"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'STAPHYLOCOCCUS AUREUS' THEN 1 ELSE 0 END AS STAPHYLOCOCCUS_AUREUS"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'ENTEROCOCCUS SPECIES' THEN 1 ELSE 0 END AS ENTEROCOCCUS_SPECIES"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'KLEBSIELLA PNEUMONIAE' THEN 1 ELSE 0 END AS KLEBSIELLA_PNEUMONIAE"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'PSEUDOMONAS AERUGINOSA' THEN 1 ELSE 0 END AS PSEUDOMONAS_AERUGINOSA"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'COAG NEGATIVE STAPHYLOCOCCUS' THEN 1 ELSE 0 END AS COAG_NEGATIVE_STAPHYLOCOCCUS"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'ENTEROCOCCUS FAECALIS' THEN 1 ELSE 0 END AS ENTEROCOCCUS_FAECALIS"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'PROTEUS MIRABILIS' THEN 1 ELSE 0 END AS PROTEUS_MIRABILIS"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'CANDIDA ALBICANS' THEN 1 ELSE 0 END AS CANDIDA_ALBICANS"
        )

        query.addFrom('stride_culture_micro')

        query.addWhereIn("stride_culture_micro.proc_code", self._lab_panel)
        query.addWhereIn("pat_anon_id", random_patient_list)
        query.addGroupBy('pat_anon_id')
        query.addGroupBy('shifted_order_time')
        query.addGroupBy('stride_culture_micro.proc_code')
        query.addGroupBy('organism_name')
        query.addOrderBy('pat_anon_id')
        query.addOrderBy('shifted_order_time')
        query.addOrderBy('stride_culture_micro.proc_code')
        query.addOrderBy('organism_name')
        query.setLimit(self._num_requested_episodes)

        self._num_reported_episodes = FeatureMatrix._query_patient_episodes(
            self,
            query,
            pat_id_col='pat_id',
            index_time_col='shifted_order_time')
Esempio n. 6
0
    def _query_patient_episodes(self):
        log.info('Querying patient episodes...')
        # Initialize DB cursor.
        cursor = self._connection.cursor()

        # Build parameters for query.
        self._lab_components = self._get_components_in_lab_panel()
        random_patient_list = self._get_random_patient_list()

        #print("RANDOM PATIENT LIST", random_patient_list)

        # Build SQL query for list of patient episodes.
        # Note that for 2008-2014 data, result_flag can take on any of the
        # following values:  High, Low, High Panic, Low Panic,
        # Low Off-Scale, Negative, Positive, Resistant, Susceptible, Abnormal, *
        # (NONE): Only 27 lab components can have this flag. None has this
        #           value for more than 5 results, so ignore it.
        # *: Only 10 lab components can have this flag. Only 6 have it for
        #           >10 tests, and each of them is a microbiology test for which
        #           a flag is less meaningful, e.g. Gram Stain, AFB culture.
        # Susceptible: Only 15 lab components can have this flag. All of those
        #           only have this value for 2 results, so ignore it.
        # Resistant: Only 1 lab component can have this flag. Only two results
        #           have this value, so ignore it.
        # Abnormal: 1462 lab components can have this flag. Many (e.g. UBLOOD)
        #           have this value for thousands of results, so include it.
        # Negative: Only 10 lab components can have this flag, and all for
        #           less than 5 results, so ignore it.
        # Positive: Only 3 lab components can have this flag, and all for
        #           only 1 result, so ignore it.
        # Low Off-Scale: Only 1 lab component can have this flag, and only for
        #           3 results, so ignore it.
        # Low Panic: 1401 lab components can have this flag, many core
        #           metabolic components. Include it.
        # High Panic: 8084 lab components can have this flag, many core
        #           metabolic components. Include it.
        query = SQLQuery()
        query.addSelect('CAST(pat_anon_id AS BIGINT) AS pat_id')
        query.addSelect('CAST(shifted_order_time AS TIMESTAMP)')
        query.addSelect('stride_culture_micro.proc_code')
        query.addSelect('organism_name')  #one for the result

        # Let us look at top 10 commonly occuring bacteria
        query.addSelect(
            "CASE WHEN organism_name IS NULL THEN 1 ELSE 0 END AS NO_BACTERIA")
        query.addSelect(
            "CASE WHEN organism_name = 'ESCHERICHIA COLI' THEN 1 ELSE 0 END AS ESCHERICHIA_COLI"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'STAPHYLOCOCCUS AUREUS' THEN 1 ELSE 0 END AS STAPHYLOCOCCUS_AUREUS"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'ENTEROCOCCUS SPECIES' THEN 1 ELSE 0 END AS ENTEROCOCCUS_SPECIES"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'KLEBSIELLA PNEUMONIAE' THEN 1 ELSE 0 END AS KLEBSIELLA_PNEUMONIAE"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'PSEUDOMONAS AERUGINOSA' THEN 1 ELSE 0 END AS PSEUDOMONAS_AERUGINOSA"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'COAG NEGATIVE STAPHYLOCOCCUS' THEN 1 ELSE 0 END AS COAG_NEGATIVE_STAPHYLOCOCCUS"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'ENTEROCOCCUS FAECALIS' THEN 1 ELSE 0 END AS ENTEROCOCCUS_FAECALIS"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'PROTEUS MIRABILIS' THEN 1 ELSE 0 END AS PROTEUS_MIRABILIS"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'CANDIDA ALBICANS' THEN 1 ELSE 0 END AS CANDIDA_ALBICANS"
        )

        query.addFrom('stride_culture_micro')

        query.addWhereIn("stride_culture_micro.proc_code", [self._lab_panel])
        query.addWhereIn("pat_id", random_patient_list)
        query.addGroupBy('pat_id')
        query.addGroupBy('shifted_order_time')
        query.addGroupBy('stride_culture_micro.proc_code')
        query.addGroupBy('organism_name')
        query.addOrderBy('pat_id')
        query.addOrderBy('shifted_order_time')
        query.addOrderBy('stride_culture_micro.proc_code')
        query.addOrderBy('organism_name')
        query.setLimit(self._num_requested_episodes)

        self._num_reported_episodes = FeatureMatrix._query_patient_episodes(
            self,
            query,
            pat_id_col='pat_id',
            index_time_col='shifted_order_time')
Esempio n. 7
0
    def _query_patient_episodes(self):
        log.info('Querying patient episodes...')
        # Initialize DB cursor.
        cursor = self._connection.cursor()

        # Build parameters for query.
        self._lab_components = self._get_components_in_lab_panel()
        random_patient_list = self._get_random_patient_list()
        self.usedPatIds = random_patient_list[:]

        # Build SQL query for list of patient episodes.
        # Note that for 2008-2014 data, result_flag can take on any of the
        # following values:  High, Low, High Panic, Low Panic,
        # Low Off-Scale, Negative, Positive, Resistant, Susceptible, Abnormal, *
        # (NONE): Only 27 lab components can have this flag. None has this
        #           value for more than 5 results, so ignore it.
        # *: Only 10 lab components can have this flag. Only 6 have it for
        #           >10 tests, and each of them is a microbiology test for which
        #           a flag is less meaningful, e.g. Gram Stain, AFB culture.
        # Susceptible: Only 15 lab components can have this flag. All of those
        #           only have this value for 2 results, so ignore it.
        # Resistant: Only 1 lab component can have this flag. Only two results
        #           have this value, so ignore it.
        # Abnormal: 1462 lab components can have this flag. Many (e.g. UBLOOD)
        #           have this value for thousands of results, so include it.
        # Negative: Only 10 lab components can have this flag, and all for
        #           less than 5 results, so ignore it.
        # Positive: Only 3 lab components can have this flag, and all for
        #           only 1 result, so ignore it.
        # Low Off-Scale: Only 1 lab component can have this flag, and only for
        #           3 results, so ignore it.
        # Low Panic: 1401 lab components can have this flag, many core
        #           metabolic components. Include it.
        # High Panic: 8084 lab components can have this flag, many core
        #           metabolic components. Include it.

        if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':  # TODO: component
            query = SQLQuery()
            query.addSelect('CAST(pat_id AS BIGINT)')
            query.addSelect('sop.order_proc_id AS order_proc_id')
            query.addSelect('proc_code')  #TODO:sx
            query.addSelect('order_time')
            query.addSelect(
                "CASE WHEN abnormal_yn = 'Y' THEN 1 ELSE 0 END AS abnormal_panel"
            )  #sx
            query.addSelect(
                "SUM(CASE WHEN result_flag IN ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') OR result_flag IS NULL THEN 1 ELSE 0 END) AS num_components"
            )  #sx
            query.addSelect(
                "SUM(CASE WHEN result_flag IS NULL THEN 1 ELSE 0 END) AS num_normal_components"
            )  #sx
            query.addSelect(
                "CAST(SUM(CASE WHEN result_flag IN ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') THEN 1 ELSE 0 END) = 0 AS INT) AS all_components_normal"
            )  #sx
            query.addFrom('stride_order_proc AS sop')  #sx
            query.addFrom('stride_order_results AS sor')  #sx
            query.addWhere('sop.order_proc_id = sor.order_proc_id')  #sx
            query.addWhere(
                "(result_flag in ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') OR result_flag IS NULL)"
            )
            query.addWhereIn("proc_code", [self._lab_var])  #sx
            query.addWhereIn("pat_id", random_patient_list)
            query.addGroupBy('pat_id')
            query.addGroupBy('sop.order_proc_id')  #sx
            query.addGroupBy('proc_code')  #sx
            query.addGroupBy('order_time')
            query.addGroupBy('abnormal_yn')  #sx
            query.addOrderBy('pat_id')
            query.addOrderBy('sop.order_proc_id')  #sx
            query.addOrderBy('proc_code')  #sx
            query.addOrderBy('order_time')
            query.setLimit(self._num_requested_episodes)

            self._num_reported_episodes = FeatureMatrix._query_patient_episodes(
                self, query, index_time_col='order_time')

        elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':
            # query = SQLQuery()
            # query.addSelect('CAST(pat_id AS BIGINT) AS pat_id')
            # query.addSelect('order_proc_id')
            # query.addSelect(self._varTypeInTable)
            # query.addSelect('order_time')
            #
            # query.addWhereIn(self._varTypeInTable, [self._lab_var])
            #
            # if self._isLabPanel:
            #
            #     query.addSelect("SUM(CASE WHEN result_in_range_yn IN ('N', 'Y') THEN 1 ELSE 0 END) AS num_components")
            #     query.addSelect("SUM(CASE WHEN result_in_range_yn = 'Y' THEN 1 ELSE 0 END) AS num_normal_components")
            #     query.addSelect("CAST(SUM(CASE WHEN result_in_range_yn = 'N' THEN 1 ELSE 0 END) = 0 AS INT) AS all_components_normal") #TODO
            # else:
            #     query.addSelect("CASE WHEN result_in_range_yn = 'Y' THEN 1 ELSE 0 END AS component_normal")
            #
            # query.addFrom('labs')
            # query.addWhereIn("pat_id", random_patient_list)
            #
            # query.addGroupBy('pat_id')
            # query.addGroupBy('order_proc_id')
            # query.addGroupBy(self._varTypeInTable)
            # query.addGroupBy('order_time')
            #
            # query.addOrderBy('pat_id')
            # query.addOrderBy('order_proc_id')
            # query.addOrderBy(self._varTypeInTable)
            # query.addOrderBy('order_time')
            #
            # query.setLimit(self._num_requested_episodes)
            #
            # print query

            query_str = "SELECT CAST(pat_id AS BIGINT) AS pat_id, order_proc_id, %s, order_time, " % self._varTypeInTable
            if not self._isLabPanel:
                query_str += "CASE WHEN result_in_range_yn = 'Y' THEN 1 ELSE 0 END AS component_normal "
            else:
                query_str += "CAST(SUM(CASE WHEN result_in_range_yn != 'Y' THEN 1 ELSE 0 END) = 0 AS INT) AS all_components_normal "
            # query_str += "CASE WHEN result_in_range_yn = 'Y' THEN 1 ELSE 0 END AS component_normal "
            query_str += "FROM labs "
            query_str += "WHERE %s = '%s' " % (self._varTypeInTable,
                                               self._lab_var)
            query_str += "AND pat_id IN "
            pat_list_str = "("
            for pat_id in random_patient_list:
                pat_list_str += str(pat_id) + ","
            pat_list_str = pat_list_str[:-1] + ") "
            query_str += pat_list_str
            query_str += "GROUP BY pat_id, order_proc_id, %s, order_time " % self._varTypeInTable
            query_str += "ORDER BY pat_id, order_proc_id, %s, order_time " % self._varTypeInTable
            query_str += "LIMIT %d" % self._num_requested_episodes

            self._num_reported_episodes = FeatureMatrix._querystr_patient_episodes(
                self, query_str, index_time_col='order_time')
Esempio n. 8
0
    def _get_random_patient_list(self):
        # Initialize DB cursor.
        cursor = self._connection.cursor()

        if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':
            # Get average number of results for this lab test per patient.
            avg_orders_per_patient = self._get_average_orders_per_patient()
            log.info('avg_orders_per_patient: %s' % avg_orders_per_patient)
            # Based on average # of results, figure out how many patients we'd
            # need to get for a feature matrix of requested size.
            self._num_patients = int(numpy.max([self._num_requested_episodes / \
                avg_orders_per_patient, 1]))

            # Get numPatientsToQuery random patients who have gotten test.
            # TODO(sbala): Have option to feed in a seed for the randomness.
            query = SQLQuery()
            query.addSelect('CAST(pat_id AS BIGINT) AS pat_id')
            query.addFrom('stride_order_proc AS sop')
            query.addWhereIn('proc_code', [self._lab_var])  # TODO: components
            query.addOrderBy('RANDOM()')
            query.setLimit(self._num_patients)
            log.debug('Querying random patient list...')
            results = DBUtil.execute(query)

            # Get patient list.
            random_patient_list = [row[0] for row in results]

            return random_patient_list

        elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':

            # Get average number of results for this lab test per patient.
            # query = SQLQuery()
            # query.addSelect('CAST(pat_id AS BIGINT) AS pat_id')
            # query.addSelect('COUNT(order_proc_id) AS num_orders')
            # query.addFrom('labs')
            # query.addWhereIn(self._varTypeInTable, [self._lab_var])
            # components = self._get_components_in_lab_panel()
            # query.addWhereIn("base_name", components)
            #
            # if self.notUsePatIds:
            #     query.addWhereNotIn("pat_id", self.notUsePatIds)
            #
            # query.addGroupBy('pat_id')

            query_str = "SELECT CAST(pat_id AS BIGINT) AS pat_id , "
            query_str += "COUNT(order_proc_id) AS num_orders "
            query_str += "FROM labs "
            #query_str += " WHERE %s IN (%s) "%(self._varTypeInTable, self._lab_var)
            query_str += "WHERE %s = '%s' " % (self._varTypeInTable,
                                               self._lab_var)
            if self.notUsePatIds:
                query_str += "AND pat_id NOT IN ("
                for pat_id in self.notUsePatIds:
                    query_str += "%s," % pat_id
                query_str = query_str[:-1] + ") "  # get rid of comma
            query_str += "GROUP BY pat_id"

            log.debug('Querying median orders per patient...')

            results = DBUtil.execute(query_str)
            order_counts = [row[1] for row in results]

            if len(results) == 0:
                error_msg = '0 orders for order "%s."' % self._lab_var  #sx
                log.critical(error_msg)
                raise Exception(error_msg)
                # sys.exit('[ERROR] %s' % error_msg) # sxu: sys.exit cannot be caught by Exception
            else:
                avg_orders_per_patient = numpy.median(order_counts)
                log.info('avg_orders_per_patient: %s' % avg_orders_per_patient)
                # Based on average # of results, figure out how many patients we'd
                # need to get for a feature matrix of requested size.
                self._num_patients = int(numpy.max([self._num_requested_episodes / \
                    avg_orders_per_patient, 1]))
                # Some components may have fewer associated patients than the required sample size
                patient_number_chosen = min([len(results),
                                             self._num_patients])  #
                inds_random_patients = numpy.random.choice(
                    len(results), size=patient_number_chosen, replace=False)
                # print 'inds_random_patients:', inds_random_patients
                pat_IDs_random_patients = []
                for ind in inds_random_patients:
                    pat_IDs_random_patients.append(results[ind][0])
                return pat_IDs_random_patients