Beispiel #1
0
    def queryItems(self, options, outputFile):
        """Query for all clinical item records that fulfill the options criteria
        and then send the results as tab-delimited output to the outputFile.
        """
        pauseSeconds = float(options.pauseSeconds)

        query = SQLQuery()
        query.addSelect("count(order_med_id_coded) as nOrders")
        query.addSelect("om.med_route, om.medication_id, om.med_description")
        query.addFrom("starr_datalake2018.order_med as om")
        if options.descriptionPrefix:
            query.addWhereOp("om.med_description", "like",
                             options.descriptionPrefix + "%%")
            # Add wildcard to enabe prefix search
        if options.medRoutes:
            query.addWhereIn("om.med_route", options.medRoutes.split(","))
        query.addGroupBy("om.medication_id, om.med_description, om.med_route")
        query.addOrderBy("nOrders desc, om.med_description")

        formatter = TextResultsFormatter(outputFile)

        prog = ProgressDots()
        for row in DBUtil.execute(query,
                                  includeColumnNames=True,
                                  connFactory=self.connFactory):
            formatter.formatTuple(row)
            time.sleep(pauseSeconds)
            prog.update()
        prog.printStatus()
Beispiel #2
0
    def _get_random_patient_list(self):
        # Initialize DB cursor.
        cursor = self._connection.cursor()

        # Get average number of results for this lab test per patient.
        avg_orders_per_patient = self._get_average_orders_per_patient()
        log.info('avg_orders_per_patient: %s' % avg_orders_per_patient)
        # Based on average # of results, figure out how many patients we'd
        # need to get for a feature matrix of requested size.
        self._num_patients = int(numpy.max([self._num_requested_episodes / \
            avg_orders_per_patient, 1]))

        # Get numPatientsToQuery random patients who have gotten test.
        # TODO(sbala): Have option to feed in a seed for the randomness.
        query = SQLQuery()
        query.addSelect('pat_id')
        query.addFrom('stride_order_proc AS sop')
        query.addWhereIn('proc_code', self._lab_panel)
        query.addGroupBy('pat_id')  # this should be a unique list of patients
        query.addOrderBy('RANDOM()')
        query.setLimit(self._num_patients)
        log.debug('Querying random patient list...')
        results = DBUtil.execute(query)

        # Get patient list.
        random_patient_list = [row[0] for row in results]

        return random_patient_list
Beispiel #3
0
    def _get_average_orders_per_patient(self):
        # Initialize DB cursor.
        cursor = self._connection.cursor()

        # Get average number of results for this lab test per patient.
        query = SQLQuery()
        if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':  #TODO: add STRIDE component routine
            query.addSelect('CAST(pat_id AS BIGINT) AS pat_id')
            query.addSelect('COUNT(sop.order_proc_id) AS num_orders')
            query.addFrom('stride_order_proc AS sop')
            query.addFrom('stride_order_results AS sor')
            query.addWhere('sop.order_proc_id = sor.order_proc_id')
            query.addWhereIn("proc_code", [self._lab_panel])
            components = self._get_components_in_lab_panel()
            query.addWhereIn("base_name", components)
            query.addGroupBy('pat_id')

        elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':
            query.addSelect('CAST(pat_id AS BIGINT) AS pat_id')
            query.addSelect('COUNT(order_proc_id) AS num_orders')
            query.addFrom('labs')
            query.addWhereIn(self._varTypeInTable, [self._lab_var])
            components = self._get_components_in_lab_panel()
            query.addWhereIn("base_name", components)
            query.addGroupBy('pat_id')
        log.debug('Querying median orders per patient...')
        results = DBUtil.execute(query)
        order_counts = [row[1] for row in results]
        if len(order_counts) == 0:
            error_msg = '0 orders for lab "%s."' % self._lab_var
            log.critical(error_msg)
            raise Exception(error_msg)
            # sys.exit('[ERROR] %s' % error_msg) # sxu: sys.exit cannot be caught by Exception
        else:
            return numpy.median(order_counts)
Beispiel #4
0
    def _get_components_in_lab_panel(self):
        # Initialize DB connection.
        cursor = self._connection.cursor()

        # Doing a single query results in a sequential scan through
        # stride_order_results. To avoid this, break up the query in two.

        # First, get all the order_proc_ids for proc_code.

        query = SQLQuery()
        query.addSelect('order_proc_id')
        query.addFrom('stride_order_proc')
        query.addWhereIn('proc_code', [self._lab_panel])
        query.addGroupBy('order_proc_id')
        log.debug('Querying order_proc_ids for %s...' % self._lab_panel)
        results = DBUtil.execute(query)
        lab_order_ids = [row[0] for row in results]

        # Second, get all base_names from those orders.
        query = SQLQuery()
        query.addSelect('base_name')
        query.addFrom('stride_order_results')
        query.addWhereIn('order_proc_id', lab_order_ids)
        query.addGroupBy('base_name')
        log.debug('Querying base_names for order_proc_ids...')
        results = DBUtil.execute(query)
        components = [row[0] for row in results]

        return components
Beispiel #5
0
def queryPatients(period, locations, rxCount):
    log.info(
        "Select patients fitting criteria in designated time period: (%s,%s)" %
        period)

    query = SQLQuery()
    query.addSelect("med.pat_id")
    query.addSelect("count(order_med_id)")
    query.addFrom("stride_mapped_meds as map")
    query.addFrom("stride_order_med as med")
    query.addFrom("stride_patient as pat")
    query.addWhere("analysis_status = 1")
    query.addWhere("map.medication_id = med.medication_id")
    query.addWhere("med.pat_id = pat.pat_id")
    query.addWhere("possible_oncology = 0")
    query.addWhereIn("patient_location", locations)
    query.addWhereOp("ordering_datetime", ">", period[0])
    query.addWhereOp("ordering_datetime", "<", period[-1])
    query.addGroupBy("med.pat_id")
    query.addHaving("count(order_med_id) >2")

    results = DBUtil.execute(query)
    cols = ["patientId", "nOpioidRx"]
    patientDF = pd.DataFrame(results, columns=cols)
    #patientDF.set_index("patientId",drop=False,inplace=True);

    patientDF["periodStart"] = period[0]
    # Identify this group of patient records

    return patientDF
    def test_buildFeatureMatrix_prePostFeatures(self):
        """
        Test features parameter in addClinicalItemFeatures which allows
        client to specify they only want .pre* or .post* columns in feature
        matrix.
        """
        # Verify FeatureMatrixFactory throws Error if patientEpisodeInput
        # has not been set.
        with self.assertRaises(ValueError):
            self.factory.processPatientEpisodeInput()

        # Initialize DB cursor.
        cursor = self.connection.cursor()

        # Build SQL query for list of patient episodes.
        patientEpisodeQuery = SQLQuery()
        patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)")
        patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id")
        patientEpisodeQuery.addSelect("proc_code")
        patientEpisodeQuery.addSelect("order_time")
        patientEpisodeQuery.addSelect(
            "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results"
        )
        patientEpisodeQuery.addFrom("stride_order_proc AS sop")
        patientEpisodeQuery.addFrom("stride_order_results AS sor")
        patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id")
        patientEpisodeQuery.addWhereEqual("proc_code", "LABMETB")
        patientEpisodeQuery.addGroupBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        patientEpisodeQuery.addOrderBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params)

        # Set and process patientEpisodeInput.
        self.factory.setPatientEpisodeInput(cursor, "pat_id", "order_time")
        self.factory.processPatientEpisodeInput()
        resultEpisodeIterator = self.factory.getPatientEpisodeIterator()
        resultPatientEpisodes = list()
        for episode in resultEpisodeIterator:
            episode["pat_id"] = int(episode["pat_id"])
            episode["order_time"] = DBUtil.parseDateValue(
                episode["order_time"])
            resultPatientEpisodes.append(episode)

        # Add TestItem100 and TestItem200 clinical item data.
        self.factory.addClinicalItemFeatures(["TestItem100"], features="pre")
        self.factory.addClinicalItemFeatures(["TestItem200"], features="post")
        self.factory.buildFeatureMatrix()
        resultMatrix = self.factory.readFeatureMatrixFile()
        expectedMatrix = FM_TEST_OUTPUT[
            "test_buildFeatureMatrix_prePostFeatures"]

        self.assertEqualList(resultMatrix[2:], expectedMatrix)
    def test_addTimeCycleFeatures(self):
        """
        Test .addTimeCycleFeatures()
        """
        # Initialize DB cursor.
        cursor = self.connection.cursor()

        # Build SQL query for list of patient episodes.
        patientEpisodeQuery = SQLQuery()
        patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)")
        patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id")
        patientEpisodeQuery.addSelect("proc_code")
        patientEpisodeQuery.addSelect("order_time")
        patientEpisodeQuery.addSelect(
            "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results"
        )
        patientEpisodeQuery.addFrom("stride_order_proc AS sop")
        patientEpisodeQuery.addFrom("stride_order_results AS sor")
        patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id")
        patientEpisodeQuery.addWhereEqual("proc_code", "LABMETB")
        patientEpisodeQuery.addGroupBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        patientEpisodeQuery.addOrderBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params)

        # Set and process patientEpisodeInput.
        self.factory.setPatientEpisodeInput(cursor, "pat_id", "order_time")
        self.factory.processPatientEpisodeInput()

        # Add time cycle features.
        self.factory.addTimeCycleFeatures("order_time", "month")
        self.factory.addTimeCycleFeatures("order_time", "hour")

        # Verify output.
        self.factory.buildFeatureMatrix()
        resultMatrix = self.factory.readFeatureMatrixFile()
        expectedMatrix = FM_TEST_OUTPUT["test_addTimeCycleFeatures"][
            "expectedMatrix"]
        self.assertEqualTable(expectedMatrix, resultMatrix[2:], precision=5)

        # Clean up feature matrix.
        try:
            os.remove(self.factory.getMatrixFileName())
        except OSError:
            pass
Beispiel #8
0
	def _getClinicalItemCounts(self):
		query = SQLQuery()
		# SELECT
		query.addSelect(CLINICAL_ITEM_ID)
		query.addSelect('COUNT(' + CLINICAL_ITEM_ID + ') as total')
		# FROM
		query.addFrom('patient_item')
		# OTHER
		query.addGroupBy(CLINICAL_ITEM_ID)
		query.addOrderBy('total', dir='desc')

		print(query)
		print(query.getParams())
		DBUtil.runDBScript(self.SCRIPT_FILE, False)
		results = DBUtil.execute(str(query), query.getParams())

		pd.DataFrame(results, columns=[CLINICAL_ITEM_ID, 'count']).to_csv('data_summary_stats/item_counts.csv', index=False)
Beispiel #9
0
    def _getNonNullLabs(self):
        query = SQLQuery()
        # SELECT
        query.addSelect('proc_code')
        # FROM
        query.addFrom('stride_order_proc')
        # WHERE
        query.addWhereLike('proc_code', 'LAB%')
        query.addWhere('abnormal_yn is not null')

        query.addGroupBy('proc_code')
        query.addOrderBy('proc_code')

        results = DBUtil.execute(query)

        df = pd.DataFrame(results, columns=query.select).to_csv(
            DATA_FOLDER + 'proc_codes.csv', index=False)
Beispiel #10
0
def add_sim_case_column_to(csv):
    query = SQLQuery()
    query.addSelect("sim_patient_id")
    query.addSelect("sim_case_name as sim_case")
    query.addFrom("sim_grading_key sgk")
    query.addJoin(
        "sim_patient_order spo",
        "sgk.clinical_item_id = spo.clinical_item_id and spo.sim_state_id = sgk.sim_state_id"
    )
    query.addGroupBy("sim_patient_id")
    query.addGroupBy("sim_case_name")
    case_names = DBUtil.execute(str(query))

    # merge sim_case column
    csv = pd.merge(csv,
                   pd.DataFrame(case_names,
                                columns=['sim_patient_id', 'sim_case']),
                   left_on='patient',
                   right_on='sim_patient_id')
    return csv
Beispiel #11
0
    def _get_random_patient_list(self):
        #sx: this function is for avoid RANDOM() on the database
        cursor = self._connection.cursor()

        # Get average number of results for this lab test per patient.
        query = SQLQuery()
        query.addSelect('pat_id')
        query.addSelect('COUNT(sop.order_proc_id) AS num_orders')
        query.addFrom('stride_order_proc AS sop')
        query.addFrom('stride_order_results AS sor')
        query.addWhere('sop.order_proc_id = sor.order_proc_id')
        ##
        query.addWhereIn("base_name", [self._component])
        query.addGroupBy('pat_id')
        log.debug('Querying median orders per patient...')

        results = DBUtil.execute(query)

        order_counts = [ row[1] for row in results ]


        if len(results) == 0:
            error_msg = '0 orders for component "%s."' % self._component #sx
            log.critical(error_msg)
            sys.exit('[ERROR] %s' % error_msg)
        else:
            avg_orders_per_patient = numpy.median(order_counts)
            log.info('avg_orders_per_patient: %s' % avg_orders_per_patient)
            # Based on average # of results, figure out how many patients we'd
            # need to get for a feature matrix of requested size.
            self._num_patients = int(numpy.max([self._num_requested_episodes / \
                avg_orders_per_patient, 1]))
            # Some components may have fewer associated patients than the required sample size
            patient_number_chosen = min([len(results),self._num_patients]) #
            inds_random_patients = numpy.random.choice(len(results), size=patient_number_chosen, replace=False)
            # print 'inds_random_patients:', inds_random_patients
            pat_IDs_random_patients = []
            for ind in inds_random_patients:
                pat_IDs_random_patients.append(results[ind][0])
            # print pat_IDs_random_patients
            return pat_IDs_random_patients
Beispiel #12
0
def queryDrugScreens( patientDF, period, locations ):
    log.info("Populate drug screens by primary locations");

    query = SQLQuery();
    query.addSelect("pat_id");
    query.addSelect("count(distinct order_proc_id)");
    query.addFrom("stride_order_proc_drug_screen");
    query.addWhere("ordering_mode = 'Outpatient'");
    query.addWhereIn("patient_location", locations );
    query.addWhereOp("ordering_date",">", period[0]);
    query.addWhereOp("ordering_date","<", period[-1]);
    query.addWhereIn("pat_id", patientDF["patientId"] );
    query.addGroupBy("pat_id");

    results = DBUtil.execute(query);
    cols = ["patientId","nDrugScreens"];
    newDF = pd.DataFrame(results,columns=cols);
    patientDF = patientDF.merge(newDF, how="left");
    patientDF["nDrugScreens"][np.isnan(patientDF["nDrugScreens"])] = 0;    # Populate default values if no data
    patientDF["nDrugScreens"] = patientDF["nDrugScreens"].astype("int");    # Beware of float conversion somewhere
    return patientDF;
Beispiel #13
0
    def _getNumPatientEncountersByMonth(self):
        # Build query.
        # SELECT
        #   CAST(EXTRACT(YEAR FROM noted_date) AS INT) AS admit_year,
        #   CAST(EXTRACT(MONTH FROM noted_date) AS INT) AS admit_month
        #   COUNT(DISTINCT pat_enc_csn_id) AS num_encounters,
        # FROM
        #   stride_dx_list
        # WHERE
        #   data_source = 'ADMIT_DX'
        # GROUP BY
        #   admit_year,
        #   admit_month
        # ORDER BY
        #   admit_year,
        #   admit_month
        query = SQLQuery()
        query.addSelect(
            "CAST(EXTRACT(YEAR FROM noted_date) AS INT) AS admit_year")
        query.addSelect(
            "CAST(EXTRACT(MONTH FROM noted_date) AS INT) AS admit_month")
        query.addSelect("COUNT(DISTINCT pat_enc_csn_id) AS num_encounters")
        query.addFrom("stride_dx_list")
        query.addWhereEqual("data_source", 'ADMIT_DX')
        query.addGroupBy("admit_year")
        query.addGroupBy("admit_month")
        query.addOrderBy("admit_year")
        query.addOrderBy("admit_month")

        # Execute query and return results.
        results = DBUtil.execute(query)
        encountersPerMonth = list()
        for row in results:
            admitYear = row[0]
            admitMonth = row[1]
            numEncounters = row[2]
            encountersPerMonth.append((admitYear, admitMonth, numEncounters))

        return encountersPerMonth
Beispiel #14
0
    def fetch_components_in_panel(lab_panel):
        # Doing a single query results in a sequential scan through
        # stride_order_results. To avoid this, break up the query in two.
        # First, get all the order_proc_ids for proc_code.
        query = SQLQuery()
        query.addSelect('order_proc_id')
        query.addFrom('stride_order_proc')
        query.addWhereIn('proc_code', [lab_panel])
        query.addGroupBy('order_proc_id')
        results = DBUtil.execute(query)
        lab_order_ids = [row[0] for row in results]

        # Second, get all base_names from those orders.
        query = SQLQuery()
        query.addSelect('base_name')
        query.addFrom('stride_order_results')
        query.addWhereIn('order_proc_id', lab_order_ids)
        query.addGroupBy('base_name')
        results = DBUtil.execute(query)
        components = [row[0] for row in results]

        return components
Beispiel #15
0
    def _get_average_orders_per_patient(self):
        # Initialize DB cursor.
        cursor = self._connection.cursor()

        # Get average number of results for this lab test per patient.
        query = SQLQuery()
        query.addSelect('pat_id')
        query.addSelect('COUNT(sop.order_proc_id) AS num_orders')
        query.addFrom('stride_order_proc AS sop')
        query.addFrom('stride_order_results AS sor')
        query.addWhere('sop.order_proc_id = sor.order_proc_id')
        query.addWhereIn("proc_code", [self._lab_panel])
        components = self._get_components_in_lab_panel()
        query.addWhereIn("base_name", components)
        query.addGroupBy('pat_id')
        log.debug('Querying median orders per patient...')
        results = DBUtil.execute(query)
        order_counts = [ row[1] for row in results ]
        if len(order_counts) == 0:
            error_msg = '0 orders for lab panel "%s."' % self._lab_panel
            log.critical(error_msg)
            sys.exit('[ERROR] %s' % error_msg)
        else:
            return numpy.median(order_counts)
Beispiel #16
0
    def _getAdmitDxPatientFrequencyRankByYear(self):
        # Get list of all clinical item IDs matching admit diagnosis.
        # Get this list in advance to make subsequent query run a bit faster.
        admitDxClinicalItemIds = self._getAdmitDxClinicalItemIds()

        # Build query for # of unique patients.
        # SELECT
        #   ci.name AS icd_code,
        #   ci.description AS admit_dx,
        #   EXTRACT(YEAR FROM pi.item_date) AS admit_year,
        #   COUNT(DISTINCT pi.patient_id) AS num_unique_patients,
        # FROM
        #   patient_item AS pi
        # JOIN
        #   clinical_item AS ci
        # ON
        #   pi.clinical_item_id = ci.clinical_item_id
        # WHERE
        #   ci.clinical_item_id in (admitDxClinicalItemIds)
        # GROUP BY
        #   icd_code,
        #   admit_dx,
        #   admit_year
        #   num_unique_patients
        # ORDER BY
        #   admit_year,
        #   num_unique_patients DESC
        query = SQLQuery()
        query.addSelect("ci.name AS icd_code")
        query.addSelect("ci.description AS admit_dx")
        query.addSelect("EXTRACT(YEAR FROM pi.item_date) AS admit_year")
        query.addSelect("COUNT(DISTINCT pi.patient_id) AS num_unique_patients")
        query.addFrom("patient_item AS pi")
        query.addJoin("clinical_item AS ci",
                      "pi.clinical_item_id = ci.clinical_item_id")
        query.addWhereIn("ci.clinical_item_id", admitDxClinicalItemIds)
        query.addGroupBy("icd_code")
        query.addGroupBy("admit_dx")
        query.addGroupBy("admit_year")
        query.addGroupBy("num_unique_patients")
        query.addOrderBy("icd_code")
        query.addOrderBy("admit_year")
        query.addOrderBy("num_unique_patients DESC")

        # Execute query.
        results = DBUtil.execute(query)
Beispiel #17
0
    def _get_components_in_lab_panel(self):
        if not self._isLabPanel:
            return [self._lab_var]

        # Initialize DB connection.
        cursor = self._connection.cursor()

        if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':
            # Doing a single query results in a sequential scan through
            # stride_order_results. To avoid this, break up the query in two.

            # First, get all the order_proc_ids for proc_code.

            query = SQLQuery()
            query.addSelect('order_proc_id')
            query.addFrom('stride_order_proc')
            query.addWhereIn('proc_code', [self._lab_var])
            query.addGroupBy('order_proc_id')
            log.debug('Querying order_proc_ids for %s...' % self._lab_var)
            results = DBUtil.execute(query)
            lab_order_ids = [row[0] for row in results]

            # Second, get all base_names from those orders.
            query = SQLQuery()
            query.addSelect('base_name')
            query.addFrom('stride_order_results')
            query.addWhereIn('order_proc_id', lab_order_ids)
            query.addGroupBy('base_name')

        else:
            # elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':
            query = SQLQuery()
            query.addSelect('base_name')
            query.addFrom('labs')
            query.addWhereIn('proc_code', [self._lab_var])
            query.addGroupBy('base_name')

        log.debug('Querying base_names for order_proc_ids...')
        results = DBUtil.execute(query)
        components = [row[0] for row in results]

        return components
Beispiel #18
0
    def _query_patient_episodes(self):
        log.info('Querying patient episodes...')
        # Initialize DB cursor.
        cursor = self._connection.cursor()

        # Build parameters for query.
        self._lab_components = self._get_components_in_lab_panel()
        random_patient_list = self._get_random_patient_list()

        #print("RANDOM PATIENT LIST", random_patient_list)

        # Build SQL query for list of patient episodes.
        # Note that for 2008-2014 data, result_flag can take on any of the
        # following values:  High, Low, High Panic, Low Panic,
        # Low Off-Scale, Negative, Positive, Resistant, Susceptible, Abnormal, *
        # (NONE): Only 27 lab components can have this flag. None has this
        #           value for more than 5 results, so ignore it.
        # *: Only 10 lab components can have this flag. Only 6 have it for
        #           >10 tests, and each of them is a microbiology test for which
        #           a flag is less meaningful, e.g. Gram Stain, AFB culture.
        # Susceptible: Only 15 lab components can have this flag. All of those
        #           only have this value for 2 results, so ignore it.
        # Resistant: Only 1 lab component can have this flag. Only two results
        #           have this value, so ignore it.
        # Abnormal: 1462 lab components can have this flag. Many (e.g. UBLOOD)
        #           have this value for thousands of results, so include it.
        # Negative: Only 10 lab components can have this flag, and all for
        #           less than 5 results, so ignore it.
        # Positive: Only 3 lab components can have this flag, and all for
        #           only 1 result, so ignore it.
        # Low Off-Scale: Only 1 lab component can have this flag, and only for
        #           3 results, so ignore it.
        # Low Panic: 1401 lab components can have this flag, many core
        #           metabolic components. Include it.
        # High Panic: 8084 lab components can have this flag, many core
        #           metabolic components. Include it.
        query = SQLQuery()
        query.addSelect('CAST(pat_anon_id AS BIGINT) AS pat_id')
        query.addSelect('CAST(shifted_order_time AS TIMESTAMP)')
        query.addSelect('stride_culture_micro.proc_code')
        query.addSelect('organism_name')  #one for the result

        # Let us look at top 10 commonly occuring bacteria
        query.addSelect(
            "CASE WHEN organism_name IS NULL THEN 1 ELSE 0 END AS NO_BACTERIA")
        query.addSelect(
            "CASE WHEN organism_name = 'ESCHERICHIA COLI' THEN 1 ELSE 0 END AS ESCHERICHIA_COLI"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'STAPHYLOCOCCUS AUREUS' THEN 1 ELSE 0 END AS STAPHYLOCOCCUS_AUREUS"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'ENTEROCOCCUS SPECIES' THEN 1 ELSE 0 END AS ENTEROCOCCUS_SPECIES"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'KLEBSIELLA PNEUMONIAE' THEN 1 ELSE 0 END AS KLEBSIELLA_PNEUMONIAE"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'PSEUDOMONAS AERUGINOSA' THEN 1 ELSE 0 END AS PSEUDOMONAS_AERUGINOSA"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'COAG NEGATIVE STAPHYLOCOCCUS' THEN 1 ELSE 0 END AS COAG_NEGATIVE_STAPHYLOCOCCUS"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'ENTEROCOCCUS FAECALIS' THEN 1 ELSE 0 END AS ENTEROCOCCUS_FAECALIS"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'PROTEUS MIRABILIS' THEN 1 ELSE 0 END AS PROTEUS_MIRABILIS"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'CANDIDA ALBICANS' THEN 1 ELSE 0 END AS CANDIDA_ALBICANS"
        )

        query.addFrom('stride_culture_micro')

        query.addWhereIn("stride_culture_micro.proc_code", [self._lab_panel])
        query.addWhereIn("pat_id", random_patient_list)
        query.addGroupBy('pat_id')
        query.addGroupBy('shifted_order_time')
        query.addGroupBy('stride_culture_micro.proc_code')
        query.addGroupBy('organism_name')
        query.addOrderBy('pat_id')
        query.addOrderBy('shifted_order_time')
        query.addOrderBy('stride_culture_micro.proc_code')
        query.addOrderBy('organism_name')
        query.setLimit(self._num_requested_episodes)

        self._num_reported_episodes = FeatureMatrix._query_patient_episodes(
            self,
            query,
            pat_id_col='pat_id',
            index_time_col='shifted_order_time')
Beispiel #19
0
    def _query_patient_episodes(self):
        log.info('Querying patient episodes...')
        # Initialize DB cursor.
        cursor = self._connection.cursor()

        # Build parameters for query.
        self._lab_components = self._get_components_in_lab_panel()
        random_patient_list = self._get_random_patient_list()

        # Build SQL query for list of patient episodes.
        # Note that for 2008-2014 data, result_flag can take on any of the
        # following values:  High, Low, High Panic, Low Panic,
        # Low Off-Scale, Negative, Positive, Resistant, Susceptible, Abnormal, *
        # (NONE): Only 27 lab components can have this flag. None has this
        #           value for more than 5 results, so ignore it.
        # *: Only 10 lab components can have this flag. Only 6 have it for
        #           >10 tests, and each of them is a microbiology test for which
        #           a flag is less meaningful, e.g. Gram Stain, AFB culture.
        # Susceptible: Only 15 lab components can have this flag. All of those
        #           only have this value for 2 results, so ignore it.
        # Resistant: Only 1 lab component can have this flag. Only two results
        #           have this value, so ignore it.
        # Abnormal: 1462 lab components can have this flag. Many (e.g. UBLOOD)
        #           have this value for thousands of results, so include it.
        # Negative: Only 10 lab components can have this flag, and all for
        #           less than 5 results, so ignore it.
        # Positive: Only 3 lab components can have this flag, and all for
        #           only 1 result, so ignore it.
        # Low Off-Scale: Only 1 lab component can have this flag, and only for
        #           3 results, so ignore it.
        # Low Panic: 1401 lab components can have this flag, many core
        #           metabolic components. Include it.
        # High Panic: 8084 lab components can have this flag, many core
        #           metabolic components. Include it.

        if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':

            query = SQLQuery()
            '''
            pat_id: hashed patient id
            '''
            query.addSelect('CAST(pat_id AS BIGINT) as pat_id')
            '''
            order_proc_id: unique identifier for an episode
            '''
            query.addSelect('sop.order_proc_id')
            '''
            self._varTypeInTable: usually proc_code or base_name, the column of the lab to be queried
            '''
            query.addSelect(self._varTypeInTable)
            '''
            order_time: The time of the order. Note that sor table does not have this info. 
            '''
            query.addSelect('order_time')
            '''
            y-labels related columns, choose one to predict (for now, use all_components_normal to predict). 
            '''
            if self._isLabPanel:
                query.addSelect(
                    "CASE WHEN abnormal_yn = 'Y' THEN 1 ELSE 0 END AS abnormal_panel"
                )  #
                query.addSelect(
                    "SUM(CASE WHEN result_flag IN ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') OR result_flag IS NULL THEN 1 ELSE 0 END) AS num_components"
                )  # sx
                query.addSelect(
                    "SUM(CASE WHEN result_flag IS NULL THEN 1 ELSE 0 END) AS num_normal_components"
                )  # sx
                query.addSelect(
                    "CAST(SUM(CASE WHEN result_flag IN ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') THEN 1 ELSE 0 END) = 0 AS INT) AS all_components_normal"
                )  # sx
            else:
                query.addSelect(
                    "CASE WHEN result_flag IN ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') THEN 0 ELSE 1 END AS component_normal"
                )
            '''
            Relevant tables. Note that sor table does not have patient_id info; need to join sop to obtain it.  
            '''
            query.addFrom('stride_order_proc AS sop')
            query.addFrom('stride_order_results AS sor')
            query.addWhere('sop.order_proc_id = sor.order_proc_id')
            '''
            Condition: self._time_limit[0] < order_time < self._time_limit[1]
            '''
            if self._time_limit:
                if self._time_limit[0]:
                    query.addWhere("sop.order_time > '%s'" %
                                   self._time_limit[0])
                if self._time_limit[1]:
                    query.addWhere("sop.order_time < '%s'" %
                                   self._time_limit[1])

            query.addWhere(
                "(result_flag in ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') OR result_flag IS NULL)"
            )
            query.addWhereIn(self._varTypeInTable, [self._lab_var])  # sx
            query.addWhereIn("pat_id", random_patient_list)

            query.addGroupBy('pat_id')
            query.addGroupBy('sop.order_proc_id')
            query.addGroupBy(self._varTypeInTable)
            query.addGroupBy('order_time')
            if not self._isLabPanel:
                query.addGroupBy('result_flag')
            query.addGroupBy('abnormal_yn')  #

            query.addOrderBy('pat_id')
            query.addOrderBy('sop.order_proc_id')
            query.addOrderBy(self._varTypeInTable)
            query.addOrderBy('order_time')

            query.setLimit(self._num_requested_episodes)

            self._num_reported_episodes = FeatureMatrix._query_patient_episodes(
                self, query, index_time_col='order_time')

        else:
            '''
            Sqlite3 has an interesting limit for the total number of place_holders in a query, 
            and this limit varies across platforms/operating systems (500-99999 on mac, 999 by defaulty). 
            
            To avoid this problem when querying 1000-10000 patient ids, use string queries instead of the
            default (convenient) routine in DBUtil.  
            '''

            query_str = "SELECT CAST(pat_id AS BIGINT) AS pat_id, order_proc_id, %s, order_time, " % self._varTypeInTable
            if not self._isLabPanel:
                query_str += "CASE WHEN result_in_range_yn = 'Y' THEN 1 ELSE 0 END AS component_normal "
            else:
                query_str += "CAST(SUM(CASE WHEN result_in_range_yn != 'Y' THEN 1 ELSE 0 END) = 0 AS INT) AS all_components_normal "
            query_str += "FROM labs "
            query_str += "WHERE %s = '%s' " % (self._varTypeInTable,
                                               self._lab_var)
            query_str += "AND pat_id IN "
            pat_list_str = "("
            for pat_id in random_patient_list:
                pat_list_str += str(pat_id) + ","
            pat_list_str = pat_list_str[:-1] + ") "
            query_str += pat_list_str
            query_str += "GROUP BY pat_id, order_proc_id, %s, order_time " % self._varTypeInTable
            query_str += "ORDER BY pat_id, order_proc_id, %s, order_time " % self._varTypeInTable
            query_str += "LIMIT %d" % self._num_requested_episodes

            self._num_reported_episodes = FeatureMatrix._query_patient_episodes(
                self, query_str, index_time_col='order_time')
Beispiel #20
0
    def _get_random_patient_list(self):
        # Initialize DB cursor.
        cursor = self._connection.cursor()

        query = SQLQuery()
        query.addSelect('CAST(pat_id AS BIGINT) AS pat_id')

        if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':
            if self._isLabPanel:
                query.addSelect('COUNT(sop.order_proc_id) AS num_orders')
                query.addFrom('stride_order_proc AS sop')
                query.addFrom('stride_order_results AS sor')

                if self._time_limit:
                    if self._time_limit[0]:
                        query.addWhere("sop.order_time > '%s'" %
                                       self._time_limit[0])
                    if self._time_limit[1]:
                        query.addWhere("sop.order_time < '%s'" %
                                       self._time_limit[1])

                query.addWhere('sop.order_proc_id = sor.order_proc_id')
                query.addWhereIn('proc_code', [self._lab_var])
                '''
                sbala: Technically it's possible for someone to get a lab ordered without getting results
                '''
                query.addWhereIn("base_name", self._lab_components)

            else:
                query.addSelect('COUNT(sor.order_proc_id) AS num_orders')
                query.addFrom('stride_order_proc AS sop')
                query.addFrom('stride_order_results AS sor')
                query.addWhere('sop.order_proc_id = sor.order_proc_id')
                ##
                query.addWhereIn("base_name", [self._lab_var])
        elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':
            query_str = "SELECT CAST(pat_id AS BIGINT) AS pat_id , "
            query_str += "COUNT(order_proc_id) AS num_orders "
            query_str += "FROM labs "
            # query_str += " WHERE %s IN (%s) "%(self._varTypeInTable, self._lab_var)
            query_str += "WHERE %s = '%s' " % (self._varTypeInTable,
                                               self._lab_var)
            if self.notUsePatIds:
                query_str += "AND pat_id NOT IN ("
                for pat_id in self.notUsePatIds:
                    query_str += "%s," % pat_id
                query_str = query_str[:-1] + ") "  # get rid of comma
            query_str += "GROUP BY pat_id"

            log.debug('Querying median orders per patient...')

            # TODO: best way to integrate UMich code
            results = DBUtil.execute(query_str)
            order_counts = [row[1] for row in results]

            if len(results) == 0:
                error_msg = '0 orders for order "%s."' % self._lab_var  # sx
                log.critical(error_msg)
                raise Exception(error_msg)
                # sys.exit('[ERROR] %s' % error_msg) # sxu: sys.exit cannot be caught by Exception
            else:
                avg_orders_per_patient = numpy.median(order_counts)
                log.info('avg_orders_per_patient: %s' % avg_orders_per_patient)
                # Based on average # of results, figure out how many patients we'd
                # need to get for a feature matrix of requested size.
                self._num_patients = int(numpy.max([self._num_requested_episodes / \
                                                    avg_orders_per_patient, 1]))
                # Some components may have fewer associated patients than the required sample size
                patient_number_chosen = min([len(results),
                                             self._num_patients])  #
                inds_random_patients = numpy.random.choice(
                    len(results), size=patient_number_chosen, replace=False)
                # print 'inds_random_patients:', inds_random_patients
                pat_IDs_random_patients = []
                for ind in inds_random_patients:
                    pat_IDs_random_patients.append(results[ind][0])
                return pat_IDs_random_patients

        else:
            query.addSelect('COUNT(order_proc_id) AS num_orders')
            query.addFrom('labs')
            if self._isLabPanel:
                query.addWhereIn("proc_code", [self._lab_var])
                query.addWhereIn("base_name", self._lab_components)
            else:
                query.addWhereIn("base_name", [self._lab_var])
        '''
        Fo hold-out set, do not use the patients already used in training/validation. 
        '''
        if self._notUsePatIds:
            query.addWhereNotIn('pat_id', self._notUsePatIds)

        query.addGroupBy('pat_id')

        log.debug('Querying the number of orders per patient...')

        results = DBUtil.execute(query)

        order_counts = [row[1] for row in results]

        if len(results) == 0:
            error_msg = '0 orders for component "%s."' % self._lab_var  # sx
            log.critical(error_msg)
            raise Exception(error_msg)
            # sys.exit('[ERROR] %s' % error_msg) # sxu: sys.exit cannot be caught by Exception
        else:
            avg_orders_per_patient = numpy.median(order_counts)
            log.info('avg_orders_per_patient: %s' % avg_orders_per_patient)
            # Based on average # of results, figure out how many patients we'd
            # need to get for a feature matrix of requested size.
            self._num_patients = int(numpy.max([self._num_requested_episodes / \
                                                avg_orders_per_patient, 1]))

            # Some components may have fewer associated patients than the required sample size
            patient_number_chosen = min([len(results), self._num_patients])  #
            '''
            Set seed to ensure re-producibility of patient episodes.
            Recover int random_state here, since numpy requires int while sql requires [-1,1]
            '''
            numpy.random.seed(int(self._random_state * float(sys.maxsize)))
            inds_random_patients = numpy.random.choice(
                len(results), size=patient_number_chosen, replace=False)

            pat_IDs_random_patients = [
                results[ind][0] for ind in inds_random_patients
            ]

            return pat_IDs_random_patients
					pat_id, medication_id, description
				"""
				DATA_QUERY = SQLQuery();
				DATA_QUERY.addSelect("pat_id")
				DATA_QUERY.addSelect("medication_id")
				DATA_QUERY.addSelect("description")
				DATA_QUERY.addSelect("COUNT(*) as medication_co_occurrence")

				DATA_QUERY.addFrom("stride_order_med")
				DATA_QUERY.addWhere("pat_id = '{0}'".format(pat_id))
				DATA_QUERY.addWhere("ordering_date >= '{0}'".format(window_lower))
				DATA_QUERY.addWhere("ordering_date <= '{0}'".format(window_upper))
				if (pat_id in orderset_order_med_ids.keys()):
					DATA_QUERY.addWhere("order_med_id NOT IN ({0})".format(order_med_id_str)) # do not count order instances ordered with an order set; we only want a la carte instances

				DATA_QUERY.addGroupBy("pat_id")
				DATA_QUERY.addGroupBy("medication_id")
				DATA_QUERY.addGroupBy("description")

				# print(DATA_QUERY)
				results = DBUtil.execute(DATA_QUERY);
				for line in results:
					medication = int(line[1])
					description = line[2]
					count = int(line[3])

					order_description_map[medication] = description.replace(",", ";")

					# consider only orders already in orderset
					if (orderset in orderset_orders and medication in orderset_orders[orderset].keys()): # medication is in order set
						if (medication not in co_occurrence_medication_counts):
Beispiel #22
0
def main(argv):
    medIdsByActiveRx = dict()
    medIdsByActiveRx['Buprenorphine'] = ('125498', '114474', '212560',
                                         '114475', '114467', '114468')
    medIdsByActiveRx['Fentanyl Patch'] = ('2680', '27908', '125379', '27905',
                                          '27906', '540107', '540638',
                                          '540101', '27907')
    medIdsByActiveRx['Methadone'] = ('540483', '4953', '4951', '10546',
                                     '214468', '15996', '41938', '4954',
                                     '4952')
    medIdsByActiveRx['Hydrocodone'] = ('3724', '4579', '8576', '8577', '8951',
                                       '10204', '12543', '13040', '14963',
                                       '14965', '14966', '17061', '17927',
                                       '19895', '20031', '28384', '29486',
                                       '29487', '34505', '34544', '35613',
                                       '117862', '204249', '206739')
    medIdsByActiveRx['Hydromorphone'] = ('2458', '2459', '2464', '2465',
                                         '3757', '3758', '3759', '3760',
                                         '3761', '10224', '10225', '10226',
                                         '10227', '200439', '201094', '201096',
                                         '201098', '540125', '540179',
                                         '540666')
    medIdsByActiveRx['Morphine'] = (
        '5167', '5168', '5172', '5173', '5176', '5177', '5178', '5179', '5180',
        '5183', '6977', '10655', '15852', '20908', '20909', '20910', '20914',
        '20915', '20919', '20920', '20921', '20922', '29464', '30138', '31413',
        '36140', '36141', '79691', '87820', '89282', '91497', '95244', '96810',
        '112562', '112564', '115335', '115336', '126132', '198543', '198544',
        '198623', '201842', '201848', '205011', '206731', '207949', '208896',
        '540182', '540300')
    medIdsByActiveRx['Oxycodone'] = ('5940', '5941', '6122', '6981', '10812',
                                     '10813', '10814', '14919', '16121',
                                     '16123', '16129', '16130', '19187',
                                     '26637', '26638', '27920', '27921',
                                     '27922', '27923', '28897', '28899',
                                     '28900', '31851', '31852', '31863',
                                     '31864', '92248', '126939', '200451',
                                     '203690', '203691', '203692', '203705',
                                     '203706', '203707', '204020', '204021')

    query = baseQuery()
    totalPatients = float(DBUtil.execute(query)[0][0])

    # print"Total Patients\t%s" % totalPatients

    # print"======= Dx Groups ===========";
    # print"Dx Group\tPt Count\tDx Rate";
    patientsPerDxGroup = dict()
    query = SQLQuery()
    query.addSelect("count(distinct prob.pat_id) as ptCount")
    query.addSelect("prob.%s" % DX_COL)
    query.addFrom("stride_problem_list as prob")
    query.addWhereOp("prob.noted_date", "<", LIMIT_DATE)
    query.addGroupBy("prob.%s" % DX_COL)
    if DOUBLE_DX:
        query.addSelect("prob2.%s" % DX_COL)
        query.addFrom("stride_problem_list as prob2")
        query.addWhere("prob.pat_id = prob2.pat_id")
        query.addWhereOp("prob2.noted_date", "<", LIMIT_DATE)
        query.addGroupBy("prob2.%s" % DX_COL)
    results = DBUtil.execute(query)
    for row in results:
        patientCount = row[0]
        dxGroup = row[1]
        if DOUBLE_DX:
            dxGroup = (dxGroup, row[2])
            # Composite tuple including second diagnosis
        patientsPerDxGroup[dxGroup] = patientCount

    progress = ProgressDots()
    for activeRx, medIds in medIdsByActiveRx.iteritems():
        query = baseQuery()
        query.addWhereIn("medication_id", medIds)

        # Baseline prescription rates
        rxPtCount = DBUtil.execute(query)[0][0]

        # print"====== Rx Counts ======";
        # print"Rx\tPt Count\tRx Rate";
        # print"%s\t%s\t%s" % (activeRx, rxPtCount, (rxPtCount/totalPatients));

        # print"======== Rx-Dx Association ========";
        statIds = (
            "P-Fisher",
            "P-YatesChi2",
            "oddsRatio",
            "relativeRisk",
            "interest",
            "LR+",
            "LR-",
            "sensitivity",
            "specificity",
            "PPV",
            "NPV",
        )
        if progress.getCounts() == 0:
            headerCols = [
                "Rx", "Dx", "RxDxCount", "RxCount", "DxCount", "Total"
            ]
            if DOUBLE_DX:
                headerCols.insert(2, "Dx2")
            headerCols.extend(statIds)
            headerStr = str.join("\t", headerCols)
            print headerStr

        # Query out per diagnosis group, but do as aggregate grouped query
        query.addSelect("prob.%s" % DX_COL)
        query.addFrom("stride_problem_list as prob")
        query.addWhere("med.pat_id = prob.pat_id")
        query.addWhereOp("prob.noted_date", "<", LIMIT_DATE)
        #query.addWhereIn("prob.%s" % DX_COL, dxKeys );
        query.addGroupBy("prob.%s" % DX_COL)
        if DOUBLE_DX:
            query.addSelect("prob2.%s" % DX_COL)
            query.addFrom("stride_problem_list as prob2")
            query.addWhere("prob.pat_id = prob2.pat_id")
            query.addWhereOp("prob2.noted_date", "<", LIMIT_DATE)
            query.addGroupBy("prob2.%s" % DX_COL)
        results = DBUtil.execute(query)
        for row in results:
            rxDxPtCount = row[0]
            dxGroup = row[1]
            if DOUBLE_DX:
                dxGroup = (dxGroup, row[2])
                # Composite tuple including second diagnosis
            dxPtCount = patientsPerDxGroup[dxGroup]

            conStats = ContingencyStats(rxDxPtCount, rxPtCount, dxPtCount,
                                        totalPatients)

            dataCells = [
                activeRx, dxGroup, rxDxPtCount, rxPtCount, dxPtCount,
                totalPatients
            ]
            if DOUBLE_DX:
                dataCells[1] = dxGroup[0]
                dataCells.insert(2, dxGroup[1])
            for statId in statIds:
                try:
                    dataCells.append(conStats[statId])
                except ZeroDivisionError:
                    dataCells.append(None)
            for i, value in enumerate(dataCells):
                dataCells[i] = str(value)
                # String conversion to allow for concatenation below
            dataStr = str.join("\t", dataCells)
            print dataStr
            progress.update()
    progress.printStatus()
Beispiel #23
0
    def _query_patient_episodes(self):
        log.info('Querying patient episodes...')
        # Initialize DB cursor.
        cursor = self._connection.cursor()

        # Build parameters for query.
        self._lab_components = self._get_components_in_lab_panel()
        random_patient_list = self._get_random_patient_list()
        self.usedPatIds = random_patient_list[:]

        # Build SQL query for list of patient episodes.
        # Note that for 2008-2014 data, result_flag can take on any of the
        # following values:  High, Low, High Panic, Low Panic,
        # Low Off-Scale, Negative, Positive, Resistant, Susceptible, Abnormal, *
        # (NONE): Only 27 lab components can have this flag. None has this
        #           value for more than 5 results, so ignore it.
        # *: Only 10 lab components can have this flag. Only 6 have it for
        #           >10 tests, and each of them is a microbiology test for which
        #           a flag is less meaningful, e.g. Gram Stain, AFB culture.
        # Susceptible: Only 15 lab components can have this flag. All of those
        #           only have this value for 2 results, so ignore it.
        # Resistant: Only 1 lab component can have this flag. Only two results
        #           have this value, so ignore it.
        # Abnormal: 1462 lab components can have this flag. Many (e.g. UBLOOD)
        #           have this value for thousands of results, so include it.
        # Negative: Only 10 lab components can have this flag, and all for
        #           less than 5 results, so ignore it.
        # Positive: Only 3 lab components can have this flag, and all for
        #           only 1 result, so ignore it.
        # Low Off-Scale: Only 1 lab component can have this flag, and only for
        #           3 results, so ignore it.
        # Low Panic: 1401 lab components can have this flag, many core
        #           metabolic components. Include it.
        # High Panic: 8084 lab components can have this flag, many core
        #           metabolic components. Include it.

        if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':  # TODO: component
            query = SQLQuery()
            query.addSelect('CAST(pat_id AS BIGINT)')
            query.addSelect('sop.order_proc_id AS order_proc_id')
            query.addSelect('proc_code')  #TODO:sx
            query.addSelect('order_time')
            query.addSelect(
                "CASE WHEN abnormal_yn = 'Y' THEN 1 ELSE 0 END AS abnormal_panel"
            )  #sx
            query.addSelect(
                "SUM(CASE WHEN result_flag IN ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') OR result_flag IS NULL THEN 1 ELSE 0 END) AS num_components"
            )  #sx
            query.addSelect(
                "SUM(CASE WHEN result_flag IS NULL THEN 1 ELSE 0 END) AS num_normal_components"
            )  #sx
            query.addSelect(
                "CAST(SUM(CASE WHEN result_flag IN ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') THEN 1 ELSE 0 END) = 0 AS INT) AS all_components_normal"
            )  #sx
            query.addFrom('stride_order_proc AS sop')  #sx
            query.addFrom('stride_order_results AS sor')  #sx
            query.addWhere('sop.order_proc_id = sor.order_proc_id')  #sx
            query.addWhere(
                "(result_flag in ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') OR result_flag IS NULL)"
            )
            query.addWhereIn("proc_code", [self._lab_var])  #sx
            query.addWhereIn("pat_id", random_patient_list)
            query.addGroupBy('pat_id')
            query.addGroupBy('sop.order_proc_id')  #sx
            query.addGroupBy('proc_code')  #sx
            query.addGroupBy('order_time')
            query.addGroupBy('abnormal_yn')  #sx
            query.addOrderBy('pat_id')
            query.addOrderBy('sop.order_proc_id')  #sx
            query.addOrderBy('proc_code')  #sx
            query.addOrderBy('order_time')
            query.setLimit(self._num_requested_episodes)

            self._num_reported_episodes = FeatureMatrix._query_patient_episodes(
                self, query, index_time_col='order_time')

        elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':
            # query = SQLQuery()
            # query.addSelect('CAST(pat_id AS BIGINT) AS pat_id')
            # query.addSelect('order_proc_id')
            # query.addSelect(self._varTypeInTable)
            # query.addSelect('order_time')
            #
            # query.addWhereIn(self._varTypeInTable, [self._lab_var])
            #
            # if self._isLabPanel:
            #
            #     query.addSelect("SUM(CASE WHEN result_in_range_yn IN ('N', 'Y') THEN 1 ELSE 0 END) AS num_components")
            #     query.addSelect("SUM(CASE WHEN result_in_range_yn = 'Y' THEN 1 ELSE 0 END) AS num_normal_components")
            #     query.addSelect("CAST(SUM(CASE WHEN result_in_range_yn = 'N' THEN 1 ELSE 0 END) = 0 AS INT) AS all_components_normal") #TODO
            # else:
            #     query.addSelect("CASE WHEN result_in_range_yn = 'Y' THEN 1 ELSE 0 END AS component_normal")
            #
            # query.addFrom('labs')
            # query.addWhereIn("pat_id", random_patient_list)
            #
            # query.addGroupBy('pat_id')
            # query.addGroupBy('order_proc_id')
            # query.addGroupBy(self._varTypeInTable)
            # query.addGroupBy('order_time')
            #
            # query.addOrderBy('pat_id')
            # query.addOrderBy('order_proc_id')
            # query.addOrderBy(self._varTypeInTable)
            # query.addOrderBy('order_time')
            #
            # query.setLimit(self._num_requested_episodes)
            #
            # print query

            query_str = "SELECT CAST(pat_id AS BIGINT) AS pat_id, order_proc_id, %s, order_time, " % self._varTypeInTable
            if not self._isLabPanel:
                query_str += "CASE WHEN result_in_range_yn = 'Y' THEN 1 ELSE 0 END AS component_normal "
            else:
                query_str += "CAST(SUM(CASE WHEN result_in_range_yn != 'Y' THEN 1 ELSE 0 END) = 0 AS INT) AS all_components_normal "
            # query_str += "CASE WHEN result_in_range_yn = 'Y' THEN 1 ELSE 0 END AS component_normal "
            query_str += "FROM labs "
            query_str += "WHERE %s = '%s' " % (self._varTypeInTable,
                                               self._lab_var)
            query_str += "AND pat_id IN "
            pat_list_str = "("
            for pat_id in random_patient_list:
                pat_list_str += str(pat_id) + ","
            pat_list_str = pat_list_str[:-1] + ") "
            query_str += pat_list_str
            query_str += "GROUP BY pat_id, order_proc_id, %s, order_time " % self._varTypeInTable
            query_str += "ORDER BY pat_id, order_proc_id, %s, order_time " % self._varTypeInTable
            query_str += "LIMIT %d" % self._num_requested_episodes

            self._num_reported_episodes = FeatureMatrix._querystr_patient_episodes(
                self, query_str, index_time_col='order_time')
Beispiel #24
0
    def _query_patient_episodes(self, pat_id_col="pat_deid"):
        log.info('Querying patient episodes...')
        # Initialize DB cursor.
        cursor = self._connection.cursor()

        # Build parameters for query.
        self._lab_components = self._get_components_in_lab_panel()
        random_patient_list = self._get_random_patient_list()

        print("RANDOM PATIENT LIST", random_patient_list)

        # Build SQL query for list of patient episodes.
        # Note that for 2008-2014 data, result_flag can take on any of the
        # following values:  High, Low, High Panic, Low Panic,
        # Low Off-Scale, Negative, Positive, Resistant, Susceptible, Abnormal, *
        # (NONE): Only 27 lab components can have this flag. None has this
        #           value for more than 5 results, so ignore it.
        # *: Only 10 lab components can have this flag. Only 6 have it for
        #           >10 tests, and each of them is a microbiology test for which
        #           a flag is less meaningful, e.g. Gram Stain, AFB culture.
        # Susceptible: Only 15 lab components can have this flag. All of those
        #           only have this value for 2 results, so ignore it.
        # Resistant: Only 1 lab component can have this flag. Only two results
        #           have this value, so ignore it.
        # Abnormal: 1462 lab components can have this flag. Many (e.g. UBLOOD)
        #           have this value for thousands of results, so include it.
        # Negative: Only 10 lab components can have this flag, and all for
        #           less than 5 results, so ignore it.
        # Positive: Only 3 lab components can have this flag, and all for
        #           only 1 result, so ignore it.
        # Low Off-Scale: Only 1 lab component can have this flag, and only for
        #           3 results, so ignore it.
        # Low Panic: 1401 lab components can have this flag, many core
        #           metabolic components. Include it.
        # High Panic: 8084 lab components can have this flag, many core
        #           metabolic components. Include it.
        query = SQLQuery()
        #query.addSelect('CAST(micro.pat_deid AS BIGINT)')
        query.addSelect('CAST(micro.order_time AS TIMESTAMP)')
        #query.addSelect('sop.order_proc_id AS order_proc_id')
        query.addSelect('proc_code')
        query.addSelect('order_time')
        query.addSelect('organism_name') #one for the result
        # query.addSelect("CASE WHEN abnormal_yn = 'Y' THEN 1 ELSE 0 END AS abnormal_panel")
        # query.addSelect("SUM(CASE WHEN result_flag IN ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') OR result_flag IS NULL THEN 1 ELSE 0 END) AS num_components")
        # query.addSelect("SUM(CASE WHEN result_flag IS NULL THEN 1 ELSE 0 END) AS num_normal_components")
        # query.addSelect("CAST(SUM(CASE WHEN result_flag IN ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') THEN 1 ELSE 0 END) = 0 AS INT) AS all_components_normal")
        # query.addFrom('stride_order_proc AS sop')
        # query.addFrom('stride_order_results AS sor')
        query.addFrom('micro')
        #query.addWhere('sop.order_proc_id = sor.order_proc_id')
        #query.addWhere("(result_flag in ('High', 'Low', 'High Panic', 'Low Panic', '*', 'Abnormal') OR result_flag IS NULL)")
        #query.addWhereIn("micro.proc_code", [self._lab_panel])
        query.addWhereIn("micro.pat_deid", random_patient_list)
        query.addGroupBy('micro.pat_deid')
        #query.addGroupBy('sop.order_proc_id')
        query.addGroupBy('proc_code')
        query.addGroupBy('order_time')
        query.addGroupBy('organism_name')
        #query.addGroupBy('abnormal_yn')
        query.addOrderBy('micro.pat_deid')
        #query.addOrderBy('sop.order_proc_id')
        query.addOrderBy('proc_code')
        query.addOrderBy('micro.order_time')
        query.setLimit(self._num_requested_episodes)

        self._num_reported_episodes = FeatureMatrix._query_patient_episodes(self, query, index_time_col='order_time')
Beispiel #25
0
    def grade_cases(self, sim_patient_ids, sim_grader_id, conn=None):
        """Given the identifiers for a bunch of simulated physician-patient case
        records, and the identifier for a particular grading key to use,
        calculate what grade each case would get based on the choices made
        and return a dictionary of case grades (keyed by the case ID).
        """
        ext_conn = True
        if conn is None:
            conn = self.connFactory.connection()
            ext_conn = False
        try:
            # Inner query retrieves physician-patient cases with ranking group_names (to later select first)
            # per case for specified cases. Each NULL group_name is treated as a separate group by assigning it
            # sim_patient_order_id. It also omits Default user (sim_user_id = 0) from grading.
            inner_query = SQLQuery()
            inner_query.addSelect("score")
            inner_query.addSelect(
                "rank() over ("  # ranks rows incrementally in the same group
                "    partition by coalesce(group_name, sim_patient_order_id::text), sim_patient_id"
                "    order by sim_patient_order_id"
                ")")
            inner_query.addSelect("sim_user_id")
            inner_query.addSelect("sim_patient_id")
            inner_query.addSelect("sim_grader_id")
            inner_query.addFrom("sim_patient_order spo")
            inner_query.addJoin(
                "sim_grading_key sgk",
                "sgk.clinical_item_id = spo.clinical_item_id"
                "    and sgk.sim_state_id = spo.sim_state_id")
            inner_query.addWhereEqual("sgk.sim_grader_id", sim_grader_id)
            inner_query.addWhereNotEqual(
                "spo.sim_user_id",
                0)  # 0 = ignore 'Default user', sets up initial cases
            inner_query.addWhereIn("spo.sim_patient_id", sim_patient_ids)
            inner_query.addOrderBy("relative_time_start")
            inner_query.addOrderBy("sim_patient_order_id")

            # Outer query sums the score per patient case and selects most graded physician for the case.
            # Theoretically, it isn't necessarily the most active physician for the case since his orders
            # might have been dropped by selecting only the first record within group_name group.
            query = SQLQuery()
            query.addSelect("sim_patient_id")
            query.addSelect("sim_grader_id")
            query.addSelect("sum(score) as total_score")
            query.addSelect(
                "mode() within group ("  # mode() selects most frequent value within group
                "    order by sim_user_id"
                ") as most_graded_user_id")
            query.addFrom("(" + str(inner_query) + ") as ranked_groups")
            query.addWhereEqual("ranked_groups.rank",
                                1)  # count only first order in the same group
            query.addGroupBy("sim_patient_id")
            query.addGroupBy("sim_grader_id")

            query_params = inner_query.getParams() + query.getParams()
            grades_table = DBUtil.execute(query,
                                          query_params,
                                          includeColumnNames=True,
                                          conn=conn)
            grades_model = modelListFromTable(grades_table)

            # get most active users for the cases
            most_active_user_query = SQLQuery()
            most_active_user_query.addSelect("sim_patient_id")
            most_active_user_query.addSelect("mode() within group ("
                                             "    order by sim_user_id"
                                             ") as most_active_user_id")
            most_active_user_query.addFrom("sim_patient_order")
            most_active_user_query.addWhereNotEqual("sim_user_id",
                                                    0)  # ignore Default user
            most_active_user_query.addWhereIn("sim_patient_id",
                                              sim_patient_ids)
            most_active_user_query.addGroupBy("sim_patient_id")
            most_active_user_query.addOrderBy("sim_patient_id")

            most_active_user_table = DBUtil.execute(most_active_user_query,
                                                    includeColumnNames=True,
                                                    conn=conn)
            most_active_user_model = modelListFromTable(most_active_user_table)
            # make a dict by sim_patient_id out of results - will be used for combining
            most_active_user_dict = {
                most_active_user["sim_patient_id"]: most_active_user
                for most_active_user in most_active_user_model
            }

            # combine results
            complete_grades = [
                grade.update(most_active_user_dict[grade["sim_patient_id"]])
                for grade in grades_model
            ]

            return complete_grades

        finally:
            if not ext_conn:
                conn.close()
Beispiel #26
0
    def _query_patient_episodes(self):
        log.info('Querying patient episodes...')
        # Initialize DB cursor.
        cursor = self._connection.cursor()

        # Build parameters for query.
        self._lab_components = self._get_components_in_lab_panel()
        random_patient_list = self._get_random_patient_list()

        #print("RANDOM PATIENT LIST", random_patient_list)

        # Build SQL query for list of patient episodes.
        # Note that for 2008-2014 data, result_flag can take on any of the
        # following values:  High, Low, High Panic, Low Panic,
        # Low Off-Scale, Negative, Positive, Resistant, Susceptible, Abnormal, *
        # (NONE): Only 27 lab components can have this flag. None has this
        #           value for more than 5 results, so ignore it.
        # *: Only 10 lab components can have this flag. Only 6 have it for
        #           >10 tests, and each of them is a microbiology test for which
        #           a flag is less meaningful, e.g. Gram Stain, AFB culture.
        # Susceptible: Only 15 lab components can have this flag. All of those
        #           only have this value for 2 results, so ignore it.
        # Resistant: Only 1 lab component can have this flag. Only two results
        #           have this value, so ignore it.
        # Abnormal: 1462 lab components can have this flag. Many (e.g. UBLOOD)
        #           have this value for thousands of results, so include it.
        # Negative: Only 10 lab components can have this flag, and all for
        #           less than 5 results, so ignore it.
        # Positive: Only 3 lab components can have this flag, and all for
        #           only 1 result, so ignore it.
        # Low Off-Scale: Only 1 lab component can have this flag, and only for
        #           3 results, so ignore it.
        # Low Panic: 1401 lab components can have this flag, many core
        #           metabolic components. Include it.
        # High Panic: 8084 lab components can have this flag, many core
        #           metabolic components. Include it.
        query = SQLQuery()
        query.addSelect('CAST(pat_anon_id AS BIGINT) AS pat_id')
        query.addSelect('CAST(shifted_order_time AS TIMESTAMP)')
        query.addSelect('stride_culture_micro.proc_code')
        query.addSelect('organism_name')  #one for the result

        # Experimenting
        susceptibility_flags = [
            'Trimethoprim/Sulfamethoxazole', 'Vancomycin', 'Penicillin',
            'Levofloxacin', 'Clindamycin', 'Ceftriaxone', 'Erythromycin',
            'Ampicillin', 'Meropenem', 'Ciprofloxacin', 'Cefepime',
            'Aztreonam.', 'Ampicillin/Sulbactam', 'Piperacillin/Tazobactam',
            'Linezolid', 'Oxacillin.', 'Cefazolin', 'Daptomycin'
        ]

        for med in susceptibility_flags:
            med_col = med.replace('/', '_').replace('.', '')
            query.addSelect(
                "MAX(CASE WHEN antibiotic_name = '%s' AND (suseptibility = 'Susceptible' OR suseptibility = 'Positive') THEN 1 ELSE 0 END) AS %s_Susc"
                % (med, med_col))
            query.addSelect(
                "MAX(CASE WHEN antibiotic_name = '%s' THEN 1 ELSE 0 END) as %s_tested"
                % (med, med_col))

        # Let us look at top 10 commonly occuring bacteria
        query.addSelect(
            "CASE WHEN organism_name IS NULL THEN 0 ELSE 1 END AS BACTERIA_PRESENT"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'ESCHERICHIA COLI' THEN 1 ELSE 0 END AS ESCHERICHIA_COLI"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'STAPHYLOCOCCUS AUREUS' THEN 1 ELSE 0 END AS STAPHYLOCOCCUS_AUREUS"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'ENTEROCOCCUS SPECIES' THEN 1 ELSE 0 END AS ENTEROCOCCUS_SPECIES"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'KLEBSIELLA PNEUMONIAE' THEN 1 ELSE 0 END AS KLEBSIELLA_PNEUMONIAE"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'PSEUDOMONAS AERUGINOSA' THEN 1 ELSE 0 END AS PSEUDOMONAS_AERUGINOSA"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'COAG NEGATIVE STAPHYLOCOCCUS' THEN 1 ELSE 0 END AS COAG_NEGATIVE_STAPHYLOCOCCUS"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'ENTEROCOCCUS FAECALIS' THEN 1 ELSE 0 END AS ENTEROCOCCUS_FAECALIS"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'PROTEUS MIRABILIS' THEN 1 ELSE 0 END AS PROTEUS_MIRABILIS"
        )
        query.addSelect(
            "CASE WHEN organism_name = 'CANDIDA ALBICANS' THEN 1 ELSE 0 END AS CANDIDA_ALBICANS"
        )

        query.addFrom('stride_culture_micro')

        query.addWhereIn("stride_culture_micro.proc_code", self._lab_panel)
        query.addWhereIn("pat_anon_id", random_patient_list)
        query.addGroupBy('pat_anon_id')
        query.addGroupBy('shifted_order_time')
        query.addGroupBy('stride_culture_micro.proc_code')
        query.addGroupBy('organism_name')
        query.addOrderBy('pat_anon_id')
        query.addOrderBy('shifted_order_time')
        query.addOrderBy('stride_culture_micro.proc_code')
        query.addOrderBy('organism_name')
        query.setLimit(self._num_requested_episodes)

        self._num_reported_episodes = FeatureMatrix._query_patient_episodes(
            self,
            query,
            pat_id_col='pat_id',
            index_time_col='shifted_order_time')
    def test_processPatientListInput(self):
        """Test processPatientListInput()."""
        # Verify FeatureMatrixFactory throws Error if patientListInput
        # has not been set.
        with self.assertRaises(ValueError):
            self.factory.processPatientListInput()

        # Initialize DB cursor.
        cursor = self.connection.cursor()

        # Build SQL query for list of patients.
        patientListQuery = SQLQuery()
        patientListQuery.addSelect("CAST(pat_id AS bigint)")
        patientListQuery.addFrom("stride_order_proc")
        patientListQuery.addWhere("proc_code = 'LABMETB'")
        patientListQuery.addGroupBy("pat_id")
        patientListQuery.addOrderBy("1 ASC")
        cursor.execute(str(patientListQuery), patientListQuery.params)

        # Set and process patientListInput.
        self.factory.setPatientListInput(cursor, "pat_id")
        self.factory.processPatientListInput()
        resultPatientIterator = self.factory.getPatientListIterator()

        # Verify results.
        expectedPatientList = ["-789", "-456", "-123"]
        for expectedPatientId in expectedPatientList:
            resultPatientId = resultPatientIterator.next()['pat_id']
            self.assertEqual(resultPatientId, expectedPatientId)
        resultPatientIterator.close()

        # Build TSV file for list of patients.
        patientList = \
            "patient_item_id\tpatient_id\tclinical_item_id\titem_date\n\
            -1000\t-123\t-100\t10/6/2113 10:20\n\
            -2000\t-123\t-200\t10/6/2113 11:20\n\
            -2500\t-123\t-100\t10/7/2113 11:20\n\
            -3000\t-456\t-100\t11/6/2113 10:20\n\
            -6000\t-789\t-200\t12/6/2113 11:20\n"

        patientListTsv = open("patient_list.tsv", "w")
        patientListTsv.write(patientList)
        patientListTsv.close()

        # Initialize new FeatureMatrixFactory.
        self.factory = FeatureMatrixFactory()

        # Set and process patientListInput.
        patientListTsv = open("patient_list.tsv", "r")
        self.factory.setPatientListInput(patientListTsv, "patient_id")
        self.factory.processPatientListInput()
        resultPatientIterator = self.factory.getPatientListIterator()

        # Verify results.
        expectedPatientList = ["-123", "-123", "-123", "-456", "-789"]
        for expectedPatientId in expectedPatientList:
            resultPatientId = resultPatientIterator.next()['patient_id']
            self.assertEqual(resultPatientId, expectedPatientId)
        patientListTsv.close()
        resultPatientIterator.close()

        # Clean up patient_list.
        try:
            os.remove("patient_list.tsv")
            os.remove("fmf.patient_list.tsv")
        except OSError:
            pass
    def test_buildFeatureMatrix_multiFlowsheet(self):
        """
        Test buildFeatureMatrix and addFlowsheet.
        """
        # Verify FeatureMatrixFactory throws Error if patientEpisodeInput
        # has not been set.
        with self.assertRaises(ValueError):
            self.factory.processPatientEpisodeInput()

        # Initialize DB cursor.
        cursor = self.connection.cursor()

        # Build SQL query for list of patient episodes.
        patientEpisodeQuery = SQLQuery()
        patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)")
        patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id")
        patientEpisodeQuery.addSelect("proc_code")
        patientEpisodeQuery.addSelect("order_time")
        patientEpisodeQuery.addSelect(
            "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results"
        )
        patientEpisodeQuery.addFrom("stride_order_proc AS sop")
        patientEpisodeQuery.addFrom("stride_order_results AS sor")
        patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id")
        patientEpisodeQuery.addWhereEqual("proc_code", "LABMETB")
        patientEpisodeQuery.addGroupBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        patientEpisodeQuery.addOrderBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params)

        # Set and process patientEpisodeInput.
        self.factory.setPatientEpisodeInput(cursor, "pat_id", "order_time")
        self.factory.processPatientEpisodeInput()
        resultEpisodeIterator = self.factory.getPatientEpisodeIterator()
        resultPatientEpisodes = list()
        for episode in resultEpisodeIterator:
            episode["pat_id"] = int(episode["pat_id"])
            episode["order_time"] = DBUtil.parseDateValue(
                episode["order_time"])
            resultPatientEpisodes.append(episode)

        # Verify results (note sort order).
        expectedPatientEpisodes = FM_TEST_OUTPUT[
            "test_processPatientEpisodeInput"]
        self.assertEqualList(resultPatientEpisodes, expectedPatientEpisodes)

        # Add flowsheet features.
        flowsheetNames = ["Resp", "FiO2", "Glasgow Coma Scale Score"]
        # Look for lab data 90 days before each episode, but never afterself.
        preTimeDelta = datetime.timedelta(-90)
        postTimeDelta = datetime.timedelta(0)
        self.factory.addFlowsheetFeatures(flowsheetNames, preTimeDelta,
                                          postTimeDelta)
        self.factory.buildFeatureMatrix()
        resultMatrix = self.factory.readFeatureMatrixFile()

        # Verify results.
        expectedMatrix = FM_TEST_OUTPUT[
            "test_buildFeatureMatrix_multiFlowsheet"]["expectedMatrix"]
        self.assertEqualTable(expectedMatrix, resultMatrix[2:], precision=5)

        try:
            os.remove(self.factory.getMatrixFileName())
        except OSError:
            pass
    def test_performance(self):
        """
        Test performance against DataExtractor.
        """
        # Initialize DB cursor.
        cursor = self.connection.cursor()

        # Initialize FeatureMatrixFactory.
        factoryStart = time.time()
        self.factory = FeatureMatrixFactory()

        # Build SQL query for list of patient episodes.
        patientEpisodeQuery = SQLQuery()
        patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)")
        patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id")
        patientEpisodeQuery.addSelect("proc_code")
        patientEpisodeQuery.addSelect("order_time")
        patientEpisodeQuery.addSelect(
            "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results"
        )
        patientEpisodeQuery.addFrom("stride_order_proc AS sop")
        patientEpisodeQuery.addFrom("stride_order_results AS sor")
        patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id")
        patientEpisodeQuery.addWhereIn("proc_code",
                                       ["Foo", "Bar", "Baz", "Qux"])
        patientEpisodeQuery.addGroupBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        patientEpisodeQuery.addOrderBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params)

        # Set and process patientEpisodeInput.
        self.factory.setPatientEpisodeInput(cursor, "pat_id", "order_time")
        self.factory.processPatientEpisodeInput()

        # Look for lab data 90 days before each episode, but never afterself.
        preTimeDelta = datetime.timedelta(-90)
        postTimeDelta = datetime.timedelta(0)

        # Add clinical item features.
        self.factory.addClinicalItemFeatures(["PerfItem300"])
        self.factory.addClinicalItemFeatures(["PerfItem400"])
        self.factory.addClinicalItemFeatures(["PerfItem500"])

        # Add lab result features.
        self.factory.addLabResultFeatures(["Foo"], False, preTimeDelta,
                                          postTimeDelta)
        self.factory.addLabResultFeatures(["Bar"], False, preTimeDelta,
                                          postTimeDelta)
        self.factory.addLabResultFeatures(["Baz"], False, preTimeDelta,
                                          postTimeDelta)
        self.factory.addLabResultFeatures(["Qux"], False, preTimeDelta,
                                          postTimeDelta)

        # Add flowsheet features.
        self.factory.addFlowsheetFeatures(["Perflow"], preTimeDelta,
                                          postTimeDelta)

        # Build matrix.
        self.factory.buildFeatureMatrix()

        # Stop timer.
        factoryStop = time.time()

        # Initialize DataExtractor.
        extractorStart = time.time()
        extractor = DataExtractor()
        extractor.dataCache = dict()

        # Initialize output file.
        outFile = open("extractor.feature_matrix.tab.gz", "w")
        formatter = TextResultsFormatter(outFile)

        # Build SQL query for list of patient episodes.
        patientEpisodeQuery = SQLQuery()
        patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)")
        patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id")
        patientEpisodeQuery.addSelect("proc_code")
        patientEpisodeQuery.addSelect("order_time")
        patientEpisodeQuery.addSelect(
            "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results"
        )
        patientEpisodeQuery.addFrom("stride_order_proc AS sop")
        patientEpisodeQuery.addFrom("stride_order_results AS sor")
        patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id")
        patientEpisodeQuery.addWhereIn("proc_code",
                                       ["Foo", "Bar", "Baz", "Qux"])
        patientEpisodeQuery.addGroupBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        patientEpisodeQuery.addOrderBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params)

        # Process patient episodes.
        patientEpisodes = list()
        row = cursor.fetchone()

        while row is not None:
            (pat_id, order_proc_id, proc_code, order_time,
             normal_results) = row
            patientEpisode = \
                RowItemModel \
                (
                    {
                        "patient_id": pat_id,
                        "order_proc_id": order_proc_id,
                        "proc_code": proc_code,
                        "order_time": order_time,
                        "result_normal_count": normal_results
                    }
                )
            patientEpisodes.append(patientEpisode)
            row = cursor.fetchone()

        # Initialize patient data.
        lastPatientId = None
        colNames = None
        patientEpisodeByIndexTime = None

        # Look for lab data 90 days before each episode, but never afterself.
        preTimeDelta = datetime.timedelta(-90)
        postTimeDelta = datetime.timedelta(0)

        # Populate patient data.
        tempColNames = \
            ["patient_id", "order_proc_id", "proc_code", "order_time",
                "result_normal_count"]
        for patientEpisode in patientEpisodes:
            patientId = patientEpisode["patient_id"]

            if lastPatientId is not None and lastPatientId != patientId:
                # New patient ID so start querying for patient specific data and
                # populating patient episode data.

                # Clinical Item (PerfItem300)
                eventTimes = extractor.parseClinicalItemData_singlePatient(\
                    modelListFromTable(extractor.queryClinicalItemsByName(\
                        ("PerfItem300",), [patientId])))
                tempColNames.extend(\
                    extractor.addClinicalItemFeatures_singlePatient(\
                    eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
                    daysBins=[]))

                # Clinical Item (PerfItem400)
                eventTimes = extractor.parseClinicalItemData_singlePatient(\
                    modelListFromTable(extractor.queryClinicalItemsByName(\
                        ("PerfItem400",), [patientId])))
                tempColNames.extend(\
                    extractor.addClinicalItemFeatures_singlePatient(\
                    eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
                    daysBins=[]))

                # Clinical Item (PerfItem500)
                eventTimes = extractor.parseClinicalItemData_singlePatient(\
                    modelListFromTable(extractor.queryClinicalItemsByName(\
                        ("PerfItem500",), [patientId])))
                tempColNames.extend(\
                    extractor.addClinicalItemFeatures_singlePatient(\
                    eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
                    daysBins=[]))

                # Lab Result (Foo)
                labResultTable = extractor.queryLabResults(["Foo"],
                                                           [patientId])
                labsByBaseName = extractor.parseLabResultsData_singlePatient(\
                    modelListFromTable(labResultTable))
                tempColNames.extend(extractor.addLabFeatures_singlePatient(\
                    patientEpisodeByIndexTime, labsByBaseName, ["Foo"], \
                    preTimeDelta, postTimeDelta))

                # Lab Result (Bar)
                labResultTable = extractor.queryLabResults(["Bar"],
                                                           [patientId])
                labsByBaseName = extractor.parseLabResultsData_singlePatient(\
                    modelListFromTable(labResultTable))
                tempColNames.extend(extractor.addLabFeatures_singlePatient(\
                    patientEpisodeByIndexTime, labsByBaseName, ["Bar"], \
                    preTimeDelta, postTimeDelta))

                # Lab Result (Baz)
                labResultTable = extractor.queryLabResults(["Baz"],
                                                           [patientId])
                labsByBaseName = extractor.parseLabResultsData_singlePatient(\
                    modelListFromTable(labResultTable))
                tempColNames.extend(extractor.addLabFeatures_singlePatient(\
                    patientEpisodeByIndexTime, labsByBaseName, ["Baz"], \
                    preTimeDelta, postTimeDelta))

                # Lab Result (Qux)
                labResultTable = extractor.queryLabResults(["Qux"],
                                                           [patientId])
                labsByBaseName = extractor.parseLabResultsData_singlePatient(\
                    modelListFromTable(labResultTable))
                tempColNames.extend(extractor.addLabFeatures_singlePatient(\
                    patientEpisodeByIndexTime, labsByBaseName, ["Qux"], \
                    preTimeDelta, postTimeDelta))

                # Flowsheet (Perflow)
                # tempFile = StringIO()
                # labResultTable = extractor.queryFlowsheet(["Perflow"], [patientId], tempFile)
                # flowsheetByNameByPatientId = extractor.parseFlowsheetFile(\
                #     StringIO(tempFile.getvalue()))
                # tempColNames.extend(extractor.addFlowsheetFeatures_singlePatient(\
                #     patientEpisodeByIndexTime, flowsheetByNameByPatientId[patientId], \
                #     ["Perflow"], preTimeDelta, postTimeDelta, tempColNames))

                if colNames is None:
                    # First row, print header row
                    colNames = tempColNames
                    formatter.formatTuple(colNames)

                # Print out patient (episode) data (one row per episode)
                formatter.formatResultDicts(patientEpisodeByIndexTime.values(),
                                            colNames)

            if lastPatientId is None or lastPatientId != patientId:
                # Prepare to aggregate patient episode record per patient
                patientEpisodeByIndexTime = dict()

            patientEpisodeByIndexTime[
                patientEpisode["order_time"]] = patientEpisode
            lastPatientId = patientId
            outFile.flush()

        # Last Iteration
        patientId = lastPatientId
        # Clinical Item (PerfItem300)
        eventTimes = extractor.parseClinicalItemData_singlePatient(\
            modelListFromTable(extractor.queryClinicalItemsByName(\
                ("PerfItem300",), [patientId])))
        tempColNames.extend(\
            extractor.addClinicalItemFeatures_singlePatient(\
            eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
            daysBins=[]))

        # Clinical Item (PerfItem400)
        eventTimes = extractor.parseClinicalItemData_singlePatient(\
            modelListFromTable(extractor.queryClinicalItemsByName(\
                ("PerfItem400",), [patientId])))
        tempColNames.extend(\
            extractor.addClinicalItemFeatures_singlePatient(\
            eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
            daysBins=[]))

        # Clinical Item (PerfItem500)
        eventTimes = extractor.parseClinicalItemData_singlePatient(\
            modelListFromTable(extractor.queryClinicalItemsByName(\
                ("PerfItem500",), [patientId])))
        tempColNames.extend(\
            extractor.addClinicalItemFeatures_singlePatient(\
            eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
            daysBins=[]))

        # Lab Result (Foo)
        labResultTable = extractor.queryLabResults(["Foo"], [patientId])
        labsByBaseName = extractor.parseLabResultsData_singlePatient(\
            modelListFromTable(labResultTable))
        tempColNames.extend(extractor.addLabFeatures_singlePatient(\
            patientEpisodeByIndexTime, labsByBaseName, ["Foo"], \
            preTimeDelta, postTimeDelta))

        # Lab Result (Bar)
        labResultTable = extractor.queryLabResults(["Bar"], [patientId])
        labsByBaseName = extractor.parseLabResultsData_singlePatient(\
            modelListFromTable(labResultTable))
        tempColNames.extend(extractor.addLabFeatures_singlePatient(\
            patientEpisodeByIndexTime, labsByBaseName, ["Bar"], \
            preTimeDelta, postTimeDelta))

        # Lab Result (Baz)
        labResultTable = extractor.queryLabResults(["Baz"], [patientId])
        labsByBaseName = extractor.parseLabResultsData_singlePatient(\
            modelListFromTable(labResultTable))
        tempColNames.extend(extractor.addLabFeatures_singlePatient(\
            patientEpisodeByIndexTime, labsByBaseName, ["Baz"], \
            preTimeDelta, postTimeDelta))

        # Lab Result (Qux)
        labResultTable = extractor.queryLabResults(["Qux"], [patientId])
        labsByBaseName = extractor.parseLabResultsData_singlePatient(\
            modelListFromTable(labResultTable))
        tempColNames.extend(extractor.addLabFeatures_singlePatient(\
            patientEpisodeByIndexTime, labsByBaseName, ["Qux"], \
            preTimeDelta, postTimeDelta))

        formatter.formatResultDicts(patientEpisodeByIndexTime.values(),
                                    colNames)

        # Close file.
        outFile.close()

        # Stop timer.
        extractorStop = time.time()

        # Compare results.
        factoryTime = factoryStop - factoryStart
        extractorTime = extractorStop - extractorStart
        self.assertTrue(extractorTime > factoryTime)

        # Clean up feature matrix files.
        try:
            os.remove("extractor.feature_matrix.tab.gz")
        except OSError:
            pass
        try:
            os.remove(self.factory.getMatrixFileName())
        except OSError:
            pass