Example #1
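Parses a tab-delimited residency schedule file into schedule items and writes them out through TextResultsFormatter, optionally matching resident names against a provider ID CSV.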
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog <inputFile> <outputFile>\n"+\
                    "   <inputFile>     Tab-delimited input file taken from schedule Excel file. Example data format as seen in test case examples. See support/extractExcelSheets.py for help on pulling out Excel sheets into tab-delimited data files.\n"+\
                    "   <outputFile>    File to output results to.  Designate '-' for stdout.";
        parser = OptionParser(usage=usageStr)
        parser.add_option("-i", "--providerIdFilename",  dest="providerIdFilename", help="Name of provider ID CSV file. If provided, then add column for prov_id based on resident first_name and last_name, match within first "+DEFAULT_INDEX_PREFIX_LENGTH+" characters, or generate ID value if no match found");
        parser.add_option("-y", "--baseYear",  dest="baseYear", help="Year expect dates to start in.");
        parser.add_option("-t", "--changeTime",  dest="changeTime", default=CHANGE_TIME, help="Hour of day that count as delimiter between rotations. Likely should NOT be midnight = 0, because night shifts span midnight. Default to 7 = 7am.");
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: "+str.join(" ", argv))
        timer = time.time();

        if len(args) >= 2 and options.baseYear:
            baseYear = int(options.baseYear);

            if options.providerIdFilename is not None:
                providerReader = csv.DictReader(open(options.providerIdFilename));
                self.loadProviderModels( providerReader );

            inFile = stdOpen(args[0]);
            scheduleItems = self.parseScheduleItems(inFile, baseYear);

            outFile = stdOpen(args[1],"w");
            formatter = TextResultsFormatter(outFile);
            formatter.formatResultDicts(scheduleItems);
        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer;
        log.info("%.3f seconds to complete",timer);
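Several of these examples pass None or "-" to stdOpen to route output to stdout. The helper itself is not shown anywhere in this listing, so the following is only a minimal sketch of the assumed contract (the real implementation likely also handles gzip files, given names like "Flowsheet.tab.gz" below); stdOpenSketch is a hypothetical name:

import sys

def stdOpenSketch(filename, mode="r"):
    # Hypothetical stand-in for stdOpen: None or "-" maps to stdout/stdin;
    # anything else defers to the built-in open().
    if filename is None or filename == "-":
        return sys.stdout if "w" in mode else sys.stdin
    return open(filename, mode)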
Example #2
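Runs a TopicModelRecommender validation over a prepared patient item file and writes result stat summaries, with options for category exclusions, topic sizing, and recommendation scoring.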
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <inputFile> [<outputFile>]\n"+\
                    "   <inputFile>    Validation file in prepared result file format use generated LDA models to predict items and compare against verify sets similar to RecommendationClassficationAnalysis. \n"+\
                    "   <outputFile>   Validation result stat summaries.\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option("-M", "--modelFile",  dest="modelFile", help="Name of the file to load an LDA or HDP model and topic word document counts from.");
        parser.add_option("-X", "--excludeCategoryIds",  dest="excludeCategoryIds", help="For recommendation, exclude / skip any items who fall under one of these comma-separated category Ids.");
        parser.add_option("-i", "--itemsPerCluster",  dest="itemsPerCluster", default=DEFAULT_TOPIC_ITEM_COUNT, help="Specify number of top topic items to consider when scoring recommendations.");
        parser.add_option("-m", "--minClusterWeight",  dest="minClusterWeight", default=DEFAULT_MIN_TOPIC_WEIGHT, help="When scoring recommendations, skip any topics with less than this relation weight (effectively scores as zero, but can avoid a lot of low yield calculations).");
        parser.add_option("-s", "--sortField",  dest="sortField", default=DEFAULT_SORT_FIELD, help="Score field to sort top recommendations by.  Default to posterior probabilty 'totelItemWeight', but can also select 'lift' = 'tfidf' = 'interest' for TF*IDF style score weighting.");
        parser.add_option("-r", "--numRecs",   dest="numRecs",  default=DEFAULT_RECOMMENDED_ITEM_COUNT, help="Number of orders / items to recommend for comparison against the verification set. Alternative set option numRecsByOrderSet to look for key order set usage and size.");
        parser.add_option("-O", "--numRecsByOrderSet",   dest="numRecsByOrderSet", action="store_true", help="If set, then look for an order_set_id column to find the key order set that triggered the evaluation time point to determine number of recommendations to consider.");
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: "+str.join(" ", argv))
        timer = time.time();
        if len(args) >= 1:
            query = AnalysisQuery();
            query.preparedPatientItemFile = stdOpen(args[0]);
            query.recommender = TopicModelRecommender(options.modelFile);
            query.baseRecQuery = RecommenderQuery();
            if options.excludeCategoryIds is not None:
                query.baseRecQuery.excludeCategoryIds = set();
                for categoryIdStr in options.excludeCategoryIds.split(","):
                    query.baseRecQuery.excludeCategoryIds.add(int(categoryIdStr));
            else:   # Default exclusions if none specified
                query.baseRecQuery.excludeCategoryIds = query.recommender.defaultExcludedClinicalItemCategoryIds();
                query.baseRecQuery.excludeItemIds = query.recommender.defaultExcludedClinicalItemIds();
            query.baseRecQuery.itemsPerCluster = int(options.itemsPerCluster);
            query.baseRecQuery.minClusterWeight = float(options.minClusterWeight);

            query.baseRecQuery.sortField = options.sortField;
            query.numRecommendations = int(options.numRecs);
            query.numRecsByOrderSet = options.numRecsByOrderSet;

            # Run the actual analysis
            analysisResults = self(query);

            # Format the results for output
            outputFilename = None;
            if len(args) > 1:
                outputFilename = args[1];
            outputFile = stdOpen(outputFilename,"w");

            # Print comment line with analysis arguments to allow for deconstruction later
            summaryData = {"argv": argv};
            print(COMMENT_TAG, json.dumps(summaryData), file=outputFile);

            formatter = TextResultsFormatter( outputFile );
            colNames = self.resultHeaders(query);
            formatter.formatTuple( colNames );  # Insert a mock record to get a header / label row
            formatter.formatResultDicts( analysisResults, colNames );
        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer;
        log.info("%.3f seconds to complete",timer);
Example #3
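Module-level driver that assembles a per-patient feature matrix from patient, lab, and clinical item extract files.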
def main(argv=None):
    timer = time.time()

    # Final columns to output to patient matrix
    colNames = list()

    patientById = parsePatientFile(stdOpen("patients.tab"), colNames)

    labsByBaseNameByPatientId = parseLabResultsFile(stdOpen("labs.tab"))
    addLabFeatures(labsByBaseNameByPatientId, patientById, colNames,
                   INDEX_ITEM_BASE_NAME, LAB_BASE_NAMES, LAB_PRE_TIME,
                   LAB_POST_TIME)

    log.info(
        "Record presence of items in terms of relative time to each item from index time"
    )
    itemTimesByPatientId = parseClinicalItemFile(stdOpen("admitDx.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames,
                            "ICD9.208-AdmitDx")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("problemListDx.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames,
                            "ICD9.208-ProblemListDx")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("feSO4Rx.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames,
                            "ironSO4")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("allEnteralIron.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames,
                            "ironEnteral")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("ironIV.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames,
                            "ironIV")

    itemTimesByPatientId = parseClinicalItemFile(
        stdOpen("outpatientIronRx.tab"),
        patientIdCol="pat_id",
        timeCol="ordering_date")
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames,
                            "ironOutpatient")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("transfusions.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames,
                            "RBCTransfusion")

    patientResults = filterPatients(patientById)

    log.info("Output feature matrix file with row per patient")
    featureMatrixFile = stdOpen("featureMatrix.lab14to1day.tab", "w")
    formatter = TextResultsFormatter(featureMatrixFile)
    formatter.formatResultDicts(patientResults, colNames, addHeaderRow=True)

    timer = time.time() - timer
    print("%.3f seconds to complete" % timer, file=sys.stderr)
Example #4
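Trains a predictive model from a tab-delimited training file, scores a test file for a single outcome item, and emits outcome/score columns suitable for ROC analysis.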
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <trainFile> <testFile> [<outputFile>]\n"+\
                    "   <trainFile> Tab-delimited file, queryItemIdsJSON expected to be parseable into lists of query items as well as an outcome.X column\n"+\
                    "   <testFile> Same structure as trainFile, but with test cases to assess prediction scoring\n"+\
                    "   <outputFile>    Tab-delimited that can be used for ROC analysis with columns for outcome and predicted score\n"+\
                    ""
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-o",
            "--outcomeItemId",
            dest="outcomeItemId",
            help="Outcome item IDs to assess get prediction scores for")

        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()
        if len(args) >= 2:
            trainFile = stdOpen(args[0])
            testFile = stdOpen(args[1])

            outcomeId = int(options.outcomeItemId)

            # Run the actual analysis
            (featureMatrix, outcomeMatrix, queryIds,
             rowModels) = self.fileToMatrixes(trainFile, outcomeId)
            model = self.train(featureMatrix, outcomeMatrix)
            analysisResults = self.predict(testFile, model, queryIds,
                                           outcomeId)

            # Format the results for output
            outputFilename = None
            if len(args) > 2:
                outputFilename = args[2]
            outputFile = stdOpen(outputFilename, "w")

            # Print comment line with arguments to allow for deconstruction later as well as extra results
            print(COMMENT_TAG, json.dumps({"argv": argv}), file=outputFile)

            colNames = self.analysisHeaders(outcomeId)
            analysisResults.insert(0, RowItemModel(colNames, colNames))
            # Insert a mock record to get a header / label row

            formatter = TextResultsFormatter(outputFile)
            formatter.formatResultDicts(analysisResults, colNames)

        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
Example #5
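Selects patients with any ferritin result, flags surgery and dialysis orders per patient, and dumps the records as tab-delimited text.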
def queryPatients(outputFile):
    log.info("Select patients with any result for a ferritin test")
    patientById = dict()
    query = \
        """select distinct pat_id
        from 
          stride_order_results as sor,
          stride_order_proc as sop
        where 
          sor.order_proc_id = sop.order_proc_id and
          base_name = 'ferritin'
        """
    results = DBUtil.execute(query)
    for (patientId, ) in results:
        patientId = int(patientId)
        patientById[patientId] = RowItemModel({"patient_id": patientId})

    log.info("Patients with admit or diet orders for surgery")
    # Not perfectly accurate for isolating surgical patients
    for patient in patientById.values():
        patient["surgery"] = 0
        # Default to 0 / false
    query = \
        """select distinct patient_id
        from patient_item
        where clinical_item_id in (3614,4177,4220)
        """
    results = DBUtil.execute(query)
    for (patientId, ) in results:
        if patientId in patientById:
            patientById[patientId]["surgery"] = 1

    log.info("Patients with an order for dialysis")
    # (Does not differentiate acute vs. chronic.  Includes peritoneal)
    for patient in patientById.values():
        patient["dialysis"] = 0
        # Default to 0 / false
    query = \
        """select distinct patient_id
        from patient_item
        where clinical_item_id in (1815,3783,4322)
        """
    results = DBUtil.execute(query)
    for (patientId, ) in results:
        if patientId in patientById:
            patientById[patientId]["dialysis"] = 1

    # Drop results as tab-delimited text output
    formatter = TextResultsFormatter(outputFile)
    formatter.formatResultDicts(patientById.values(), addHeaderRow=True)

    return patientById
Example #6
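Bins labeled outcome scores into a calibration histogram, computes a Hosmer-Lemeshow statistic, and optionally generates a figure.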
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <inputFile> [<outputFile>]\n"+\
                    "   <inputFile> Tab-delimited file, first two labeled columns expected to represent labeled outcome (0 and non-zero) and score/probability of outcome\n"+\
                    "   <outputFile>    Tab-delimited table specifying score histogram bin widths, total cases, predicted events, actual events\n"+\
                    "                       Leave blank or specify \"-\" to send to stdout.\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option("-b", "--bins",  dest="nBins",  default=10,    help="Number of bins to separate scores into, defaults to deciles (10)");
        parser.add_option("-f", "--figure",  dest="figure",  help="If set, will also try to auto-generate an example figure and store to a file here");

        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: "+str.join(" ", argv))
        timer = time.time();
        if len(args) >= 1:
            inputFilename = args[0];
            inputFile = stdOpen(inputFilename);
            
            # Run the actual analysis
            analysisResults = self(inputFile, int(options.nBins));
            
            (hlStat, degFreedom, hlP) = self.calculateHosmerLemeshow(analysisResults);
            
            # Generate plot figure
            if options.figure is not None:
                self.generateFigure(analysisResults, options.figure);

            # Format the results for output
            outputFilename = None;
            if len(args) > 1:
                outputFilename = args[1];
            outputFile = stdOpen(outputFilename,"w");
            
            # Print comment line with arguments to allow for deconstruction later as well as extra results
            print(COMMENT_TAG, json.dumps({"argv":argv, "P-HosmerLemeshow": hlP}), file=outputFile);

            colNames = self.analysisHeaders();
            analysisResults.insert(0, RowItemModel(colNames,colNames) );    # Insert a mock record to get a header / label row
            
            formatter = TextResultsFormatter( outputFile );
            formatter.formatResultDicts( analysisResults, colNames );

        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer;
        log.info("%.3f seconds to complete",timer);
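calculateHosmerLemeshow is not shown in this listing. Assuming each analysis result row carries per-bin totals of cases, observed events, and predicted events, the standard Hosmer-Lemeshow statistic could be computed roughly as in the sketch below; the function and field names are hypothetical:

from scipy.stats import chi2

def hosmerLemeshowSketch(bins):
    # Hypothetical sketch: H = sum over bins of (O - E)^2 / (E * (1 - E/n)),
    # compared against a chi-squared distribution with (G - 2) degrees of freedom.
    hlStat = 0.0
    for binRow in bins:
        n = binRow["totalCases"]
        observed = binRow["observedEvents"]
        expected = binRow["predictedEvents"]
        denominator = expected * (1.0 - expected / n)
        if denominator > 0:
            hlStat += (observed - expected) ** 2 / denominator
    degFreedom = len(bins) - 2
    hlP = chi2.sf(hlStat, degFreedom)
    return (hlStat, degFreedom, hlP)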
Example #7
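Queries patient admissions for a primary medicine treatment team cohort, then links encounter payor and admitting vital sign data in a second pass.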
def queryPatientEpisodes(outputFile, extractor):
    log.info(
        "Select patient admissions with provider category of Tt Pamf Med (Primary) or Tt Med Univ (Primary)"
    )

    conn = DBUtil.connection()
    cursor = conn.cursor()
    try:
        # # Clinical item category for admission diagnoses
        # # ADMIT_DX_CATEGORY_ID = 2;
        # admitDxCategoryId = DBUtil.execute("select clinical_item_category_id from clinical_item_category where description like '%%ADMIT_DX%%'", conn=conn)[0][0];

        # # Look for items indicating suspected infection / sepsis
        # ivAntibioticItemIds = loadIVAntibioticItemIds(extractor);
        # bloodCultureItemIds = loadBloodCultureItemIds(extractor);
        # respiratoryViralPanelItemIds = loadRespiratoryViralPanelItemIds(extractor);

        # # Merge IV antibiotics and blood cultures, respiratory panels as items that suggest sepsis is suspected
        # suspectSepsisItemIds = ivAntibioticItemIds.union(bloodCultureItemIds.union(respiratoryViralPanelItemIds));
        # suspectSepsisItemIdsStr = str.join(',', [str(itemId) for itemId in suspectSepsisItemIds]);   # Convert to comma-separated string via a str.join function on list comprehension

        # # Look for primary surgery teams to exclude
        # excludeTeamCategory = "SurgerySpecialty";
        # excludeTreatmentTeams = list();
        # for row in extractor.loadMapData("TreatmentTeamGroups"):
        #     if row["team_category"] == excludeTeamCategory:
        #         excludeTreatmentTeams.append(row["treatment_team"]);
        # query = SQLQuery();
        # query.addSelect("clinical_item_id");
        # query.addFrom("clinical_item");
        # query.addWhereIn("description", excludeTreatmentTeams );
        # excludeTeamItemIds = set();
        # for row in DBUtil.execute(query, conn=conn):
        #     excludeTeamItemIds.add(row[0]);
        # excludeTeamItemIdsStr = str.join(',', [str(itemId) for itemId in excludeTeamItemIds]);   # Convert to comma-separated string via a str.join function on list comprehension

        # First pass query to get the list of patients and emergency department presentation times
        cohortQuery = \
        """
        select adt1.pat_anon_id, adt1.pat_enc_csn_anon_id, adt1.shifted_transf_in_dt_tm as edAdmitTime, adt2.shifted_transf_out_dt_tm as dischargeTime
        from stride_adt as adt1, stride_adt as adt2
        where 
            adt1.pat_anon_id in
            (select patient_id from patient_item inner join clinical_item on patient_item.clinical_item_id = clinical_item.clinical_item_id where clinical_item.clinical_item_category_id = 161 AND clinical_item.description = '%s') 
        and adt1.pat_enc_csn_anon_id = adt2.pat_enc_csn_anon_id
        """ % ("Tt Pamf Med (Primary)")

        print(cohortQuery, file=sys.stderr)
        cursor.execute(cohortQuery)

        patientEpisodes = list()
        patientEpisodeById = dict()

        # Collect basic patient ID and
        #   ED presentation dates and Discharge date/time
        prog = ProgressDots()
        row = cursor.fetchone()
        while row is not None:
            (patientId, encounterId, edAdmitTime, dischargeTime) = row
            #patientId = int(patientId);
            patientEpisode = \
                RowItemModel \
                (   {   "patient_id":patientId,
                        "edAdmitTime":edAdmitTime,
                        "dischargeTime":dischargeTime,
                        "encounter_id":encounterId,
                        "payorTitle": None, # Default encounter data to null in case can't find it later
                        "bpSystolic": None,
                        "bpDiastolic": None,
                        "temperature": None,
                        "pulse": None,
                        "respirations": None,
                    }
                )
            patientEpisodes.append(patientEpisode)
            if patientEpisode["encounter_id"] not in patientEpisodeById:
                patientEpisodeById[
                    patientEpisode["encounter_id"]] = patientEpisode

            prog.update()
            row = cursor.fetchone()
        prog.printStatus()

        # Second query phase to link to encounter information (e.g., insurance, admitting vital signs)
        encounterIds = columnFromModelList(patientEpisodes, "encounter_id")
        query = SQLQuery()
        query.addSelect("pat_id")
        query.addSelect("pat_enc_csn_id")
        query.addSelect("title")
        query.addSelect("bp_systolic")
        query.addSelect("bp_diastolic")
        query.addSelect("temperature")
        query.addSelect("pulse")
        query.addSelect("respirations")
        query.addFrom("stride_patient_encounter")
        query.addWhereIn("pat_enc_csn_id", encounterIds)
        cursor.execute(str(query), query.params)
        row = cursor.fetchone()
        while row is not None:
            (patientId, encounterId, payorTitle, bpSystolic, bpDiastolic,
             temperature, pulse, respirations) = row
            if encounterId in patientEpisodeById:
                patientEpisode = patientEpisodeById[encounterId]
                if patientEpisode["payorTitle"] is None:
                    patientEpisode["payorTitle"] = set()
                    # Single encounters may have multiple payors to track
                patientEpisode["payorTitle"].add(payorTitle)
                patientEpisode["bpSystolic"] = bpSystolic
                patientEpisode["bpDiastolic"] = bpDiastolic
                patientEpisode["temperature"] = temperature
                patientEpisode["pulse"] = pulse
                patientEpisode["respirations"] = respirations
            row = cursor.fetchone()

        # Drop results as tab-delimited text output
        formatter = TextResultsFormatter(outputFile)
        formatter.formatResultDicts(patientEpisodes, addHeaderRow=True)

        return patientEpisodes
    finally:
        cursor.close()
        conn.close()
Example #8
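Builds a per-patient-day feature matrix keyed on ICU life support index dates, combining flowsheet, lab, clinical item, demographic, comorbidity, and treatment team features.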
def main(argv=None):
    timer = time.time()

    extractor = DataExtractor()

    # Final columns to output to patient matrix
    colNames = list()

    patientById = extractor.parsePatientFile(stdOpen("patients.tab"), colNames)

    log.info("Expand to index dates based start and end dates")
    patientByIndexTimeById = extractor.generateDateRangeIndexTimes(
        "firstLifeSupportDate", "lastContiguousDate",
        list(patientById.values()), colNames)

    log.info("Populate flowsheet summary statistics")
    flowsheetByNameByPatientId = extractor.parseFlowsheetFile(
        stdOpen("Flowsheet.tab.gz"))
    extractor.addFlowsheetFeatures(patientByIndexTimeById,
                                   flowsheetByNameByPatientId, FLOWSHEET_NAMES,
                                   FLOWSHEET_PRE_TIME_DELTA,
                                   FLOWSHEET_POST_TIME_DELTA, colNames)

    log.info("Populate laboratory result summary statistics")
    labsByBaseNameByPatientId = extractor.parseLabResultsFile(
        stdOpen("LabResults.tab.gz"))
    extractor.addLabFeatures(patientByIndexTimeById, labsByBaseNameByPatientId,
                             LAB_BASE_NAMES, LAB_PRE_TIME_DELTA,
                             LAB_POST_TIME_DELTA, colNames)

    log.info(
        "Record presence of items in terms of relative time to each item from index time"
    )
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("AnyICULifeSupport.tab")),
        patientByIndexTimeById, colNames, "AnyICULifeSupport")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("AnyDNR.tab")),
        patientByIndexTimeById, colNames, "AnyDNR")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("AnyVasoactive.tab")),
        patientByIndexTimeById, colNames, "AnyVasoactive")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("AnyCRRT.tab")),
        patientByIndexTimeById, colNames, "AnyCRRT")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("AnyVentilator.tab")),
        patientByIndexTimeById, colNames, "AnyVentilator")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("ComfortCare.tab")),
        patientByIndexTimeById, colNames, "ComfortCare")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("PalliativeConsult.tab")),
        patientByIndexTimeById, colNames, "PalliativeConsult")

    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("Death.tab")),
        patientByIndexTimeById, colNames, "Death")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("Birth.tab")),
        patientByIndexTimeById, colNames, "Birth")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("Male.tab")),
        patientByIndexTimeById, colNames, "Male")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("Female.tab")),
        patientByIndexTimeById, colNames, "Female")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(
            stdOpen("RaceWhiteNonHispanicLatino.tab")), patientByIndexTimeById,
        colNames, "RaceWhiteNonHispanicLatino")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceAsian.tab")),
        patientByIndexTimeById, colNames, "RaceAsian")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(
            stdOpen("RaceWhiteHispanicLatino.tab")), patientByIndexTimeById,
        colNames, "RaceWhiteHispanicLatino")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceHispanicLatino.tab")),
        patientByIndexTimeById, colNames, "RaceHispanicLatino")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceUnknown.tab")),
        patientByIndexTimeById, colNames, "RaceUnknown")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceOther.tab")),
        patientByIndexTimeById, colNames, "RaceOther")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceBlack.tab")),
        patientByIndexTimeById, colNames, "RaceBlack")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RacePacificIslander.tab")),
        patientByIndexTimeById, colNames, "RacePacificIslander")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceNativeAmerican.tab")),
        patientByIndexTimeById, colNames, "RaceNativeAmerican")

    log.info(
        "Systematically scan for Charlson comorbidities and Treatment Team categories"
    )
    for filename in os.listdir("."):
        if filename.startswith(CHARLSON_PREFIX):
            diseaseName = filename
            if filename.endswith(".tab"):
                diseaseName = filename[:-len(".tab")]
            extractor.addClinicalItemFeatures(
                extractor.parseClinicalItemFile(stdOpen(filename)),
                patientByIndexTimeById, colNames, diseaseName)

        if filename.startswith(TREATMENT_TEAM_PREFIX):
            teamName = filename
            if filename.endswith(".tab"):
                teamName = filename[:-len(".tab")]
            extractor.addClinicalItemFeatures(
                extractor.parseClinicalItemFile(stdOpen(filename)),
                patientByIndexTimeById, colNames, teamName)

    log.info("Output feature matrix file with row per patient day")
    featureMatrixFile = stdOpen("featureMatrix.ICUDNR.tab.gz", "w")
    formatter = TextResultsFormatter(featureMatrixFile)
    for patientId, patientByIndexTime in patientByIndexTimeById.items():
        patientResults = list(patientByIndexTime.values())
        formatter.formatResultDicts(patientResults,
                                    colNames,
                                    addHeaderRow=True)

    timer = time.time() - timer
    print("%.3f seconds to complete" % timer, file=sys.stderr)
Example #9
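Runs an OrderSetRecommender validation over a prepared patient item file, mirroring the LDA-based driver in Example #2.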
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <inputFile> [<outputFile>]\n"+\
                    "   <inputFile>    Validation file in prepared result file format.  Predict items and compare against verify sets similar to RecommendationClassficationAnalysis. \n"+\
                    "   <outputFile>   Validation result stat summaries.\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-r",
            "--numRecs",
            dest="numRecs",
            default=DEFAULT_RECOMMENDED_ITEM_COUNT,
            help=
            "Number of orders / items to recommend for comparison against the verification set, sorted in prevalence order.  If skip or set <1, then will use all order set items found."
        )
        parser.add_option(
            "-O",
            "--numRecsByOrderSet",
            dest="numRecsByOrderSet",
            action="store_true",
            help=
            "If set, then look for an order_set_id column to find the key order set that triggered the evaluation time point to determine number of recommendations to consider."
        )
        parser.add_option(
            "-s",
            "--sortField",
            dest="sortField",
            default=DEFAULT_SORT_FIELD,
            help=
            "Allow overriding of default sort field when returning ranked results (patient_count, name, description, etc.)"
        )
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()
        if len(args) >= 1:
            query = AnalysisQuery()
            query.preparedPatientItemFile = stdOpen(args[0])
            query.recommender = OrderSetRecommender()
            query.baseRecQuery = RecommenderQuery()
            # Default exclusions if none specified
            query.baseRecQuery.excludeCategoryIds = query.recommender.defaultExcludedClinicalItemCategoryIds()
            query.baseRecQuery.excludeItemIds = query.recommender.defaultExcludedClinicalItemIds()
            query.baseRecQuery.sortField = options.sortField
            query.numRecommendations = int(options.numRecs)
            query.numRecsByOrderSet = options.numRecsByOrderSet

            # Run the actual analysis
            analysisResults = self(query)

            # Format the results for output
            outputFilename = None
            if len(args) > 1:
                outputFilename = args[1]
            outputFile = stdOpen(outputFilename, "w")

            # Print comment line with analysis arguments to allow for deconstruction later
            summaryData = {
                "argv": argv
            }
            print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)

            formatter = TextResultsFormatter(outputFile)
            colNames = self.resultHeaders(query)
            formatter.formatTuple(colNames)
            # Insert a mock record to get a header / label row
            formatter.formatResultDicts(analysisResults, colNames)
        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
Example #10
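General recommender analysis driver that builds an AnalysisQuery either from database query parameters or from a prepared patient item file.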
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <recommenderName> <patientIds> [<outputFile>]\n"+\
                    "   <patientIds/dataFile>    Name of file with patient ids.  If not found, then interpret as comma-separated list of test Patient IDs to prepare analysis data for.  Alternatively, provide preparedPatientItemFile generated from PreparePatientItems as input.\n"+\
                    "   <outputFile>    If query yields a result set, then that will be output\n"+\
                    "                       to the named file.  Leave blank or specify \"-\" to send to stdout.\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-c",
            "--baseCategoryId",
            dest="baseCategoryId",
            help=
            "ID of clinical item category to look for initial items from (probably the ADMIT Dx item)."
        )
        parser.add_option(
            "-Q",
            "--queryTimeSpan",
            dest="queryTimeSpan",
            help=
            "Time frame specified in seconds over which to look for initial query items (e.g., 24hrs = 86400) after the base item found from the category above.  Start the time counting from the first item time occuring after the category item above since the ADMIT Dx items are often keyed to dates only without times (defaulting to midnight of the date specified)."
        )
        parser.add_option(
            "-o",
            "--outcomeItemIds",
            dest="outcomeItemIds",
            help=
            "Comma separated list of outcome item IDs to get prediction / recommendation scores for, as well as to label whether they actually appeared for the given patients.  Can specify virtual items representing the end of item triples (e.g., 5-Readmission being the end of any item followed by 3591-Discharge then 3671-Admit), by adding the component items in expected sequence.  For example, '5=3591:3671'"
        )
        parser.add_option(
            "-t",
            "--timeDeltaMax",
            dest="timeDeltaMax",
            help=
            "Time delta in seconds maximum by which recommendations should be based on.  Defaults to recommending items that occur at ANY time after the key orders.  If provided, will apply limits to only orders placed within 0 seconds, 1 hour (3600), 1 day (86400), or 1 week (604800) of the key orders / items.  If set, will also only count presence of labeled target items if occurs within the given time delta of the first query item."
        )

        parser.add_option(
            "-P",
            "--preparedPatientItemFile",
            dest="preparedPatientItemFile",
            action="store_true",
            help=
            "If set, will expect primary argument to instead be name of file to read input data from, instead of using above parameters to query from database."
        )

        parser.add_option(
            "-R",
            "--recommender",
            dest="recommender",
            help=
            "Name of the recommender to run the analysis against.  Options: %s"
            % list(RECOMMENDER_CLASS_BY_NAME.keys()))
        parser.add_option(
            "-S",
            "--scoreField",
            dest="scoreField",
            help=
            "Name of (derived) field to score items by.  For example, 'conditionalFreq.'"
        )
        parser.add_option(
            "-p",
            "--countPrefix",
            dest="countPrefix",
            help=
            "Which counting method to use for item associations.  Defaults to counting item occurrences, allowing for duplicates.  Additional options include: %s."
            % list(COUNT_PREFIX_OPTIONS))
        parser.add_option(
            "-a",
            "--aggregationMethod",
            dest="aggregationMethod",
            help=
            "Aggregation method to use for recommendations based off multiple query items.  Options: %s."
            % list(AGGREGATOR_OPTIONS))
        parser.add_option(
            "-s",
            "--skipIfOutcomeInQuery",
            dest="skipIfOutcomeInQuery",
            action="store_true",
            help=
            "If set, will skip patients where the outcome item occurs during the query period since that would defy the point of predicting the outcome."
        )
        parser.add_option(
            "-m",
            "--maxRecommendedId",
            dest="maxRecommendedId",
            help=
            "Specify a maximum ID value to accept for recommended items.  More used to limit output in test cases"
        )
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()
        if len(args) > 0:
            # Parse out the query parameters
            query = AnalysisQuery()
            query.recommender = RECOMMENDER_CLASS_BY_NAME[
                options.recommender]()
            query.recommender.dataManager.dataCache = dict()
            # Use local cache to speed up repeat queries

            query.baseRecQuery = RecommenderQuery()
            if options.preparedPatientItemFile:
                # Don't reconstruct validation data through database, just read off validation file
                query.preparedPatientItemFile = stdOpen(args[0])
            else:

                patientIdsParam = args[0]
                try:
                    # Try to open patient IDs as a file
                    patientIdFile = stdOpen(patientIdsParam)
                    query.patientIds = set(patientIdFile.read().split())
                except IOError:
                    # Unable to open as a filename, then interpret as simple comma-separated list
                    query.patientIds = set(patientIdsParam.split(","))

                query.baseCategoryId = int(options.baseCategoryId)
                # Category to look for clinical item to start accruing query items from
                query.queryTimeSpan = timedelta(0, int(options.queryTimeSpan))

                query.baseRecQuery.targetItemIds = set()

                outcomeIdStrList = options.outcomeItemIds.split(",")
                for outcomeIdStr in outcomeIdStrList:
                    outcomeIdComponents = outcomeIdStr.split("=")
                    outcomeId = int(outcomeIdComponents[0])
                    query.baseRecQuery.targetItemIds.add(outcomeId)
                    if len(outcomeIdComponents) > 1:
                        sequenceIds = [
                            int(seqIdStr)
                            for seqIdStr in outcomeIdComponents[1].split(":")
                        ]
                        query.sequenceItemIdsByVirtualItemId[
                            outcomeId] = tuple(sequenceIds)

            if options.timeDeltaMax is not None:
                query.baseRecQuery.timeDeltaMax = timedelta(
                    0, int(options.timeDeltaMax))
            if options.scoreField is not None:
                query.baseRecQuery.sortField = options.scoreField
            if options.countPrefix is not None:
                query.baseRecQuery.countPrefix = options.countPrefix
            if options.aggregationMethod is not None:
                query.baseRecQuery.aggregationMethod = options.aggregationMethod
            if options.maxRecommendedId is not None:
                query.baseRecQuery.maxRecommendedId = int(
                    options.maxRecommendedId)

            if options.skipIfOutcomeInQuery is not None:
                query.skipIfOutcomeInQuery = options.skipIfOutcomeInQuery

            # Run the actual analysis
            analysisResults = self(query)

            # Format the results for output
            outputFilename = None
            if len(args) > 1:
                outputFilename = args[1]
            outputFile = stdOpen(outputFilename, "w")

            # Print comment line with analysis arguments to allow for deconstruction later
            print(COMMENT_TAG, json.dumps({"argv": argv}), file=outputFile)

            colNames = self.analysisHeaders(query)
            analysisResults.insert(0, RowItemModel(colNames, colNames))
            # Insert a mock record to get a header / label row

            formatter = TextResultsFormatter(outputFile)
            formatter.formatResultDicts(analysisResults, colNames)

        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
Example #11
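Follows the contiguous date trail from each patient's first ICU life support order to approximate a hospitalization span, then links encounter payor and vital sign data.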
def queryPatients(outputFile):
    log.info("Select patients with any ICU life support orders and follow contiguous date trail for apparent hospitalization (long query >20 min)...");
    
    conn = DBUtil.connection();
    cursor = conn.cursor();
    try:
        anyLifeSupportItemId = DBUtil.execute("select clinical_item_id from clinical_item where name = 'AnyICULifeSupport'", conn=conn)[0][0];

        patientById = dict();
        query = \
            """select pi.patient_id, date_trunc('day',pi.item_date), min(pi.encounter_id), count(pi.patient_item_id)
            from patient_item as pi,
            (
                select pi2.patient_id, min(pi2.item_date) as firstLifeSupportDate
                    from patient_item as pi2
                    where pi2.clinical_item_id = %s
                    group by pi2.patient_id
            ) as piX
            where pi.patient_id = piX.patient_id
            and pi.item_date >= piX.firstLifeSupportDate
            group by pi.patient_id, date_trunc('day',pi.item_date)
            order by pi.patient_id, date_trunc('day',pi.item_date)
            """ % anyLifeSupportItemId;
        cursor.execute(query);

        row = cursor.fetchone();
        while row is not None:
            (patientId, itemDate, encounterId, itemCount) = row;
            patientId = int(patientId);
            if patientId not in patientById:
                patientById[patientId] = \
                    RowItemModel \
                    (   {   "patient_id":patientId, 
                            "firstLifeSupportDate":itemDate, 
                            "lastContiguousDate":itemDate, 
                            "encounter_id":encounterId, # Assumes single value that won't be overwritten
                            "payorTitle": None, # Default encounter data to null in case can't find it later
                            "bpSystolic": None,
                            "bpDiastolic": None,
                            "temperature": None,
                            "pulse": None,
                            "respirations": None,
                        }
                    );
            if (itemDate - patientById[patientId]["lastContiguousDate"]) <= CONTIGUOUS_THRESHOLD:
                patientById[patientId]["lastContiguousDate"] = itemDate;
            if patientById[patientId]["encounter_id"] is None:
                patientById[patientId]["encounter_id"] = encounterId;
            row = cursor.fetchone();

        # Second query phase to link to encounter information (e.g., insurance, admitting vital signs)
        encounterIds = columnFromModelList(iter(patientById.values()), "encounter_id");
        query = SQLQuery();
        query.addSelect("pat_id");
        query.addSelect("pat_enc_csn_id");
        query.addSelect("title");
        query.addSelect("bp_systolic");
        query.addSelect("bp_diastolic");
        query.addSelect("temperature");
        query.addSelect("pulse");
        query.addSelect("respirations");
        query.addFrom("stride_patient_encounter");
        query.addWhereIn("pat_enc_csn_id", encounterIds);
        cursor.execute(str(query), query.params);
        row = cursor.fetchone();
        while row is not None:
            (patientId, encounterId, payorTitle, bpSystolic, bpDiastolic, temperature, pulse, respirations) = row;
            if patientById[patientId]["payorTitle"] is None:
                patientById[patientId]["payorTitle"] = set();   # Single encounters may have multiple payors to track
            patientById[patientId]["payorTitle"].add(payorTitle);
            patientById[patientId]["bpSystolic"] = bpSystolic;
            patientById[patientId]["bpDiastolic"] = bpDiastolic;
            patientById[patientId]["temperature"] = temperature;
            patientById[patientId]["pulse"] = pulse;
            patientById[patientId]["respirations"] = respirations;
            row = cursor.fetchone();
        
        if patientById[patientId]["payorTitle"] is not None:    # Condense to single string
            payorList = list(patientById[patientId]["payorTitle"]);
            payorList.sort();
            patientById[patientId]["payorTitle"] = str.join(",", payorList);
        
        # Drop results as tab-delimited text output
        formatter = TextResultsFormatter(outputFile);
        formatter.formatResultDicts(iter(patientById.values()), addHeaderRow=True);

        return patientById;    
    finally:
        cursor.close();
        conn.close();
Example #12
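Selects emergency department admissions with suspected-sepsis orders (IV antibiotics, blood cultures, respiratory viral panels), excluding primary surgery team patients, then links encounter data.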
def queryPatientEpisodes(outputFile, extractor):
    log.info("Select patient admissions with possible/probable sepsis within 24 hours of admission (long query >60 min?)...");

    conn = DBUtil.connection();
    cursor = conn.cursor();
    try:
        # Clinical item category for admission diagnoses
        # ADMIT_DX_CATEGORY_ID = 2;
        admitDxCategoryId = DBUtil.execute("select clinical_item_category_id from clinical_item_category where description like '%%ADMIT_DX%%'", conn=conn)[0][0];

        # Look for items indicating suspected infection / sepsis
        ivAntibioticItemIds = loadIVAntibioticItemIds(extractor);
        bloodCultureItemIds = loadBloodCultureItemIds(extractor);
        respiratoryViralPanelItemIds = loadRespiratoryViralPanelItemIds(extractor);

        # Merge IV antibiotics and blood cultures, respiratory panels as items that suggest sepsis is suspected
        suspectSepsisItemIds = ivAntibioticItemIds.union(bloodCultureItemIds.union(respiratoryViralPanelItemIds));
        suspectSepsisItemIdsStr = str.join(',', [str(itemId) for itemId in suspectSepsisItemIds]);   # Convert to comma-separated string via a str.join function on list comprehension

        # Look for primary surgery teams to exclude
        excludeTeamCategory = "SurgerySpecialty";
        excludeTreatmentTeams = list();
        for row in extractor.loadMapData("TreatmentTeamGroups"):
            if row["team_category"] == excludeTeamCategory:
                excludeTreatmentTeams.append(row["treatment_team"]);
        query = SQLQuery();
        query.addSelect("clinical_item_id");
        query.addFrom("clinical_item");
        query.addWhereIn("description", excludeTreatmentTeams );
        excludeTeamItemIds = set();
        for row in DBUtil.execute(query, conn=conn):
            excludeTeamItemIds.add(row[0]);
        excludeTeamItemIdsStr = str.join(',', [str(itemId) for itemId in excludeTeamItemIds]);   # Convert to comma-separated string via a str.join function on list comprehension

        # First pass query to get the list of patients and emergency department presentation times
        cohortQuery = \
        """
        --  Pick out date(s) when admitted through emergency department and matching discharge time
        select adt1.pat_anon_id, adt1.pat_enc_csn_anon_id, adt1.shifted_transf_in_dt_tm as edAdmitTime, adt2.shifted_transf_out_dt_tm as dischargeTime
        from stride_adt as adt1, stride_adt as adt2
        where 
            -- Admission event
            adt1.department_in = 'EMERGENCY DEPARTMENT' and
            adt1.event_in = 'Admission' and
            adt1.pat_anon_id in
            (    -- Select any patient with any suspected sepsis related order (i.e., IV antibiotics or blood cultures)
                select patient_id
                from patient_item as pi
                where pi.clinical_item_id in (%s)
                except
                -- Exclude any patient who has been on a primary surgery team
                select patient_id
                from patient_item
                where clinical_item_id in (%s)
                -- -12434586418575,-12432455207729,-12428492282572,-12428492282572,-12424048595257,-12414081679705
            ) and
            
            adt1.pat_enc_csn_anon_id = adt2.pat_enc_csn_anon_id and
            
            -- Discharge event
            adt2.event_out = 'Discharge'
            
        order by adt1.shifted_transf_in_dt_tm
        """ % (suspectSepsisItemIdsStr, excludeTeamItemIdsStr);
        print(cohortQuery, file=sys.stderr);
        cursor.execute(cohortQuery);

        patientEpisodes = list();
        patientEpisodeById = dict();

        # Collect basic patient ID and
        #   ED presentation dates and Discharge date/time
        prog = ProgressDots();
        row = cursor.fetchone();
        while row is not None:
            (patientId, encounterId, edAdmitTime, dischargeTime) = row;
            #patientId = int(patientId);
            patientEpisode = \
                RowItemModel \
                (   {   "patient_id":patientId, 
                        "edAdmitTime":edAdmitTime, 
                        "dischargeTime":dischargeTime, 
                        "encounter_id":encounterId,
                        "payorTitle": None, # Default encounter data to null in case can't find it later
                        "bpSystolic": None,
                        "bpDiastolic": None,
                        "temperature": None,
                        "pulse": None,
                        "respirations": None,
                    }
                );
            patientEpisodes.append(patientEpisode);
            if patientEpisode["encounter_id"] not in patientEpisodeById:
                patientEpisodeById[patientEpisode["encounter_id"]] = patientEpisode;

            prog.update();
            row = cursor.fetchone();
        prog.printStatus();

        # Second query phase to link to encounter information (e.g., insurance, admitting vital signs)
        encounterIds = columnFromModelList(patientEpisodes, "encounter_id");
        query = SQLQuery();
        query.addSelect("pat_id");
        query.addSelect("pat_enc_csn_id");
        query.addSelect("title");
        query.addSelect("bp_systolic");
        query.addSelect("bp_diastolic");
        query.addSelect("temperature");
        query.addSelect("pulse");
        query.addSelect("respirations");
        query.addFrom("stride_patient_encounter");
        query.addWhereIn("pat_enc_csn_id", encounterIds);
        cursor.execute(str(query), query.params);
        row = cursor.fetchone();
        while row is not None:
            (patientId, encounterId, payorTitle, bpSystolic, bpDiastolic, temperature, pulse, respirations) = row;
            if encounterId in patientEpisodeById:
                patientEpisode = patientEpisodeById[encounterId];
                if patientEpisode["payorTitle"] is None:
                    patientEpisode["payorTitle"] = set();   # Single encounters may have multiple payors to track
                patientEpisode["payorTitle"].add(payorTitle);
                patientEpisode["bpSystolic"] = bpSystolic;
                patientEpisode["bpDiastolic"] = bpDiastolic;
                patientEpisode["temperature"] = temperature;
                patientEpisode["pulse"] = pulse;
                patientEpisode["respirations"] = respirations;
            row = cursor.fetchone();
        
        # Drop results as tab-delimited text output
        formatter = TextResultsFormatter(outputFile);
        formatter.formatResultDicts(patientEpisodes, addHeaderRow=True);

        return patientEpisodes;
    finally:
        cursor.close();
        conn.close();
Example #13
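Performance test comparing FeatureMatrixFactory against an equivalent hand-rolled DataExtractor pipeline on the same patient episode query.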
    def test_performance(self):
        """
        Test performance against DataExtractor.
        """
        # Initialize DB cursor.
        cursor = self.connection.cursor()

        # Initialize FeatureMatrixFactory.
        factoryStart = time.time()
        self.factory = FeatureMatrixFactory()

        # Build SQL query for list of patient episodes.
        patientEpisodeQuery = SQLQuery()
        patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)")
        patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id")
        patientEpisodeQuery.addSelect("proc_code")
        patientEpisodeQuery.addSelect("order_time")
        patientEpisodeQuery.addSelect(
            "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results"
        )
        patientEpisodeQuery.addFrom("stride_order_proc AS sop")
        patientEpisodeQuery.addFrom("stride_order_results AS sor")
        patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id")
        patientEpisodeQuery.addWhereIn("proc_code",
                                       ["Foo", "Bar", "Baz", "Qux"])
        patientEpisodeQuery.addGroupBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        patientEpisodeQuery.addOrderBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params)

        # Set and process patientEpisodeInput.
        self.factory.setPatientEpisodeInput(cursor, "pat_id", "order_time")
        self.factory.processPatientEpisodeInput()

        # Look for lab data 90 days before each episode, but never after.
        preTimeDelta = datetime.timedelta(-90)
        postTimeDelta = datetime.timedelta(0)

        # Add clinical item features.
        self.factory.addClinicalItemFeatures(["PerfItem300"])
        self.factory.addClinicalItemFeatures(["PerfItem400"])
        self.factory.addClinicalItemFeatures(["PerfItem500"])

        # Add lab result features.
        self.factory.addLabResultFeatures(["Foo"], False, preTimeDelta,
                                          postTimeDelta)
        self.factory.addLabResultFeatures(["Bar"], False, preTimeDelta,
                                          postTimeDelta)
        self.factory.addLabResultFeatures(["Baz"], False, preTimeDelta,
                                          postTimeDelta)
        self.factory.addLabResultFeatures(["Qux"], False, preTimeDelta,
                                          postTimeDelta)

        # Add flowsheet features.
        self.factory.addFlowsheetFeatures(["Perflow"], preTimeDelta,
                                          postTimeDelta)

        # Build matrix.
        self.factory.buildFeatureMatrix()

        # Stop timer.
        factoryStop = time.time()

        # Initialize DataExtractor.
        extractorStart = time.time()
        extractor = DataExtractor()
        extractor.dataCache = dict()

        # Initialize output file.
        outFile = open("extractor.feature_matrix.tab.gz", "w")
        formatter = TextResultsFormatter(outFile)

        # Build SQL query for list of patient episodes.
        patientEpisodeQuery = SQLQuery()
        patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)")
        patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id")
        patientEpisodeQuery.addSelect("proc_code")
        patientEpisodeQuery.addSelect("order_time")
        patientEpisodeQuery.addSelect(
            "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results"
        )
        patientEpisodeQuery.addFrom("stride_order_proc AS sop")
        patientEpisodeQuery.addFrom("stride_order_results AS sor")
        patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id")
        patientEpisodeQuery.addWhereIn("proc_code",
                                       ["Foo", "Bar", "Baz", "Qux"])
        patientEpisodeQuery.addGroupBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        patientEpisodeQuery.addOrderBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params)

        # Process patient episodes.
        patientEpisodes = list()
        row = cursor.fetchone()

        while row is not None:
            (pat_id, order_proc_id, proc_code, order_time,
             normal_results) = row
            patientEpisode = \
                RowItemModel \
                (
                    {
                        "patient_id": pat_id,
                        "order_proc_id": order_proc_id,
                        "proc_code": proc_code,
                        "order_time": order_time,
                        "result_normal_count": normal_results
                    }
                )
            patientEpisodes.append(patientEpisode)
            row = cursor.fetchone()

        # Initialize patient data.
        lastPatientId = None
        colNames = None
        patientEpisodeByIndexTime = None

        # Look for lab data 90 days before each episode, but never after.
        preTimeDelta = datetime.timedelta(-90)
        postTimeDelta = datetime.timedelta(0)

        # Populate patient data.
        tempColNames = \
            ["patient_id", "order_proc_id", "proc_code", "order_time",
                "result_normal_count"]
        for patientEpisode in patientEpisodes:
            patientId = patientEpisode["patient_id"]

            if lastPatientId is not None and lastPatientId != patientId:
                # New patient ID so start querying for patient specific data and
                # populating patient episode data.

                # Clinical Item (PerfItem300)
                eventTimes = extractor.parseClinicalItemData_singlePatient(\
                    modelListFromTable(extractor.queryClinicalItemsByName(\
                        ("PerfItem300",), [patientId])))
                tempColNames.extend(\
                    extractor.addClinicalItemFeatures_singlePatient(\
                    eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
                    daysBins=[]))

                # Clinical Item (PerfItem400)
                eventTimes = extractor.parseClinicalItemData_singlePatient(\
                    modelListFromTable(extractor.queryClinicalItemsByName(\
                        ("PerfItem400",), [patientId])))
                tempColNames.extend(\
                    extractor.addClinicalItemFeatures_singlePatient(\
                    eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
                    daysBins=[]))

                # Clinical Item (PerfItem500)
                eventTimes = extractor.parseClinicalItemData_singlePatient(\
                    modelListFromTable(extractor.queryClinicalItemsByName(\
                        ("PerfItem500",), [patientId])))
                tempColNames.extend(\
                    extractor.addClinicalItemFeatures_singlePatient(\
                    eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
                    daysBins=[]))

                # Lab Result (Foo)
                labResultTable = extractor.queryLabResults(["Foo"],
                                                           [patientId])
                labsByBaseName = extractor.parseLabResultsData_singlePatient(\
                    modelListFromTable(labResultTable))
                tempColNames.extend(extractor.addLabFeatures_singlePatient(\
                    patientEpisodeByIndexTime, labsByBaseName, ["Foo"], \
                    preTimeDelta, postTimeDelta))

                # Lab Result (Bar)
                labResultTable = extractor.queryLabResults(["Bar"],
                                                           [patientId])
                labsByBaseName = extractor.parseLabResultsData_singlePatient(\
                    modelListFromTable(labResultTable))
                tempColNames.extend(extractor.addLabFeatures_singlePatient(\
                    patientEpisodeByIndexTime, labsByBaseName, ["Bar"], \
                    preTimeDelta, postTimeDelta))

                # Lab Result (Baz)
                labResultTable = extractor.queryLabResults(["Baz"],
                                                           [patientId])
                labsByBaseName = extractor.parseLabResultsData_singlePatient(\
                    modelListFromTable(labResultTable))
                tempColNames.extend(extractor.addLabFeatures_singlePatient(\
                    patientEpisodeByIndexTime, labsByBaseName, ["Baz"], \
                    preTimeDelta, postTimeDelta))

                # Lab Result (Qux)
                labResultTable = extractor.queryLabResults(["Qux"],
                                                           [patientId])
                labsByBaseName = extractor.parseLabResultsData_singlePatient(\
                    modelListFromTable(labResultTable))
                tempColNames.extend(extractor.addLabFeatures_singlePatient(\
                    patientEpisodeByIndexTime, labsByBaseName, ["Qux"], \
                    preTimeDelta, postTimeDelta))

                # Flowsheet (Perflow)
                # tempFile = StringIO()
                # labResultTable = extractor.queryFlowsheet(["Perflow"], [patientId], tempFile)
                # flowsheetByNameByPatientId = extractor.parseFlowsheetFile(\
                #     StringIO(tempFile.getvalue()))
                # tempColNames.extend(extractor.addFlowsheetFeatures_singlePatient(\
                #     patientEpisodeByIndexTime, flowsheetByNameByPatientId[patientId], \
                #     ["Perflow"], preTimeDelta, postTimeDelta, tempColNames))

                if colNames is None:
                    # First row, print header row
                    colNames = tempColNames
                    formatter.formatTuple(colNames)

                # Print out patient (episode) data (one row per episode)
                formatter.formatResultDicts(patientEpisodeByIndexTime.values(),
                                            colNames)

            if lastPatientId is None or lastPatientId != patientId:
                # Prepare to aggregate patient episode record per patient
                patientEpisodeByIndexTime = dict()

            patientEpisodeByIndexTime[
                patientEpisode["order_time"]] = patientEpisode
            lastPatientId = patientId
            outFile.flush()

        # Last Iteration
        patientId = lastPatientId
        # Clinical Item (PerfItem300)
        eventTimes = extractor.parseClinicalItemData_singlePatient(\
            modelListFromTable(extractor.queryClinicalItemsByName(\
                ("PerfItem300",), [patientId])))
        tempColNames.extend(\
            extractor.addClinicalItemFeatures_singlePatient(\
            eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
            daysBins=[]))

        # Clinical Item (PerfItem400)
        eventTimes = extractor.parseClinicalItemData_singlePatient(\
            modelListFromTable(extractor.queryClinicalItemsByName(\
                ("PerfItem400",), [patientId])))
        tempColNames.extend(\
            extractor.addClinicalItemFeatures_singlePatient(\
            eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
            daysBins=[]))

        # Clinical Item (PerfItem500)
        eventTimes = extractor.parseClinicalItemData_singlePatient(\
            modelListFromTable(extractor.queryClinicalItemsByName(\
                ("PerfItem500",), [patientId])))
        tempColNames.extend(\
            extractor.addClinicalItemFeatures_singlePatient(\
            eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
            daysBins=[]))

        # Lab Result (Foo)
        labResultTable = extractor.queryLabResults(["Foo"], [patientId])
        labsByBaseName = extractor.parseLabResultsData_singlePatient(\
            modelListFromTable(labResultTable))
        tempColNames.extend(extractor.addLabFeatures_singlePatient(\
            patientEpisodeByIndexTime, labsByBaseName, ["Foo"], \
            preTimeDelta, postTimeDelta))

        # Lab Result (Bar)
        labResultTable = extractor.queryLabResults(["Bar"], [patientId])
        labsByBaseName = extractor.parseLabResultsData_singlePatient(\
            modelListFromTable(labResultTable))
        tempColNames.extend(extractor.addLabFeatures_singlePatient(\
            patientEpisodeByIndexTime, labsByBaseName, ["Bar"], \
            preTimeDelta, postTimeDelta))

        # Lab Result (Baz)
        labResultTable = extractor.queryLabResults(["Baz"], [patientId])
        labsByBaseName = extractor.parseLabResultsData_singlePatient(\
            modelListFromTable(labResultTable))
        tempColNames.extend(extractor.addLabFeatures_singlePatient(\
            patientEpisodeByIndexTime, labsByBaseName, ["Baz"], \
            preTimeDelta, postTimeDelta))

        # Lab Result (Qux)
        labResultTable = extractor.queryLabResults(["Qux"], [patientId])
        labsByBaseName = extractor.parseLabResultsData_singlePatient(\
            modelListFromTable(labResultTable))
        tempColNames.extend(extractor.addLabFeatures_singlePatient(\
            patientEpisodeByIndexTime, labsByBaseName, ["Qux"], \
            preTimeDelta, postTimeDelta))

        formatter.formatResultDicts(patientEpisodeByIndexTime.values(),
                                    colNames)

        # Close file.
        outFile.close()

        # Stop timer.
        extractorStop = time.time()

        # Compare timing results. Expect the one-shot extractor loop to be
        # slower than the factory pipeline.
        factoryTime = factoryStop - factoryStart
        extractorTime = extractorStop - extractorStart
        self.assertTrue(extractorTime > factoryTime)

        # Clean up feature matrix files.
        try:
            os.remove("extractor.feature_matrix.tab.gz")
        except OSError:
            pass
        try:
            os.remove(self.factory.getMatrixFileName())
        except OSError:
            pass
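The per-patient extraction above is duplicated verbatim between the loop body and the post-loop flush for the last patient. A minimal sketch of how both call sites could share one helper, assuming the same DataExtractor methods used above (the helper name is hypothetical, not part of the original test):

def extractPatientFeatures(extractor, patientId, patientEpisodeByIndexTime,
                           tempColNames, preTimeDelta, postTimeDelta):
    """Add clinical item and lab features for one patient's episodes."""
    # Clinical item features; the label passed must match the queried item.
    for itemName in ("PerfItem300", "PerfItem400", "PerfItem500"):
        eventTimes = extractor.parseClinicalItemData_singlePatient(
            modelListFromTable(extractor.queryClinicalItemsByName(
                (itemName,), [patientId])))
        tempColNames.extend(extractor.addClinicalItemFeatures_singlePatient(
            eventTimes, patientEpisodeByIndexTime, itemName, daysBins=[]))

    # Lab result features over the same pre/post time window.
    for labName in ("Foo", "Bar", "Baz", "Qux"):
        labsByBaseName = extractor.parseLabResultsData_singlePatient(
            modelListFromTable(
                extractor.queryLabResults([labName], [patientId])))
        tempColNames.extend(extractor.addLabFeatures_singlePatient(
            patientEpisodeByIndexTime, labsByBaseName, [labName],
            preTimeDelta, postTimeDelta))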
Ejemplo n.º 14
0
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <inputFile> [<outputFile>]\n"+\
                    "   <inputFile>    Validation file in prepared result file format.  Predict items and compare against verify sets similar to RecommendationClassficationAnalysis. \n"+\
                    "   <outputFile>   Validation result stat summaries.\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-X",
            "--excludeCategoryIds",
            dest="excludeCategoryIds",
            help=
            "For recommendation, exclude / skip any items who fall under one of these comma-separated category Ids."
        )
        parser.add_option(
            "-s",
            "--sortField",
            dest="sortField",
            default=DEFAULT_SORT_FIELD,
            help=
            "Score field to sort top recommendations by.  Default to posterior probabilty / positive predictive value 'P(B|A)', but can also select 'lift' = 'tfidf' = 'interest' for TF*IDF style score weighting."
        )
        parser.add_option(
            "-r",
            "--numRecs",
            dest="numRecs",
            default=DEFAULT_RECOMMENDED_ITEM_COUNT,
            help=
            "Number of orders / items to recommend for comparison against the verification set."
        )
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()
        if len(args) >= 1:
            query = AnalysisQuery()
            query.preparedPatientItemFile = stdOpen(args[0])
            query.recommender = OrderSetRecommender()
            query.baseRecQuery = RecommenderQuery()
            if options.excludeCategoryIds is not None:
                query.baseRecQuery.excludeCategoryIds = set()
                for categoryIdStr in options.excludeCategoryIds.split(","):
                    query.baseRecQuery.excludeCategoryIds.add(
                        int(categoryIdStr))
            else:  # Default exclusions if none specified
                query.baseRecQuery.excludeCategoryIds = query.recommender.defaultExcludedClinicalItemCategoryIds(
                )
                query.baseRecQuery.excludeItemIds = query.recommender.defaultExcludedClinicalItemIds(
                )

            query.baseRecQuery.sortField = options.sortField
            query.numRecommendations = int(options.numRecs)

            # Run the actual analysis
            analysisResults = self(query)

            # Format the results for output
            outputFilename = None
            if len(args) > 1:
                outputFilename = args[1]
            outputFile = stdOpen(outputFilename, "w")

            # Print comment line with analysis arguments to allow for deconstruction later
            summaryData = {
                "argv": argv
            }
            print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)

            formatter = TextResultsFormatter(outputFile)
            colNames = self.resultHeaders(query)
            formatter.formatTuple(colNames)  # Header / label row
            formatter.formatResultDicts(analysisResults, colNames)
        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
Ejemplo n.º 15
0
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <scoreFile> <outcomeFile> [<outputFile>]\n"+\
                    "   <scoreFile>     Tab-delimited file containing item IDs as well as column(s) for some score that will be used to relate to outcomes\n"+\
                    "   <outcomeFile>   Tab-delimited file containing item IDs as well as value column to assess for outcome labeling\n"+\
                    "   <outputFile>    Tab-delimited file matching score file with extra outcome column\n"+\
                    "                       Leave blank or specify \"-\" to send to stdout.\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option("-c",
                          "--linkCol",
                          dest="linkCol",
                          help="Name of column to link score and outcome file")
        parser.add_option("-o",
                          "--outcomeLabel",
                          dest="outcomeLabel",
                          help="Label to set for new outcome column")
        parser.add_option("-v",
                          "--valueMin",
                          dest="valueMin",
                          help="Minimum value to treat as a positive outcome")
        parser.add_option("-V",
                          "--valueMax",
                          dest="valueMax",
                          help="Maximum value to treat as a positive outcome")
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()
        if len(args) > 1:
            scoreFile = stdOpen(args[0])
            outcomeFile = stdOpen(args[1])

            valueMin = float(options.valueMin)
            valueMax = float(options.valueMax)

            # Run the actual analysis
            outputDicts = self(scoreFile,
                               outcomeFile,
                               options.linkCol,
                               options.outcomeLabel,
                               valueMin,
                               valueMax,
                               generateHeader=True)

            # Format the results for output
            outputFilename = None
            if len(args) > 2:
                outputFilename = args[2]
            outputFile = stdOpen(outputFilename, "w")

            # Print comment line with arguments to allow for deconstruction later as well as extra results
            summaryData = {
                "argv": argv
            }
            print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)

            formatter = TextResultsFormatter(outputFile)
            formatter.formatResultDicts(outputDicts)
        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
Ejemplo n.º 16
0
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <inputFile> [<outputFile>]\n"+\
                    "   <inputFile> Tab-delimited file with columns representing score(s) and labeled outcome(s)\n"+\
                    "   <outputFile> Tab-delimited file with column for number of top items considered up to maxItems and then a column for each axes of interest specified.\n"+\
                    "                       Leave blank or specify \"-\" to send to stdout.\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-f",
            "--figure",
            dest="figure",
            help="If set, generate an example figure to the named file")
        parser.add_option("-t",
                          "--title",
                          dest="title",
                          help="Title caption to apply to generated figure")
        parser.add_option(
            "-r",
            "--rcParams",
            dest="rcParams",
            help=
            "JSON dictionary format string specifying any MatPlotLib RC Params to use when generating figure.  For example: \"{\\\"axes.titlesize\\\":16,\\\"axes.labelsize\\\":16,\\\"legend.fontsize\\\"':16,\\\"figure.figsize\\\":[4,3],\\\"annotation.size\\\":14}\".  For more info, see http://matplotlib.org/users/customizing.html "
        )
        parser.add_option(
            "-l",
            "--labelIndexes",
            dest="labelIndexes",
            help=
            "Comma-separated list of indexes at which to add data-label points to the generated figure"
        )
        parser.add_option(
            "-c",
            "--cycleLineStyle",
            dest="cycleLineStyle",
            action="store_true",
            help=
            "If set, will reuse colors, but vary line-styles for multiple plots.  Default is cycle through colors only."
        )
        parser.add_option(
            "-m",
            "--maxItems",
            dest="maxItems",
            help=
            "If set, maximum number of top items to consider in the accuracy plots"
        )
        parser.add_option("-o",
                          "--colOutcome",
                          dest="colOutcome",
                          help="Name of column to look for outcome values.")
        parser.add_option(
            "-x",
            "--axes",
            dest="axes",
            help=
            "Comma-separated list of colon-separated metrics to plot.  For example, recall:score1,precision:score2 will plot recall vs. top items scored by score 1 and precision vs. top items scored by score2."
        )

        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()
        if len(args) > 0:
            summaryData = {
                "argv": argv
            }

            inputFilename = args[0]
            inputFile = stdOpen(inputFilename)

            maxItems = None
            if options.maxItems is not None:
                maxItems = int(options.maxItems)

            # Parse out the metrics to plot and score columns to sort by
            metricsByScoreCol = dict()
            for metric2scoreStr in options.axes.split(VALUE_DELIM):
                (metric, scoreCol) = metric2scoreStr.split(AXIS_DELIM)
                if scoreCol not in metricsByScoreCol:
                    metricsByScoreCol[scoreCol] = set()
                metricsByScoreCol[scoreCol].add(metric)

            # Run the actual analysis
            resultDicts = self(inputFile, options.colOutcome,
                               metricsByScoreCol, maxItems)

            # Generate plot figure
            if options.figure is not None:
                self.generateFigure(resultDicts, summaryData, options)

            # Format the results for output
            outputFilename = None
            if len(args) > 1:
                outputFilename = args[1]
            outputFile = stdOpen(outputFilename, "w")

            # Print comment line with arguments to allow for deconstruction later as well as extra results
            print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)
            # Insert a header row
            resultDicts.insert(
                0,
                RowItemModel(list(resultDicts[0].keys()),
                             list(resultDicts[0].keys())))

            formatter = TextResultsFormatter(outputFile)
            formatter.formatResultDicts(resultDicts)
        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
Ejemplo n.º 17
0
def main(argv=None):
    timer = time.time()

    extractor = DataExtractor()

    # Output file
    featureMatrixFile = stdOpen("featureMatrix.SepsisICU.encounters.tab.gz",
                                "w")

    # Final columns to output to patient matrix
    colNames = list()

    patientEpisodes = extractor.parsePatientEpisodeFile(
        stdOpen("patientEpisodes.tab"), colNames)
    #patientIds = set(columnFromModelList(patientEpisodes, "patient_id"));

    log.info("Expand to index dates based start and end dates")
    # But only want one entry per patient
    patientByIndexTimeById = extractor.generateDateRangeIndexTimes(
        "edAdmitTime",
        "dischargeTime",
        patientEpisodes,
        colNames,
        timeInterval=None)

    log.info("Populate flowsheet summary statistics")
    flowsheetByNameByPatientId = extractor.parseFlowsheetFile(
        stdOpen("Flowsheet.tab.gz"))
    extractor.addFlowsheetFeatures(patientByIndexTimeById,
                                   flowsheetByNameByPatientId, FLOWSHEET_NAMES,
                                   FLOWSHEET_PRE_TIME_DELTA,
                                   FLOWSHEET_POST_TIME_DELTA, colNames)

    log.info("Populate laboratory result summary statistics")
    labsByBaseNameByPatientId = extractor.parseLabResultsFile(
        stdOpen("LabResults.tab.gz"))
    extractor.addLabFeatures(patientByIndexTimeById, labsByBaseNameByPatientId,
                             LAB_BASE_NAMES, LAB_PRE_TIME_DELTA,
                             LAB_POST_TIME_DELTA, colNames)

    log.info("Populate IV Fluid accumulation")
    ivFluidsByPatientId = extractor.parseIVFluidFile(
        stdOpen("IsotonicIVFluids.tab.gz"))
    extractor.addIVFluidFeatures(patientByIndexTimeById, ivFluidsByPatientId,
                                 IVF_THRESHOLD_VOLUMES, IVF_CHECKPOINT_TIMES,
                                 colNames)

    log.info(
        "Record presence of items in terms of relative time to each item from index time"
    )
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("IVAntibiotic.tab")),
        patientByIndexTimeById, colNames, "IVAntibiotic")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("BloodCulture.tab")),
        patientByIndexTimeById, colNames, "BloodCulture")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RespViralPanel.tab")),
        patientByIndexTimeById, colNames, "RespViralPanel")

    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("AnyICULifeSupport.tab")),
        patientByIndexTimeById, colNames, "AnyICULifeSupport")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("AnyDNR.tab")),
        patientByIndexTimeById, colNames, "AnyDNR")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("AnyVasoactive.tab")),
        patientByIndexTimeById, colNames, "AnyVasoactive")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("AnyCRRT.tab")),
        patientByIndexTimeById, colNames, "AnyCRRT")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("AnyVentilator.tab")),
        patientByIndexTimeById, colNames, "AnyVentilator")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("ComfortCare.tab")),
        patientByIndexTimeById, colNames, "ComfortCare")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("PalliativeConsult.tab")),
        patientByIndexTimeById, colNames, "PalliativeConsult")

    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("Death.tab")),
        patientByIndexTimeById, colNames, "Death")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("Birth.tab")),
        patientByIndexTimeById, colNames, "Birth")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("Male.tab")),
        patientByIndexTimeById, colNames, "Male")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("Female.tab")),
        patientByIndexTimeById, colNames, "Female")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(
            stdOpen("RaceWhiteNonHispanicLatino.tab")), patientByIndexTimeById,
        colNames, "RaceWhiteNonHispanicLatino")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceAsian.tab")),
        patientByIndexTimeById, colNames, "RaceAsian")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(
            stdOpen("RaceWhiteHispanicLatino.tab")), patientByIndexTimeById,
        colNames, "RaceWhiteHispanicLatino")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceHispanicLatino.tab")),
        patientByIndexTimeById, colNames, "RaceHispanicLatino")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceUnknown.tab")),
        patientByIndexTimeById, colNames, "RaceUnknown")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceOther.tab")),
        patientByIndexTimeById, colNames, "RaceOther")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceBlack.tab")),
        patientByIndexTimeById, colNames, "RaceBlack")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RacePacificIslander.tab")),
        patientByIndexTimeById, colNames, "RacePacificIslander")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceNativeAmerican.tab")),
        patientByIndexTimeById, colNames, "RaceNativeAmerican")

    log.info(
        "Systematically scan for Charlson comorbidities and Treatment Team categories"
    )
    for filename in os.listdir("."):
        if filename.startswith(CHARLSON_PREFIX):
            diseaseName = filename
            if filename.endswith(".tab"):
                diseaseName = filename[:-len(".tab")]
            extractor.addClinicalItemFeatures(
                extractor.parseClinicalItemFile(stdOpen(filename)),
                patientByIndexTimeById, colNames, diseaseName)

        if filename.startswith(TREATMENT_TEAM_PREFIX):
            teamName = filename
            if filename.endswith(".tab"):
                teamName = filename[:-len(".tab")]
            extractor.addClinicalItemFeatures(
                extractor.parseClinicalItemFile(stdOpen(filename)),
                patientByIndexTimeById, colNames, teamName)

    log.info("Output feature matrix file with row per patient day")
    formatter = TextResultsFormatter(featureMatrixFile)
    formatter.formatTuple(colNames)
    for patientId, patientByIndexTime in patientByIndexTimeById.items():
        patientResults = patientByIndexTime.values()
        formatter.formatResultDicts(patientResults, colNames)

    timer = time.time() - timer
    print("%.3f seconds to complete" % timer, file=sys.stderr)
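The long run of addClinicalItemFeatures calls above repeats one pattern per item file. A compact, data-driven alternative under the same assumptions (a sketch, not the original script):

CLINICAL_ITEM_FILES = [
    "IVAntibiotic", "BloodCulture", "RespViralPanel",
    "AnyICULifeSupport", "AnyDNR", "AnyVasoactive", "AnyCRRT",
    "AnyVentilator", "ComfortCare", "PalliativeConsult",
    "Death", "Birth", "Male", "Female",
    "RaceWhiteNonHispanicLatino", "RaceAsian", "RaceWhiteHispanicLatino",
    "RaceHispanicLatino", "RaceUnknown", "RaceOther", "RaceBlack",
    "RacePacificIslander", "RaceNativeAmerican",
]

for itemName in CLINICAL_ITEM_FILES:
    # One pass per item file; the feature label matches the file base name.
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen(itemName + ".tab")),
        patientByIndexTimeById, colNames, itemName)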
Ejemplo n.º 18
0
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <patientIds/dataFile> [<outputFile>]\n"+\
                    "   <patientIds/dataFile>    Name of file with patient ids.  If not found, then interpret as comma-separated list of test Patient IDs to prepare analysis data for.  Alternatively, provide preparedPatientItemFile generated from PreparePatientItems as input.\n"+\
                    "   <outputFile>    If query yields a result set, then that will be output\n"+\
                    "                       to the named file.  Leave blank or specify \"-\" to send to stdout.\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-q",
            "--numQuery",
            dest="numQuery",
            help=
            "Number of orders / items from each patient to use as query items to prime the recommendations.  If set to a float number in (0,1), then treat as a percentage of the patient's total orders / items"
        )
        parser.add_option(
            "-v",
            "--numVerify",
            dest="numVerify",
            help=
            "Number of orders / items from each patient after the query items to use to validate recommendations.  If set to a float number in (0,1), then treat as a percentage of the patient's total orders / items.  If left unset, then just use all remaining orders / items for that patient"
        )
        parser.add_option(
            "-c",
            "--baseCategoryId",
            dest="baseCategoryId",
            help=
            "Instead of specifying first nQ query items, specify ID of clinical item category to look for initial items from (probably the ADMIT Dx item)."
        )
        parser.add_option(
            "-b",
            "--baseItemId",
            dest="baseItemId",
            help=
            "Instead of specifying first nQ query items, specify ID of the specific clinical item to look for initial items from."
        )
        parser.add_option(
            "-S",
            "--startDate",
            dest="startDate",
            help="Only look for test data occuring on or after this start date."
        )
        parser.add_option(
            "-E",
            "--endDate",
            dest="endDate",
            help="Only look for test data occuring before this end date.")
        parser.add_option(
            "-Q",
            "--queryTimeSpan",
            dest="queryTimeSpan",
            help=
            "Time frame specified in seconds over which to look for initial query items (e.g., 24hrs = 86400) after the base item found from the category above.  Start the time counting from the first item time occuring after the category item above since the ADMIT Dx items are often keyed to dates only without times (defaulting to midnight of the date specified)."
        )
        parser.add_option(
            "-V",
            "--verifyTimeSpan",
            dest="verifyTimeSpan",
            help=
            "Time frame specified in seconds over which to look for verify items after initial query item time.  Will ignore the query items that occur within the queryTimeSpan."
        )

        parser.add_option(
            "-P",
            "--preparedPatientItemFile",
            dest="preparedPatientItemFile",
            action="store_true",
            help=
            "If set, will expect primary argument to instead be name of file to read input data from, instead of using above parameters to query from database."
        )

        parser.add_option(
            "-R",
            "--recommender",
            dest="recommender",
            help=
            "Name of the recommender to run the analysis against.  Options: %s"
            % list(RECOMMENDER_CLASS_BY_NAME.keys()))
        parser.add_option(
            "-r",
            "--numRecs",
            dest="numRecs",
            help=
            "Number of orders / items to recommend for comparison against the verification set. Alternative set option numRecsByOrderSet to look for key order set usage and size."
        )
        parser.add_option(
            "-O",
            "--numRecsByOrderSet",
            dest="numRecsByOrderSet",
            action="store_true",
            help=
            "If set, then look for an order_set_id column to find the key order set that triggered the evaluation time point to determine number of recommendations to consider."
        )
        parser.add_option(
            "-s",
            "--sortField",
            dest="sortField",
            help=
            "Allow overriding of default sort field when returning ranked results"
        )
        parser.add_option(
            "-f",
            "--fieldFilters",
            dest="fieldFilters",
            help=
            "Filters to exclude results.  Comma-separated separated list of field-op:value exclusions where op is either < or > like, conditionalFreq<:0.1,frqeRatio<:1"
        )
        parser.add_option(
            "-t",
            "--timeDeltaMax",
            dest="timeDeltaMax",
            help=
            "If set, represents a time delta in seconds maximum by which recommendations should be based on.  Defaults to recommending items that occur at ANY time after the key orders.  If provided, will apply limits to only orders placed within 0 seconds, 1 hour (3600), 1 day (86400), or 1 week (604800) of the key orders / items."
        )
        parser.add_option(
            "-a",
            "--aggregationMethod",
            dest="aggregationMethod",
            help=
            "Aggregation method to use for recommendations based off multiple query items.  Options: %s."
            % list(AGGREGATOR_OPTIONS))
        parser.add_option(
            "-p",
            "--countPrefix",
            dest="countPrefix",
            help=
            "Prefix for how to do counts.  Blank for default item counting allowing repeats, otherwise ignore repeats for patient_ or encounter_"
        )
        parser.add_option(
            "-m",
            "--maxRecommendedId",
            dest="maxRecommendedId",
            help=
            "Specify a maximum ID value to accept for recommended items.  More used to limit output in test cases"
        )

        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()
        if len(args) >= 1:
            # Parse out the query parameters
            query = AnalysisQuery()
            query.recommender = RECOMMENDER_CLASS_BY_NAME[
                options.recommender]()
            query.recommender.dataManager.dataCache = dict()
            # Use a dataCache to facilitate repeat queries

            if options.preparedPatientItemFile:
                # Don't reconstruct validation data through database, just read off validation file
                query.preparedPatientItemFile = stdOpen(args[0])
            else:
                patientIdsParam = args[0]
                try:
                    # Try to open patient IDs as a file
                    patientIdFile = stdOpen(patientIdsParam)
                    query.patientIds = set(patientIdFile.read().split())
                except IOError:
                    # Unable to open as a filename, then interpret as simple comma-separated list
                    query.patientIds = set(patientIdsParam.split(","))

                if options.numQuery is not None:
                    query.numQueryItems = int(options.numQuery)
                    query.numVerifyItems = int(options.numVerify)
                else:
                    # Alternative to specify query time span starting from a key category
                    query.queryTimeSpan = timedelta(0,
                                                    int(options.queryTimeSpan))
                    query.verifyTimeSpan = timedelta(
                        0, int(options.verifyTimeSpan))

                if options.baseCategoryId is not None or options.baseItemId is not None:
                    if options.baseCategoryId is not None:
                        query.baseCategoryId = int(options.baseCategoryId)
                        # Category to look for clinical item to start accruing query items from
                    if options.baseItemId is not None:
                        query.baseItemId = int(options.baseItemId)

                if options.startDate is not None:
                    query.startDate = DBUtil.parseDateValue(options.startDate)
                if options.endDate is not None:
                    query.endDate = DBUtil.parseDateValue(options.endDate)

            query.baseRecQuery = RecommenderQuery()
            query.baseRecQuery.excludeCategoryIds = query.recommender.defaultExcludedClinicalItemCategoryIds(
            )
            query.baseRecQuery.excludeItemIds = query.recommender.defaultExcludedClinicalItemIds(
            )
            if options.timeDeltaMax is not None and len(
                    options.timeDeltaMax) > 0:
                query.baseRecQuery.timeDeltaMax = timedelta(
                    0, int(options.timeDeltaMax))
            if options.aggregationMethod is not None:
                query.baseRecQuery.aggregationMethod = options.aggregationMethod
            if options.countPrefix is not None:
                query.baseRecQuery.countPrefix = options.countPrefix
            if options.maxRecommendedId is not None:
                query.baseRecQuery.maxRecommendedId = int(
                    options.maxRecommendedId)
            if options.sortField is not None:
                query.baseRecQuery.sortField = options.sortField
            if options.fieldFilters is not None:
                for fieldFilterStr in options.fieldFilters.split(","):
                    (fieldOp, valueStr) = fieldFilterStr.split(":")
                    query.baseRecQuery.fieldFilters[fieldOp] = float(valueStr)

            if options.numRecs is not None:
                query.numRecommendations = int(options.numRecs)
            else:
                # No recommendation count specified, then just use the same as the verify number
                query.numRecommendations = query.numVerifyItems
            query.numRecsByOrderSet = options.numRecsByOrderSet

            # Run the actual analysis
            analysisResults = self(query)

            # Format the results for output
            outputFilename = None
            if len(args) > 1:
                outputFilename = args[1]
            outputFile = stdOpen(outputFilename, "w")

            # Print comment line with analysis arguments to allow for deconstruction later
            summaryData = {
                "argv": argv
            }
            print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)

            formatter = TextResultsFormatter(outputFile)
            colNames = self.resultHeaders(query)
            formatter.formatTuple(colNames)  # Header / label row
            formatter.formatResultDicts(analysisResults, colNames)

        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)