Example 1
    def queryItems(self, options, outputFile):
        """Query for all clinical item records that fulfill the options criteria
        and then send the results as tab-delimited output to the outputFile.
        """
        pauseSeconds = float(options.pauseSeconds)

        query = SQLQuery()
        query.addSelect(
            "cic.description, ci.clinical_item_id, ci.name, ci.description")
        query.addFrom("clinical_item_category as cic")
        query.addFrom("clinical_item as ci")
        query.addWhere(
            "cic.clinical_item_category_id = ci.clinical_item_category_id")
        if options.itemPrefix:
            query.addWhereOp("ci.description", "like",
                             options.itemPrefix + "%%")
            # Add wildcard to enable prefix search
        if options.categoryNames:
            query.addWhereIn("cic.description",
                             options.categoryNames.split(","))
        query.addOrderBy(
            "cic.description, ci.name, ci.description, ci.clinical_item_id")

        formatter = TextResultsFormatter(outputFile)

        prog = ProgressDots()
        for row in DBUtil.execute(query,
                                  includeColumnNames=True,
                                  connFactory=self.connFactory):
            formatter.formatTuple(row)
            time.sleep(pauseSeconds)
            prog.update()
        prog.printStatus()
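A minimal sketch of how queryItems might be driven, assuming only what the method reads above; QueryApp is a hypothetical name for the class hosting this method, and the option values are illustrative:

import sys
from argparse import Namespace

# Hedged usage sketch: any object exposing the three attributes read above works.
options = Namespace(pauseSeconds="0", itemPrefix="CBC", categoryNames="Lab,Med")
app = QueryApp()  # hypothetical host class for queryItems
app.queryItems(options, sys.stdout)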
Example 2
def main_formatMergedTTests(argv):
    ifs = stdOpen(BASE_RESULT_DIR+CONCATENATE_FILENAME);
    ofs = stdOpen(BASE_RESULT_DIR+FILTERED_FILENAME, "w");

    summaryData = {"argv": argv};
    print(COMMENT_TAG, json.dumps(summaryData), file=ofs);

    outputCols = ["SortType","TopicCount","VerifyTime","Group1.precision.mean","Group1.recall.mean","Group1.normalprecision.mean","Group1.weightrecall.mean","Group1.roc_auc.mean","ttest_rel.precision","ttest_rel.recall","ttest_rel.weightrecall","ttest_rel.roc_auc","Group1.numqueryitems.mean","Group1.numverifyitems.mean","Group1.numrecommendeditems.mean","Group1.tp.mean"];
    formatter = TextResultsFormatter(ofs);
    formatter.formatTuple(outputCols);  # Output header row

    reader = TabDictReader(ifs);
    for row in reader:
        row["SortType"] = row["Group1._s"];

        # Extract out numerical data from filename text parameters
        row["TopicCount"] = None;
        if row["Group1._m"] != 'None':
            # Expecting model name strings of the form: "models/topicModel.first24hourItems.2013.1234567890.filter.bow.gz.64Topic.model"
            topicChunk = row["Group1._m"].split(".")[-2];   # Expect second to last period-delimited chunk to contain topic count
            topicChunk = topicChunk[:topicChunk.find("Topic")]; # Remove trailing Topic text
            row["TopicCount"] = int(topicChunk);

        # Expecting result file name argument of the form: "results/byOrderSets/01minutes/filteredResults.tab.gz"
        timeChunk = row["args[0]"].split("/")[-2];
        timeChunk = timeChunk[:timeChunk.find("minutes")];
        row["VerifyTime"] = int(timeChunk);

        formatter.formatResultDict(row, outputCols);

    ifs.close();
    ofs.close();
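The TopicCount extraction above hinges on the second-to-last period-delimited chunk of the model filename; a standalone check of that parsing, using the filename form quoted in the comment:

# Standalone check of the topic-count parsing used above.
modelName = "models/topicModel.first24hourItems.2013.1234567890.filter.bow.gz.64Topic.model"
topicChunk = modelName.split(".")[-2]               # "64Topic"
topicChunk = topicChunk[:topicChunk.find("Topic")]  # "64"
assert int(topicChunk) == 64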
Example 3
    def queryItems(self, options, outputFile):
        """Query for all clinical item records that fulfill the options criteria
        and then send the results as tab-delimited output to the outputFile.
        """
        pauseSeconds = float(options.pauseSeconds)

        query = SQLQuery()
        query.addSelect("count(order_med_id_coded) as nOrders")
        query.addSelect("om.med_route, om.medication_id, om.med_description")
        query.addFrom("starr_datalake2018.order_med as om")
        if options.descriptionPrefix:
            query.addWhereOp("om.med_description", "like",
                             options.descriptionPrefix + "%%")
            # Add wildcard to enable prefix search
        if options.medRoutes:
            query.addWhereIn("om.med_route", options.medRoutes.split(","))
        query.addGroupBy("om.medication_id, om.med_description, om.med_route")
        query.addOrderBy("nOrders desc, om.med_description")

        formatter = TextResultsFormatter(outputFile)

        prog = ProgressDots()
        for row in DBUtil.execute(query,
                                  includeColumnNames=True,
                                  connFactory=self.connFactory):
            formatter.formatTuple(row)
            time.sleep(pauseSeconds)
            prog.update()
        prog.printStatus()
Example 4
    def test_numRecsByOrderSet(self):
        # Designate number of recommendations indirectly via linked order set id 

        DBUtil.execute("update clinical_item set default_recommend = 0 where clinical_item_id = -8");   # Disable default recommend on one item to shift results

        colNames = ["patient_id", "TP", "FN", "FP",  "recall", "precision", "F1-score", "weightRecall","weightPrecision", "ROC-AUC"];
        expectedResults = [ RowItemModel([-11111, 2, 0, 3, 1.0, 0.4, 0.571,  1.0, 0.3178, 0.4167], colNames ) ];

        # Do through fabricated prepared file intermediary
        sys.stdout = StringIO();    
        argv = ["PreparePatientItems.py","-q","2","-v","3",'0,-11111',"-"];
        self.preparer.main(argv);
        preparedDataFile = StringIO(sys.stdout.getvalue());
        
        # Artificially add a key order set ID for the fabricated data
        modFile = StringIO();
        formatter = TextResultsFormatter(modFile);
        dataCols = None;
        for i, dataRow in enumerate(TabDictReader(preparedDataFile)):
            dataRow["order_set_id"] = TEST_ORDERSET_ID;
            if i <= 0:
                dataCols = list(dataRow.keys());
                formatter.formatTuple(dataCols);    # Insert a mock record to get a header / label row
            formatter.formatResultDict(dataRow, dataCols);
        preparedDataFile = StringIO(modFile.getvalue());

        sys.stdin = preparedDataFile;   # Read prepared data file from redirected stdin
        sys.stdout = StringIO();
        #argv = ["RecommendationClassificationAnalysis.py","-P","-r","5","-m","0","-R","ItemAssociationRecommender",'-',"-"];
        argv = ["RecommendationClassificationAnalysis.py","-P","--numRecsByOrderSet","-m","0","-R","ItemAssociationRecommender",'-',"-"];
        self.analyzer.main(argv);
        textOutput = StringIO(sys.stdout.getvalue());
        self.assertEqualStatResultsTextOutput(expectedResults, textOutput, colNames);
Example 5
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <inputFile> [<outputFile>]\n"+\
                    "   <inputFile>    Validation file in prepared result file format use generated LDA models to predict items and compare against verify sets similar to RecommendationClassficationAnalysis. \n"+\
                    "   <outputFile>   Validation result stat summaries.\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option("-M", "--modelFile",  dest="modelFile", help="Name of the file to load an LDA or HDP model and topic word document counts from.");
        parser.add_option("-X", "--excludeCategoryIds",  dest="excludeCategoryIds", help="For recommendation, exclude / skip any items who fall under one of these comma-separated category Ids.");
        parser.add_option("-i", "--itemsPerCluster",  dest="itemsPerCluster", default=DEFAULT_TOPIC_ITEM_COUNT, help="Specify number of top topic items to consider when scoring recommendations.");
        parser.add_option("-m", "--minClusterWeight",  dest="minClusterWeight", default=DEFAULT_MIN_TOPIC_WEIGHT, help="When scoring recommendations, skip any topics with less than this relation weight (effectively scores as zero, but can avoid a lot of low yield calculations).");
        parser.add_option("-s", "--sortField",  dest="sortField", default=DEFAULT_SORT_FIELD, help="Score field to sort top recommendations by.  Default to posterior probabilty 'totelItemWeight', but can also select 'lift' = 'tfidf' = 'interest' for TF*IDF style score weighting.");
        parser.add_option("-r", "--numRecs",   dest="numRecs",  default=DEFAULT_RECOMMENDED_ITEM_COUNT, help="Number of orders / items to recommend for comparison against the verification set. Alternative set option numRecsByOrderSet to look for key order set usage and size.");
        parser.add_option("-O", "--numRecsByOrderSet",   dest="numRecsByOrderSet", action="store_true", help="If set, then look for an order_set_id column to find the key order set that triggered the evaluation time point to determine number of recommendations to consider.");
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: "+str.join(" ", argv))
        timer = time.time();
        if len(args) >= 1:
            query = AnalysisQuery();
            query.preparedPatientItemFile = stdOpen(args[0]);
            query.recommender = TopicModelRecommender(options.modelFile);
            query.baseRecQuery = RecommenderQuery();
            if options.excludeCategoryIds is not None:
                query.baseRecQuery.excludeCategoryIds = set();
                for categoryIdStr in options.excludeCategoryIds.split(","):
                    query.baseRecQuery.excludeCategoryIds.add(int(categoryIdStr));
            else:   # Default exclusions if none specified
                query.baseRecQuery.excludeCategoryIds = query.recommender.defaultExcludedClinicalItemCategoryIds();
                query.baseRecQuery.excludeItemIds = query.recommender.defaultExcludedClinicalItemIds();
            query.baseRecQuery.itemsPerCluster = int(options.itemsPerCluster);
            query.baseRecQuery.minClusterWeight = float(options.minClusterWeight);

            query.baseRecQuery.sortField = options.sortField;
            query.numRecommendations = int(options.numRecs);
            query.numRecsByOrderSet = options.numRecsByOrderSet;

            # Run the actual analysis
            analysisResults = self(query);

            # Format the results for output
            outputFilename = None;
            if len(args) > 1:
                outputFilename = args[1];
            outputFile = stdOpen(outputFilename,"w");

            # Print comment line with analysis arguments to allow for deconstruction later
            summaryData = {"argv": argv};
            print(COMMENT_TAG, json.dumps(summaryData), file=outputFile);

            formatter = TextResultsFormatter( outputFile );
            colNames = self.resultHeaders(query);
            formatter.formatTuple( colNames );  # Insert a mock record to get a header / label row
            formatter.formatResultDicts( analysisResults, colNames );
        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer;
        log.info("%.3f seconds to complete",timer);
Example 6
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <inputFile1> <inputFile2> ... <inputFileN>\n"+\
                    "   <inputFileX>    Tab-delimited file of data.  Initial comment lines will be scanned for list of argv parameters to add as data columns.\n"+\
                    "                   If only a single input is given, interpret this as an index file which lists the names of the other files to concatenate (e.g., obtained with dir * /b or ls).\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-o",
            "--outputFile",
            dest="outputFile",
            help=
            "Tab-delimited file matching concatenated contents of input files.  Specify \"-\" to send to stdout."
        )
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()
        if len(args) > 0:
            inputFiles = list()
            if len(args) > 1:
                for inputFilename in args:
                    inputFiles.append(stdOpen(inputFilename))
            else:  # len(args) == 1, Single index file rather than list of all files on command-line
                indexFile = stdOpen(args[0])
                for line in indexFile:
                    inputFilename = line.strip()
                    inputFiles.append(stdOpen(inputFilename))

            # Format the results for output
            outputFile = stdOpen(options.outputFile, "w")

            # Print comment line with arguments to allow for deconstruction later as well as extra results
            summaryData = {
                "argv": argv
            }
            print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)

            # Tab-delimited output formatting
            formatter = TextResultsFormatter(outputFile)

            # Begin the file parsing so can at least get the total list of column headers
            rowGenerator = self(inputFiles)
            firstRow = next(rowGenerator)

            # Insert a mock record to get a header / label row
            colNames = self.resultHeaders()
            formatter.formatTuple(colNames)

            # Stream the concatenated data rows to the output to avoid storing all in memory
            formatter.formatResultDict(firstRow, colNames)
            for outputDict in rowGenerator:
                formatter.formatResultDict(outputDict, colNames)

        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
Example 7
def main_formatResults(argv):
    ifs = stdOpen(BASE_RESULT_DIR + FILTERED_FILENAME)
    ofs = stdOpen(BASE_RESULT_DIR + FORMATTED_FILENAME, "w")

    summaryData = {
        "argv": argv
    }
    print(COMMENT_TAG, json.dumps(summaryData), file=ofs)

    outputCols = [
        "SortType", "TopicCount", "TrainTime", "VerifyTime", "precision",
        "recall", "normalprecision", "weightrecall", "roc_auc"
    ]
    formatter = TextResultsFormatter(ofs)
    formatter.formatTuple(outputCols)
    # Output header row

    reader = TabDictReader(ifs)
    for row in reader:
        row["SortType"] = row["_s"]

        # Extract out numerical data from filename text parameters
        row["TopicCount"] = None
        row["TrainTime"] = None
        if row["_m"] != 'None':
            # Expecting model name strings of the form: "models/topicModel.firstItems.q14400.v14400.2013.1234567890.filter.bow.gz.16Topic.model"
            chunks = row["_m"].split(".")
            topicChunk = chunks[-2]
            # Expect second to last period-delimited chunk to contain topic count
            topicChunk = topicChunk[:topicChunk.find("Topic")]
            # Remove trailing Topic text
            row["TopicCount"] = int(topicChunk)

            for chunk in chunks:
                if chunk[0] == "q" and chunk[-1].isdigit():
                    # This should be the query time in seconds
                    queryTimeSeconds = int(chunk[1:])
                    queryTimeMinutes = queryTimeSeconds // 60
                    row["TrainTime"] = queryTimeMinutes

        # Expecting training file name argument of the form: "sourceData/first24hourOrderSets.2013.q86400.v14400.-12345.tab.gz"
        row["VerifyTime"] = None
        for chunk in row["args_0_"].split("."):
            if chunk[0] == "v" and chunk[-1].isdigit(
            ):  # This should be the verify time in seconds
                verifyTimeSeconds = int(chunk[1:])
                verifyTimeMinutes = verifyTimeSeconds / 60
                row["VerifyTime"] = verifyTimeMinutes

        formatter.formatResultDict(row, outputCols)

    ifs.close()
    ofs.close()
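Both time extractions above scan period-delimited filename chunks for a leading "q" or "v"; a standalone check using the filename form quoted in the comment:

# Standalone check of the q/v time-chunk parsing used above.
trainingFilename = "sourceData/first24hourOrderSets.2013.q86400.v14400.-12345.tab.gz"
for chunk in trainingFilename.split("."):
    if chunk[0] == "q" and chunk[-1].isdigit():
        assert int(chunk[1:]) // 60 == 1440  # 86400 seconds -> 1440 minutes
    if chunk[0] == "v" and chunk[-1].isdigit():
        assert int(chunk[1:]) // 60 == 240   # 14400 seconds -> 240 minutes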
Example 8
def main(argv=None):
    timer = time.time()

    extractor = DataExtractor()

    # Output file
    featureMatrixFile = stdOpen("featureMatrix.SepsisICU.encounters.tab.gz",
                                "w")

    # Final columns to output to patient matrix
    colNames = list()

    patientEpisodes = extractor.parsePatientEpisodeFile(
        stdOpen("patientEpisodes.tab"), colNames)
    #patientIds = set(columnFromModelList(patientEpisodes, "patient_id"));

    log.info("Expand to index dates based start and end dates")
    # But only want one entry per patient
    patientByIndexTimeById = extractor.generateDateRangeIndexTimes(
        "edAdmitTime",
        "dischargeTime",
        patientEpisodes,
        colNames,
        timeInterval=None)

    log.info("Populate flowsheet summary statistics")
    flowsheetByNameByPatientId = extractor.parseFlowsheetFile(
        stdOpen("Flowsheet.tab.gz"))
    extractor.addFlowsheetFeatures(patientByIndexTimeById,
                                   flowsheetByNameByPatientId, FLOWSHEET_NAMES,
                                   FLOWSHEET_PRE_TIME_DELTA,
                                   FLOWSHEET_POST_TIME_DELTA, colNames)

    log.info("Populate laboratory result summary statistics")
    labsByBaseNameByPatientId = extractor.parseLabResultsFile(
        stdOpen("LabResults.tab.gz"))
    extractor.addLabFeatures(patientByIndexTimeById, labsByBaseNameByPatientId,
                             LAB_BASE_NAMES, LAB_PRE_TIME_DELTA,
                             LAB_POST_TIME_DELTA, colNames)

    log.info("Populate IV Fluid accumulation")
    ivFluidsByPatientId = extractor.parseIVFluidFile(
        stdOpen("IsotonicIVFluids.tab.gz"))
    extractor.addIVFluidFeatures(patientByIndexTimeById, ivFluidsByPatientId,
                                 IVF_THRESHOLD_VOLUMES, IVF_CHECKPOINT_TIMES,
                                 colNames)

    log.info(
        "Record presence of items in terms of relative time to each item from index time"
    )
    clinicalItemNames = \
        ["IVAntibiotic", "BloodCulture", "RespViralPanel",
         "AnyICULifeSupport", "AnyDNR", "AnyVasoactive", "AnyCRRT",
         "AnyVentilator", "ComfortCare", "PalliativeConsult",
         "Death", "Birth", "Male", "Female",
         "RaceWhiteNonHispanicLatino", "RaceAsian", "RaceWhiteHispanicLatino",
         "RaceHispanicLatino", "RaceUnknown", "RaceOther", "RaceBlack",
         "RacePacificIslander", "RaceNativeAmerican"]
    for itemName in clinicalItemNames:
        extractor.addClinicalItemFeatures(
            extractor.parseClinicalItemFile(stdOpen(itemName + ".tab")),
            patientByIndexTimeById, colNames, itemName)

    log.info(
        "Systematically scan for Charlson comorbidities and Treatment Team categories"
    )
    for filename in os.listdir("."):
        if filename.startswith(CHARLSON_PREFIX):
            diseaseName = filename
            if filename.endswith(".tab"):
                diseaseName = filename[:-len(".tab")]
            extractor.addClinicalItemFeatures(
                extractor.parseClinicalItemFile(stdOpen(filename)),
                patientByIndexTimeById, colNames, diseaseName)

        if filename.startswith(TREATMENT_TEAM_PREFIX):
            teamName = filename
            if filename.endswith(".tab"):
                teamName = filename[:-len(".tab")]
            extractor.addClinicalItemFeatures(
                extractor.parseClinicalItemFile(stdOpen(filename)),
                patientByIndexTimeById, colNames, teamName)

    log.info("Output feature matrix file with row per patient day")
    formatter = TextResultsFormatter(featureMatrixFile)
    formatter.formatTuple(colNames)
    for patientId, patientByIndexTime in patientByIndexTimeById.items():
        patientResults = patientByIndexTime.values()
        formatter.formatResultDicts(patientResults, colNames)

    timer = time.time() - timer
    print("%.3f seconds to complete" % timer, file=sys.stderr)
Example 9
    "Charlson.LiverModSevere.pre", "Charlson.Malignancy.pre",
    "Charlson.MalignancyMetastatic.pre", "Charlson.MI.pre",
    "Charlson.PepticUlcer.pre", "Charlson.PeripheralVascular.pre",
    "Charlson.Renal.pre", "Charlson.Rheumatic.pre", "self_pay", "PO2A.last",
    "Pulse.last", "NA.last", "CR.last", "HCT.last", "WBC.last", "BUN.last",
    "TBIL.last", "K.last", "Resp.last", "Temp.last", "Urine.last",
    "BP_Low_Diastolic.last", "BP_High_Systolic.last",
    "Glasgow.Coma.Scale.Score.last", "TT.Cardiology.pre", "TT.CCU.HF.pre",
    "TT.CCU.pre", "TT.HemeOnc.pre", "TT.Medicine.pre", "TT.MICU.pre",
    "TT.Neurology.pre", "TT.SICU.pre", "TT.SurgerySpecialty.pre",
    "TT.Transplant.pre", "TT.Trauma.pre", "self_pay"
]

ofs = stdOpen("simulatedData.ICUDNR.tab", "w")
formatter = TextResultsFormatter(ofs)
formatter.formatTuple(colNames)
# Header row

random.seed(987654321)
# Consistent seed for reproducibility
nPatients = 10000

# Random generator parameters
ageRange = [30, 80]
incomeRange = [20000, 200000]
incomeStep = 1000
femaleRate = 0.5

# Ranges on uniform distribution to assign race labels. Leave ~50% empty for default White race
raceRangesByLabel = \
 {
Example 10
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <inputFile> [<outputFile>]\n"+\
                    "   <inputFile>    Validation file in prepared result file format.  Predict items and compare against verify sets similar to RecommendationClassficationAnalysis. \n"+\
                    "   <outputFile>   Validation result stat summaries.\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-r",
            "--numRecs",
            dest="numRecs",
            default=DEFAULT_RECOMMENDED_ITEM_COUNT,
            help=
            "Number of orders / items to recommend for comparison against the verification set, sorted in prevalence order.  If skip or set <1, then will use all order set items found."
        )
        parser.add_option(
            "-O",
            "--numRecsByOrderSet",
            dest="numRecsByOrderSet",
            action="store_true",
            help=
            "If set, then look for an order_set_id column to find the key order set that triggered the evaluation time point to determine number of recommendations to consider."
        )
        parser.add_option(
            "-s",
            "--sortField",
            dest="sortField",
            default=DEFAULT_SORT_FIELD,
            help=
            "Allow overriding of default sort field when returning ranked results (patient_count, name, description, etc.)"
        )
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()
        if len(args) >= 1:
            query = AnalysisQuery()
            query.preparedPatientItemFile = stdOpen(args[0])
            query.recommender = OrderSetRecommender()
            query.baseRecQuery = RecommenderQuery()
            # Default exclusions if none specified
            query.baseRecQuery.excludeCategoryIds = \
                query.recommender.defaultExcludedClinicalItemCategoryIds()
            query.baseRecQuery.excludeItemIds = \
                query.recommender.defaultExcludedClinicalItemIds()
            query.baseRecQuery.sortField = options.sortField
            query.numRecommendations = int(options.numRecs)
            query.numRecsByOrderSet = options.numRecsByOrderSet

            # Run the actual analysis
            analysisResults = self(query)

            # Format the results for output
            outputFilename = None
            if len(args) > 1:
                outputFilename = args[1]
            outputFile = stdOpen(outputFilename, "w")

            # Print comment line with analysis arguments to allow for deconstruction later
            summaryData = {
                "argv": argv
            }
            print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)

            formatter = TextResultsFormatter(outputFile)
            colNames = self.resultHeaders(query)
            formatter.formatTuple(colNames)
            # Insert a mock record to get a header / label row
            formatter.formatResultDicts(analysisResults, colNames)
        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
Example 11
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <patientIds/dataFile> [<outputFile>]\n"+\
                    "   <patientIds/dataFile>    Name of file with patient ids.  If not found, then interpret as comma-separated list of test Patient IDs to prepare analysis data for.  Alternatively, provide preparedPatientItemFile generated from PreparePatientItems as input.\n"+\
                    "   <outputFile>    If query yields a result set, then that will be output\n"+\
                    "                       to the named file.  Leave blank or specify \"-\" to send to stdout.\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-q",
            "--numQuery",
            dest="numQuery",
            help=
            "Number of orders / items from each patient to use as query items to prime the recommendations.  If set to a float number in (0,1), then treat as a percentage of the patient's total orders / items"
        )
        parser.add_option(
            "-v",
            "--numVerify",
            dest="numVerify",
            help=
            "Number of orders / items from each patient after the query items to use to validate recommendations.  If set to a float number in (0,1), then treat as a percentage of the patient's total orders / items.  If left unset, then just use all remaining orders / items for that patient"
        )
        parser.add_option(
            "-c",
            "--baseCategoryId",
            dest="baseCategoryId",
            help=
            "Instead of specifying first nQ query items, specify ID of clinical item category to look for initial items from (probably the ADMIT Dx item)."
        )
        parser.add_option(
            "-b",
            "--baseItemId",
            dest="baseItemId",
            help=
            "Instead of specifying first nQ query items, specify ID of the specific clinical item to look for initial items from."
        )
        parser.add_option(
            "-S",
            "--startDate",
            dest="startDate",
            help="Only look for test data occuring on or after this start date."
        )
        parser.add_option(
            "-E",
            "--endDate",
            dest="endDate",
            help="Only look for test data occuring before this end date.")
        parser.add_option(
            "-Q",
            "--queryTimeSpan",
            dest="queryTimeSpan",
            help=
            "Time frame specified in seconds over which to look for initial query items (e.g., 24hrs = 86400) after the base item found from the category above.  Start the time counting from the first item time occuring after the category item above since the ADMIT Dx items are often keyed to dates only without times (defaulting to midnight of the date specified)."
        )
        parser.add_option(
            "-V",
            "--verifyTimeSpan",
            dest="verifyTimeSpan",
            help=
            "Time frame specified in seconds over which to look for verify items after initial query item time.  Will ignore the query items that occur within the queryTimeSpan."
        )

        parser.add_option(
            "-P",
            "--preparedPatientItemFile",
            dest="preparedPatientItemFile",
            action="store_true",
            help=
            "If set, will expect primary argument to instead be name of file to read input data from, instead of using above parameters to query from database."
        )

        parser.add_option(
            "-R",
            "--recommender",
            dest="recommender",
            help=
            "Name of the recommender to run the analysis against.  Options: %s"
            % list(RECOMMENDER_CLASS_BY_NAME.keys()))
        parser.add_option(
            "-r",
            "--numRecs",
            dest="numRecs",
            help=
            "Number of orders / items to recommend for comparison against the verification set. Alternative set option numRecsByOrderSet to look for key order set usage and size."
        )
        parser.add_option(
            "-O",
            "--numRecsByOrderSet",
            dest="numRecsByOrderSet",
            action="store_true",
            help=
            "If set, then look for an order_set_id column to find the key order set that triggered the evaluation time point to determine number of recommendations to consider."
        )
        parser.add_option(
            "-s",
            "--sortField",
            dest="sortField",
            help=
            "Allow overriding of default sort field when returning ranked results"
        )
        parser.add_option(
            "-f",
            "--fieldFilters",
            dest="fieldFilters",
            help=
            "Filters to exclude results.  Comma-separated separated list of field-op:value exclusions where op is either < or > like, conditionalFreq<:0.1,frqeRatio<:1"
        )
        parser.add_option(
            "-t",
            "--timeDeltaMax",
            dest="timeDeltaMax",
            help=
            "If set, represents a time delta in seconds maximum by which recommendations should be based on.  Defaults to recommending items that occur at ANY time after the key orders.  If provided, will apply limits to only orders placed within 0 seconds, 1 hour (3600), 1 day (86400), or 1 week (604800) of the key orders / items."
        )
        parser.add_option(
            "-a",
            "--aggregationMethod",
            dest="aggregationMethod",
            help=
            "Aggregation method to use for recommendations based off multiple query items.  Options: %s."
            % list(AGGREGATOR_OPTIONS))
        parser.add_option(
            "-p",
            "--countPrefix",
            dest="countPrefix",
            help=
            "Prefix for how to do counts.  Blank for default item counting allowing repeats, otherwise ignore repeats for patient_ or encounter_"
        )
        parser.add_option(
            "-m",
            "--maxRecommendedId",
            dest="maxRecommendedId",
            help=
            "Specify a maximum ID value to accept for recommended items.  More used to limit output in test cases"
        )

        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()
        if len(args) >= 1:
            # Parse out the query parameters
            query = AnalysisQuery()
            query.recommender = RECOMMENDER_CLASS_BY_NAME[options.recommender]()
            # Use a dataCache to facilitate repeat queries
            query.recommender.dataManager.dataCache = dict()

            if options.preparedPatientItemFile:
                # Don't reconstruct validation data through database, just read off validation file
                query.preparedPatientItemFile = stdOpen(args[0])
            else:
                patientIdsParam = args[0]
                try:
                    # Try to open patient IDs as a file
                    patientIdFile = stdOpen(patientIdsParam)
                    query.patientIds = set(patientIdFile.read().split())
                except IOError:
                    # Unable to open as a filename, then interpret as simple comma-separated list
                    query.patientIds = set(patientIdsParam.split(","))

                if options.numQuery is not None:
                    query.numQueryItems = int(options.numQuery)
                    query.numVerifyItems = int(options.numVerify)
                else:
                    # Alternative to specify query time span starting from a key category
                    query.queryTimeSpan = timedelta(0,
                                                    int(options.queryTimeSpan))
                    query.verifyTimeSpan = timedelta(
                        0, int(options.verifyTimeSpan))

                if options.baseCategoryId is not None or options.baseItemId is not None:
                    if options.baseCategoryId is not None:
                        query.baseCategoryId = int(options.baseCategoryId)
                        # Category to look for clinical item to start accruing query items from
                    if options.baseItemId is not None:
                        query.baseItemId = int(options.baseItemId)

                if options.startDate is not None:
                    query.startDate = DBUtil.parseDateValue(options.startDate)
                if options.endDate is not None:
                    query.endDate = DBUtil.parseDateValue(options.endDate)

            query.baseRecQuery = RecommenderQuery()
            query.baseRecQuery.excludeCategoryIds = \
                query.recommender.defaultExcludedClinicalItemCategoryIds()
            query.baseRecQuery.excludeItemIds = \
                query.recommender.defaultExcludedClinicalItemIds()
            if options.timeDeltaMax is not None and len(options.timeDeltaMax) > 0:
                query.baseRecQuery.timeDeltaMax = timedelta(
                    0, int(options.timeDeltaMax))
            if options.aggregationMethod is not None:
                query.baseRecQuery.aggregationMethod = options.aggregationMethod
            if options.countPrefix is not None:
                query.baseRecQuery.countPrefix = options.countPrefix
            if options.maxRecommendedId is not None:
                query.baseRecQuery.maxRecommendedId = int(
                    options.maxRecommendedId)
            if options.sortField is not None:
                query.baseRecQuery.sortField = options.sortField
            if options.fieldFilters is not None:
                for fieldFilterStr in options.fieldFilters.split(","):
                    (fieldOp, valueStr) = fieldFilterStr.split(":")
                    query.baseRecQuery.fieldFilters[fieldOp] = float(valueStr)

            if options.numRecs is not None:
                query.numRecommendations = int(options.numRecs)
            else:
                # No recommendation count specified, then just use the same as the verify number
                query.numRecommendations = query.numVerifyItems
            query.numRecsByOrderSet = options.numRecsByOrderSet

            # Run the actual analysis
            analysisResults = self(query)

            # Format the results for output
            outputFilename = None
            if len(args) > 1:
                outputFilename = args[1]
            outputFile = stdOpen(outputFilename, "w")

            # Print comment line with analysis arguments to allow for deconstruction later
            summaryData = {
                "argv": argv
            }
            print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)

            formatter = TextResultsFormatter(outputFile)
            colNames = self.resultHeaders(query)
            formatter.formatTuple(colNames)
            # Insert a mock record to get a header / label row
            formatter.formatResultDicts(analysisResults, colNames)

        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
Example 12
"""Given 2D Table of values, spit out "melted" long-relational form to feed into antibiogramData.js"""

import sys, os
from medinfo.common.Const import NULL_STRING
from medinfo.common.Util import stdOpen
from medinfo.db.ResultsFormatter import TabDictReader, TextResultsFormatter

ifs = stdOpen(sys.argv[1])
# Input tab delimited file
ofs = stdOpen(sys.argv[2], "w")
# "-" for stdout

reader = TabDictReader(ifs)
formatter = TextResultsFormatter(ofs)
for row in reader:
    bug = row["Bug"]
    for key in reader.fieldnames:
        value = row[key]
        if key != "Bug" and value and value != NULL_STRING:
            formatter.formatTuple([value, bug, key])
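The loop above melts each non-Bug cell into a (value, bug, antibiotic) row; a dependency-free sketch of the same reshaping using only the standard library, with made-up sample values:

import csv, io, sys

# Hypothetical two-column antibiogram table; values are illustrative only.
sampleTable = "Bug\tPenicillin\tVancomycin\nS.aureus\t60\t100\n"
reader = csv.DictReader(io.StringIO(sampleTable), delimiter="\t")
writer = csv.writer(sys.stdout, delimiter="\t")
for row in reader:
    bug = row["Bug"]
    for key in reader.fieldnames:
        value = row[key]
        if key != "Bug" and value:
            writer.writerow([value, bug, key])  # e.g., 60 / S.aureus / Penicillin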
Example 13
progress = ProgressDots(big=100, small=2)
for iPage in range(N_PAGES):
    localFilename = BASE_FILENAME % (iPage)
    localFile = open(localFilename)
    html = localFile.read()
    localFile.close()

    soup = BeautifulSoup(html, "html.parser")
    cells = soup("td")

    currRow = list()

    for cell in cells:
        if not allColsSeen:  # Collect column names as we go; once all have been seen, just dump out each row's worth as we progress
            if cell["class"] not in colNames:
                colNames.append(cell["class"])
            else:
                allColsSeen = True
        if allColsSeen and len(currRow) == len(colNames):
            formatter.formatTuple(currRow)
            currRow = list()
        currRow.append(cell.text)

    #obj = soup.find(class_="views-field-field-bup-physician-last-name")

    progress.update()
progress.printStatus()

ofs.close()
Example 14
    def test_performance(self):
        """
        Test performance against DataExtractor.
        """
        # Initialize DB cursor.
        cursor = self.connection.cursor()

        # Initialize FeatureMatrixFactory.
        factoryStart = time.time()
        self.factory = FeatureMatrixFactory()

        # Build SQL query for list of patient episodes.
        patientEpisodeQuery = SQLQuery()
        patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)")
        patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id")
        patientEpisodeQuery.addSelect("proc_code")
        patientEpisodeQuery.addSelect("order_time")
        patientEpisodeQuery.addSelect(
            "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results"
        )
        patientEpisodeQuery.addFrom("stride_order_proc AS sop")
        patientEpisodeQuery.addFrom("stride_order_results AS sor")
        patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id")
        patientEpisodeQuery.addWhereIn("proc_code",
                                       ["Foo", "Bar", "Baz", "Qux"])
        patientEpisodeQuery.addGroupBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        patientEpisodeQuery.addOrderBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params)

        # Set and process patientEpisodeInput.
        self.factory.setPatientEpisodeInput(cursor, "pat_id", "order_time")
        self.factory.processPatientEpisodeInput()

        # Look for lab data 90 days before each episode, but never after.
        preTimeDelta = datetime.timedelta(-90)
        postTimeDelta = datetime.timedelta(0)

        # Add clinical item features.
        self.factory.addClinicalItemFeatures(["PerfItem300"])
        self.factory.addClinicalItemFeatures(["PerfItem400"])
        self.factory.addClinicalItemFeatures(["PerfItem500"])

        # Add lab result features.
        self.factory.addLabResultFeatures(["Foo"], False, preTimeDelta,
                                          postTimeDelta)
        self.factory.addLabResultFeatures(["Bar"], False, preTimeDelta,
                                          postTimeDelta)
        self.factory.addLabResultFeatures(["Baz"], False, preTimeDelta,
                                          postTimeDelta)
        self.factory.addLabResultFeatures(["Qux"], False, preTimeDelta,
                                          postTimeDelta)

        # Add flowsheet features.
        self.factory.addFlowsheetFeatures(["Perflow"], preTimeDelta,
                                          postTimeDelta)

        # Build matrix.
        self.factory.buildFeatureMatrix()

        # Stop timer.
        factoryStop = time.time()

        # Initialize DataExtractor.
        extractorStart = time.time()
        extractor = DataExtractor()
        extractor.dataCache = dict()

        # Initialize output file.
        outFile = open("extractor.feature_matrix.tab.gz", "w")
        formatter = TextResultsFormatter(outFile)

        # Build SQL query for list of patient episodes.
        patientEpisodeQuery = SQLQuery()
        patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)")
        patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id")
        patientEpisodeQuery.addSelect("proc_code")
        patientEpisodeQuery.addSelect("order_time")
        patientEpisodeQuery.addSelect(
            "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results"
        )
        patientEpisodeQuery.addFrom("stride_order_proc AS sop")
        patientEpisodeQuery.addFrom("stride_order_results AS sor")
        patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id")
        patientEpisodeQuery.addWhereIn("proc_code",
                                       ["Foo", "Bar", "Baz", "Qux"])
        patientEpisodeQuery.addGroupBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        patientEpisodeQuery.addOrderBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params)

        # Process patient episodes.
        patientEpisodes = list()
        row = cursor.fetchone()

        while row is not None:
            (pat_id, order_proc_id, proc_code, order_time,
             normal_results) = row
            patientEpisode = \
                RowItemModel \
                (
                    {
                        "patient_id": pat_id,
                        "order_proc_id": order_proc_id,
                        "proc_code": proc_code,
                        "order_time": order_time,
                        "result_normal_count": normal_results
                    }
                )
            patientEpisodes.append(patientEpisode)
            row = cursor.fetchone()

        # Initialize patient data.
        lastPatientId = None
        colNames = None
        patientEpisodeByIndexTime = None

        # Look for lab data 90 days before each episode, but never after.
        preTimeDelta = datetime.timedelta(-90)
        postTimeDelta = datetime.timedelta(0)

        # Populate patient data.
        tempColNames = \
            ["patient_id", "order_proc_id", "proc_code", "order_time",
                "result_normal_count"]
        for patientEpisode in patientEpisodes:
            patientId = patientEpisode["patient_id"]

            if lastPatientId is not None and lastPatientId != patientId:
                # New patient ID so start querying for patient specific data and
                # populating patient episode data.

                # Clinical Item (PerfItem300)
                eventTimes = extractor.parseClinicalItemData_singlePatient(\
                    modelListFromTable(extractor.queryClinicalItemsByName(\
                        ("PerfItem300",), [patientId])))
                tempColNames.extend(\
                    extractor.addClinicalItemFeatures_singlePatient(\
                    eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
                    daysBins=[]))

                # Clinical Item (PerfItem400)
                eventTimes = extractor.parseClinicalItemData_singlePatient(\
                    modelListFromTable(extractor.queryClinicalItemsByName(\
                        ("PerfItem400",), [patientId])))
                tempColNames.extend(\
                    extractor.addClinicalItemFeatures_singlePatient(\
                    eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
                    daysBins=[]))

                # Clinical Item (PerfItem500)
                eventTimes = extractor.parseClinicalItemData_singlePatient(\
                    modelListFromTable(extractor.queryClinicalItemsByName(\
                        ("PerfItem500",), [patientId])))
                tempColNames.extend(\
                    extractor.addClinicalItemFeatures_singlePatient(\
                    eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
                    daysBins=[]))

                # Lab Result (Foo)
                labResultTable = extractor.queryLabResults(["Foo"],
                                                           [patientId])
                labsByBaseName = extractor.parseLabResultsData_singlePatient(\
                    modelListFromTable(labResultTable))
                tempColNames.extend(extractor.addLabFeatures_singlePatient(\
                    patientEpisodeByIndexTime, labsByBaseName, ["Foo"], \
                    preTimeDelta, postTimeDelta))

                # Lab Result (Bar)
                labResultTable = extractor.queryLabResults(["Bar"],
                                                           [patientId])
                labsByBaseName = extractor.parseLabResultsData_singlePatient(\
                    modelListFromTable(labResultTable))
                tempColNames.extend(extractor.addLabFeatures_singlePatient(\
                    patientEpisodeByIndexTime, labsByBaseName, ["Bar"], \
                    preTimeDelta, postTimeDelta))

                # Lab Result (Baz)
                labResultTable = extractor.queryLabResults(["Baz"],
                                                           [patientId])
                labsByBaseName = extractor.parseLabResultsData_singlePatient(\
                    modelListFromTable(labResultTable))
                tempColNames.extend(extractor.addLabFeatures_singlePatient(\
                    patientEpisodeByIndexTime, labsByBaseName, ["Baz"], \
                    preTimeDelta, postTimeDelta))

                # Lab Result (Qux)
                labResultTable = extractor.queryLabResults(["Qux"],
                                                           [patientId])
                labsByBaseName = extractor.parseLabResultsData_singlePatient(\
                    modelListFromTable(labResultTable))
                tempColNames.extend(extractor.addLabFeatures_singlePatient(\
                    patientEpisodeByIndexTime, labsByBaseName, ["Qux"], \
                    preTimeDelta, postTimeDelta))

                # Flowsheet (Perflow)
                # tempFile = StringIO()
                # labResultTable = extractor.queryFlowsheet(["Perflow"], [patientId], tempFile)
                # flowsheetByNameByPatientId = extractor.parseFlowsheetFile(\
                #     StringIO(tempFile.getvalue()))
                # tempColNames.extend(extractor.addFlowsheetFeatures_singlePatient(\
                #     patientEpisodeByIndexTime, flowsheetByNameByPatientId[patientId], \
                #     ["Perflow"], preTimeDelta, postTimeDelta, tempColNames))

                if colNames is None:
                    # First row, print header row
                    colNames = tempColNames
                    formatter.formatTuple(colNames)

                # Print out patient (episode) data (one row per episode)
                formatter.formatResultDicts(patientEpisodeByIndexTime.values(),
                                            colNames)

            if lastPatientId is None or lastPatientId != patientId:
                # Prepare to aggregate patient episode record per patient
                patientEpisodeByIndexTime = dict()

            patientEpisodeByIndexTime[
                patientEpisode["order_time"]] = patientEpisode
            lastPatientId = patientId
            outFile.flush()

        # Last Iteration
        patientId = lastPatientId
        # Clinical Item (PerfItem300)
        eventTimes = extractor.parseClinicalItemData_singlePatient(\
            modelListFromTable(extractor.queryClinicalItemsByName(\
                ("PerfItem300",), [patientId])))
        tempColNames.extend(\
            extractor.addClinicalItemFeatures_singlePatient(\
            eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
            daysBins=[]))

        # Clinical Item (PerfItem400)
        eventTimes = extractor.parseClinicalItemData_singlePatient(\
            modelListFromTable(extractor.queryClinicalItemsByName(\
                ("PerfItem400",), [patientId])))
        tempColNames.extend(\
            extractor.addClinicalItemFeatures_singlePatient(\
            eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
            daysBins=[]))

        # Clinical Item (PerfItem500)
        eventTimes = extractor.parseClinicalItemData_singlePatient(\
            modelListFromTable(extractor.queryClinicalItemsByName(\
                ("PerfItem500",), [patientId])))
        tempColNames.extend(\
            extractor.addClinicalItemFeatures_singlePatient(\
            eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
            daysBins=[]))

        # Lab Result (Foo)
        labResultTable = extractor.queryLabResults(["Foo"], [patientId])
        labsByBaseName = extractor.parseLabResultsData_singlePatient(\
            modelListFromTable(labResultTable))
        tempColNames.extend(extractor.addLabFeatures_singlePatient(\
            patientEpisodeByIndexTime, labsByBaseName, ["Foo"], \
            preTimeDelta, postTimeDelta))

        # Lab Result (Bar)
        labResultTable = extractor.queryLabResults(["Bar"], [patientId])
        labsByBaseName = extractor.parseLabResultsData_singlePatient(\
            modelListFromTable(labResultTable))
        tempColNames.extend(extractor.addLabFeatures_singlePatient(\
            patientEpisodeByIndexTime, labsByBaseName, ["Bar"], \
            preTimeDelta, postTimeDelta))

        # Lab Result (Baz)
        labResultTable = extractor.queryLabResults(["Baz"], [patientId])
        labsByBaseName = extractor.parseLabResultsData_singlePatient(\
            modelListFromTable(labResultTable))
        tempColNames.extend(extractor.addLabFeatures_singlePatient(\
            patientEpisodeByIndexTime, labsByBaseName, ["Baz"], \
            preTimeDelta, postTimeDelta))

        # Lab Result (Qux)
        labResultTable = extractor.queryLabResults(["Qux"], [patientId])
        labsByBaseName = extractor.parseLabResultsData_singlePatient(\
            modelListFromTable(labResultTable))
        tempColNames.extend(extractor.addLabFeatures_singlePatient(\
            patientEpisodeByIndexTime, labsByBaseName, ["Qux"], \
            preTimeDelta, postTimeDelta))

        formatter.formatResultDicts(patientEpisodeByIndexTime.values(),
                                    colNames)

        # Close file.
        outFile.close()

        # Stop timer.
        extractorStop = time.time()

        # Compare results.
        factoryTime = factoryStop - factoryStart
        extractorTime = extractorStop - extractorStart
        self.assertTrue(extractorTime > factoryTime)

        # Clean up feature matrix files.
        try:
            os.remove("extractor.feature_matrix.tab.gz")
        except OSError:
            pass
        try:
            os.remove(self.factory.getMatrixFileName())
        except OSError:
            pass
Example 15
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <inputFile> [<outputFile>]\n"+\
                    "   <inputFile>    Validation file in prepared result file format.  Predict items and compare against verify sets similar to RecommendationClassficationAnalysis. \n"+\
                    "   <outputFile>   Validation result stat summaries.\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-X",
            "--excludeCategoryIds",
            dest="excludeCategoryIds",
            help=
            "For recommendation, exclude / skip any items who fall under one of these comma-separated category Ids."
        )
        parser.add_option(
            "-s",
            "--sortField",
            dest="sortField",
            default=DEFAULT_SORT_FIELD,
            help=
            "Score field to sort top recommendations by.  Default to posterior probabilty / positive predictive value 'P(B|A)', but can also select 'lift' = 'tfidf' = 'interest' for TF*IDF style score weighting."
        )
        parser.add_option(
            "-r",
            "--numRecs",
            dest="numRecs",
            default=DEFAULT_RECOMMENDED_ITEM_COUNT,
            help=
            "Number of orders / items to recommend for comparison against the verification set."
        )
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()
        if len(args) >= 1:
            query = AnalysisQuery()
            query.preparedPatientItemFile = stdOpen(args[0])
            query.recommender = OrderSetRecommender()
            query.baseRecQuery = RecommenderQuery()
            if options.excludeCategoryIds is not None:
                query.baseRecQuery.excludeCategoryIds = set()
                for categoryIdStr in options.excludeCategoryIds.split(","):
                    query.baseRecQuery.excludeCategoryIds.add(
                        int(categoryIdStr))
            else:  # Default exclusions if none specified
                query.baseRecQuery.excludeCategoryIds = \
                    query.recommender.defaultExcludedClinicalItemCategoryIds()
                query.baseRecQuery.excludeItemIds = \
                    query.recommender.defaultExcludedClinicalItemIds()

            query.baseRecQuery.sortField = options.sortField
            query.numRecommendations = int(options.numRecs)

            # Run the actual analysis
            analysisResults = self(query)

            # Format the results for output
            outputFilename = None
            if len(args) > 1:
                outputFilename = args[1]
            outputFile = stdOpen(outputFilename, "w")

            # Print comment line with analysis arguments to allow for deconstruction later
            summaryData = {
                "argv": argv
            }
            print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)

            formatter = TextResultsFormatter(outputFile)
            colNames = self.resultHeaders(query)
            formatter.formatTuple(colNames)
            # Insert a mock record to get a header / label row
            formatter.formatResultDicts(analysisResults, colNames)
        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)