Ejemplo n.º 1
0
def main_formatMergedTTests(argv):
    """Filter concatenated t-test results down to a summary table.

    Reads the concatenated results file, extracts numeric parameters
    (topic count, verification time window) that are embedded in
    filename-style text columns, and writes a tab-delimited file
    restricted to the columns in outputCols.

    Fix: ported legacy Python 2 ``print >> ofs`` statement (a syntax
    error under Python 3) to the ``print(..., file=ofs)`` function form,
    and dropped the stray trailing semicolons.

    :param argv: Command-line argument list; recorded verbatim in a
        leading JSON comment line so the output can be traced back to
        the invocation that produced it.
    """
    ifs = stdOpen(BASE_RESULT_DIR + CONCATENATE_FILENAME)
    ofs = stdOpen(BASE_RESULT_DIR + FILTERED_FILENAME, "w")

    # Leading comment line records how this output file was generated
    summaryData = {"argv": argv}
    print(COMMENT_TAG, json.dumps(summaryData), file=ofs)

    outputCols = ["SortType","TopicCount","VerifyTime","Group1.precision.mean","Group1.recall.mean","Group1.normalprecision.mean","Group1.weightrecall.mean","Group1.roc_auc.mean","ttest_rel.precision","ttest_rel.recall","ttest_rel.weightrecall","ttest_rel.roc_auc","Group1.numqueryitems.mean","Group1.numverifyitems.mean","Group1.numrecommendeditems.mean","Group1.tp.mean"]
    formatter = TextResultsFormatter(ofs)
    formatter.formatTuple(outputCols)  # Output header row

    reader = TabDictReader(ifs)
    for row in reader:
        row["SortType"] = row["Group1._s"]

        # Extract out numerical data from filename text parameters
        row["TopicCount"] = None
        if row["Group1._m"] != 'None':
            # Expecting model name strings of the form: "models/topicModel.first24hourItems.2013.1234567890.filter.bow.gz.64Topic.model"
            topicChunk = row["Group1._m"].split(".")[-2]   # Expect second to last period-delimited chunk to contain topic count
            topicChunk = topicChunk[:topicChunk.find("Topic")]  # Remove trailing Topic text
            row["TopicCount"] = int(topicChunk)

        # Expecting result file name argument of the form: "results/byOrderSets/01minutes/filteredResults.tab.gz"
        timeChunk = row["args[0]"].split("/")[-2]
        timeChunk = timeChunk[:timeChunk.find("minutes")]
        row["VerifyTime"] = int(timeChunk)

        formatter.formatResultDict(row, outputCols)

    ifs.close()
    ofs.close()
    def test_numRecsByOrderSet(self):
        """Verify recommendation count can be driven by a linked order set ID.

        Pipeline under test: PreparePatientItems generates an intermediate
        data file (captured via redirected stdout), each row is tagged with
        a test order set ID, then RecommendationClassificationAnalysis is
        run with --numRecsByOrderSet so the number of recommendations is
        taken from the order set rather than an explicit -r option.
        NOTE(review): relies on test DB fixtures (clinical_item_id = -8,
        patient -11111) set up elsewhere in this test class — confirm.
        """
        # Designate number of recommendations indirectly via linked order set id 

        DBUtil.execute("update clinical_item set default_recommend = 0 where clinical_item_id = -8");   # Disable default recommend on one item to shift results

        # Expected per-patient classification metrics for the single test patient
        colNames = ["patient_id", "TP", "FN", "FP",  "recall", "precision", "F1-score", "weightRecall","weightPrecision", "ROC-AUC"];
        expectedResults = [ RowItemModel([-11111, 2, 0, 3, 1.0, 0.4, 0.571,  1.0, 0.3178, 0.4167], colNames ) ];

        # Do through fabricated prepared file intermediary
        sys.stdout = StringIO();    
        argv = ["PreparePatientItems.py","-q","2","-v","3",'0,-11111',"-"];
        self.preparer.main(argv);
        preparedDataFile = StringIO(sys.stdout.getvalue());
        
        # Artificially add a key order set ID for the fabricated data
        modFile = StringIO();
        formatter = TextResultsFormatter(modFile);
        dataCols = None;
        for i, dataRow in enumerate(TabDictReader(preparedDataFile)):
            dataRow["order_set_id"] = TEST_ORDERSET_ID;
            if i <= 0:
                # First row: capture column names and emit header line
                dataCols = list(dataRow.keys());
                formatter.formatTuple(dataCols);    # Insert a mock record to get a header / label row
            formatter.formatResultDict(dataRow, dataCols);
        preparedDataFile = StringIO(modFile.getvalue());

        sys.stdin = preparedDataFile;   # Read prepared data file from redirected stdin
        sys.stdout = StringIO();
        #argv = ["RecommendationClassificationAnalysis.py","-P","-r","5","-m","0","-R","ItemAssociationRecommender",'-',"-"];
        argv = ["RecommendationClassificationAnalysis.py","-P","--numRecsByOrderSet","-m","0","-R","ItemAssociationRecommender",'-',"-"];
        self.analyzer.main(argv);
        textOutput = StringIO(sys.stdout.getvalue());
        # Compare analyzer's text output against the expected metric rows
        self.assertEqualStatResultsTextOutput(expectedResults, textOutput, colNames);
Ejemplo n.º 3
0
    def main(self, argv):
        """Main method, callable from command line.

        Concatenates multiple tab-delimited result files into one output
        file, prefixed with a JSON comment line recording argv.  If only a
        single input file is named, it is treated as an index file listing
        the actual input filenames (one per line).

        Fix: ported Python 2-only constructs that fail under Python 3 —
        ``print >> outputFile`` statement and ``rowGenerator.next()`` —
        to ``print(..., file=...)`` and ``next(rowGenerator)``.

        :param argv: Full command-line argument list (argv[0] is the
            program name); options and input filenames are parsed from it.
        """
        usageStr =  "usage: %prog [options] <inputFile1> <inputFile2> ... <inputFileN>\n"+\
                    "   <inputFileX>    Tab-delimited file of data.  Initial comment lines will be scanned for list of argv parameters to add as data columns.\n"+\
                    "                   If only a single input is given, interpret this as an index file which lists the names of the other files to concatenate (e.g., obtained with dir * /b or ls).\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-o",
            "--outputFile",
            dest="outputFile",
            help=
            "Tab-delimited file matching concatenated contents of input files.  Specify \"-\" to send to stdout."
        )
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()
        if len(args) > 0:
            inputFiles = list()
            if len(args) > 1:
                for inputFilename in args:
                    inputFiles.append(stdOpen(inputFilename))
            else:  # len(args) == 1, Single index file rather than list of all files on command-line
                indexFile = stdOpen(args[0])
                for line in indexFile:
                    inputFilename = line.strip()
                    inputFiles.append(stdOpen(inputFilename))

            # Format the results for output
            outputFile = stdOpen(options.outputFile, "w")

            # Print comment line with arguments to allow for deconstruction later as well as extra results
            summaryData = {
                "argv": argv
            }
            print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)

            # Tab-delimited output formatting
            formatter = TextResultsFormatter(outputFile)

            # Begin the file parsing so can at least get the total list of column headers
            rowGenerator = self(inputFiles)
            firstRow = next(rowGenerator)

            # Insert a mock record to get a header / label row
            colNames = self.resultHeaders()
            formatter.formatTuple(colNames)

            # Stream the concatenated data rows to the output to avoid storing all in memory
            formatter.formatResultDict(firstRow, colNames)
            for outputDict in rowGenerator:
                formatter.formatResultDict(outputDict, colNames)

        else:
            # No input files named: show usage and exit with error status
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
def main_formatResults(argv):
    """Reformat filtered results into a final summary table.

    Reads the filtered results file, pulls numeric parameters (topic
    count, train/verify time windows in minutes) out of filename-style
    text columns, and writes a tab-delimited file with outputCols.

    Fixes:
    - Ported Python 2 ``print >> ofs`` statement to ``print(..., file=ofs)``.
    - ``seconds / 60`` was integer division under Python 2; use ``//`` so
      the minute values stay integers under Python 3 true division.
    - ``chunk[0]`` raised IndexError for an empty period-delimited chunk
      (e.g. a doubled "..").  Use ``startswith`` which is safe on "".

    :param argv: Command-line argument list; recorded in a leading JSON
        comment line for provenance.
    """
    ifs = stdOpen(BASE_RESULT_DIR + FILTERED_FILENAME)
    ofs = stdOpen(BASE_RESULT_DIR + FORMATTED_FILENAME, "w")

    # Leading comment line records how this output file was generated
    summaryData = {
        "argv": argv
    }
    print(COMMENT_TAG, json.dumps(summaryData), file=ofs)

    outputCols = [
        "SortType", "TopicCount", "TrainTime", "VerifyTime", "precision",
        "recall", "normalprecision", "weightrecall", "roc_auc"
    ]
    formatter = TextResultsFormatter(ofs)
    formatter.formatTuple(outputCols)
    # Output header row

    reader = TabDictReader(ifs)
    for row in reader:
        row["SortType"] = row["_s"]

        # Extract out numerical data from filename text parameters
        row["TopicCount"] = None
        row["TrainTime"] = None
        if row["_m"] != 'None':
            # Expecting model name strings of the form: "models/topicModel.firstItems.q14400.v14400.2013.1234567890.filter.bow.gz.16Topic.model"
            chunks = row["_m"].split(".")
            # Expect second to last period-delimited chunk to contain topic count
            topicChunk = chunks[-2]
            # Remove trailing Topic text
            topicChunk = topicChunk[:topicChunk.find("Topic")]
            row["TopicCount"] = int(topicChunk)

            for chunk in chunks:
                # "q####" chunk should be the query (train) time in seconds
                if chunk.startswith("q") and chunk[-1].isdigit():
                    queryTimeSeconds = int(chunk[1:])
                    queryTimeMinutes = queryTimeSeconds // 60
                    row["TrainTime"] = queryTimeMinutes

        # Expecting training file name argument of the form: "sourceData/first24hourOrderSets.2013.q86400.v14400.-12345.tab.gz"
        row["VerifyTime"] = None
        for chunk in row["args_0_"].split("."):
            # "v####" chunk should be the verify time in seconds
            if chunk.startswith("v") and chunk[-1].isdigit():
                verifyTimeSeconds = int(chunk[1:])
                verifyTimeMinutes = verifyTimeSeconds // 60
                row["VerifyTime"] = verifyTimeMinutes

        formatter.formatResultDict(row, outputCols)

    ifs.close()
    ofs.close()
Ejemplo n.º 5
0
    def main(self, argv):
        """Main method, callable from command line.

        Runs pairwise t-tests between labeled sub-groups of a
        tab-delimited data file and streams the resulting p-value table
        to the output file, prefixed by a JSON comment line recording argv.

        Fix: ported Python 2 ``print >> outputFile`` statement (syntax
        error under Python 3) to ``print(..., file=outputFile)``.

        :param argv: Full command-line argument list (argv[0] is the
            program name); options and the two file arguments are parsed
            from it.
        """
        usageStr =  "usage: %prog [options] <inputFile> <outputFile>\n"+\
                    "   <inputFile>    Tab-delimited file of data\n"+\
                    "   <ouputFile>    Tab-delimited file with relational table of t-test p-values for each sub-group pair.  Specify \"-\" to send to stdout.\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-l",
            "--labelCols",
            dest="labelCols",
            help=
            "Comma-separated list of the column headers to label data rows as belonging to different subgroups"
        )
        parser.add_option(
            "-v",
            "--valueCols",
            dest="valueCols",
            help=
            "Comma-separated list of the column headers for data values want to calculate statistics for"
        )
        parser.add_option(
            "-m",
            "--matchCols",
            dest="matchCols",
            help=
            "Comma-separated list of the column headers to match groups on, like row identifiers.  If not exists, then do independent t-tests rather than paired."
        )
        parser.add_option(
            "-b",
            "--baseLabels",
            dest="baseLabels",
            help=
            "Comma-separated list of values that the labelCols should have to represent which base method to compare all other methods to as a reference (otherwise do a full n^2 cartesian product of all combinations)."
        )
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()
        if len(args) > 1:
            inputFile = stdOpen(args[0])
            outputFile = stdOpen(args[1], "w")

            labelCols = options.labelCols.split(",")
            valueCols = options.valueCols.split(",")
            matchCols = None
            if options.matchCols:
                matchCols = options.matchCols.split(",")
            baseLabels = None
            if options.baseLabels:
                baseLabels = options.baseLabels.split(",")

            # Print comment line with arguments to allow for deconstruction later as well as extra results
            summaryData = {
                "argv": argv
            }
            print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)

            # Tab-delimited output formatting
            formatter = TextResultsFormatter(outputFile)

            # Prep generator first, so will be able to extract out relevant header columns
            rowGenerator = self(inputFile, labelCols, valueCols, matchCols,
                                baseLabels)

            # Insert a mock record to get a header / label row
            colNames = self.resultHeaders(labelCols, valueCols, matchCols)
            formatter.formatResultDict(RowItemModel(colNames, colNames),
                                       colNames)

            # Stream the concatenated data rows to the output to avoid storing all in memory
            for outputDict in rowGenerator:
                formatter.formatResultDict(outputDict, colNames)

        else:
            # Require both input and output file arguments
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
Ejemplo n.º 6
0
    # NOTE(review): this fragment sits inside an enclosing per-patient loop
    # that starts before the visible portion of the file; rowData, colNames,
    # lengthOfStay, dailyDNRRate, dnrOccurs, formatter, prog, and ofs are
    # presumably defined there — confirm against the full source.

    # Randomly decide whether/when a DNR order occurs during the stay:
    # each day after admission has an independent dailyDNRRate chance.
    for iDay in range(1, lengthOfStay):
        if (random.random() < dailyDNRRate):
            dnrDay = iDay
            dnrOccurs = True
            break
            # Don't need to keep looking
    rowData["AnyDNRatEnd"] = dnrOccurs + 0

    # Generate daily data
    for iDay in range(lengthOfStay):
        rowData["curr_day"] = rowData["start"] = iDay
        rowData["end"] = iDay + 1
        # Days of data remaining from this day until discharge
        rowData["timeUntilNoMoreData"] = lengthOfStay - rowData["start"]
        rowData["timeUntilNoDataOrDNR"] = rowData["timeUntilNoMoreData"]
        if dnrOccurs:
            # "+ 0" coerces the booleans to 0/1 integers for output
            rowData["AnyDNR.pre"] = (iDay >= dnrDay) + 0
            rowData["AnyDNR.within1day"] = (iDay + 1 == dnrDay) + 0
            rowData["AnyDNR.postTimeDays"] = dnrDay - iDay
            if rowData["AnyDNR.postTimeDays"] < 0:
                # DNR already happened before this day; no future DNR time
                rowData["AnyDNR.postTimeDays"] = "NA"
            else:
                rowData["timeUntilNoDataOrDNR"] = min(
                    rowData["timeUntilNoMoreData"],
                    rowData["AnyDNR.postTimeDays"])
        else:
            # No DNR for this patient: zero indicators, no post-time
            rowData["AnyDNR.pre"] = 0
            rowData["AnyDNR.within1day"] = 0
            rowData["AnyDNR.postTimeDays"] = "NA"
        formatter.formatResultDict(rowData, colNames)
    prog.update()
ofs.close()
Ejemplo n.º 7
0
    def main(self, argv):
        """Main method, callable from command line.

        Computes rank-similarity scores between two tab-delimited files
        of scored/ranked items and writes a single labeled result row
        (plus a header row) to the output, prefixed by a JSON comment
        line recording argv.

        Fixes (Python 3 port):
        - ``print >> outputFile`` statement replaced with
          ``print(..., file=outputFile)``.
        - ``resultDict.keys()`` materialized once as a list instead of
          taking the dict view twice (views are lazy under Python 3).

        :param argv: Full command-line argument list (argv[0] is the
            program name); options and file arguments are parsed from it.
        """
        usageStr =  "usage: %prog [options] <inputFile1> <inputFile2> [<outputFile>]\n"+\
                    "   <inputFile1> Tab-delimited file with columns representing score(s) and item IDs / labels\n"+\
                    "   <inputFile2> Tab-delimited file with columns representing score(s) and item IDs / labels\n"+\
                    "   <outputFile> Tab-delimited file with columns to specify parameters and labeled rank similarity scores.\n"+\
                    "                       Leave blank or specify \"-\" to send to stdout.\n"+\
                    " (See scripts/CDSS/rankSimilarity.py helper script to organize results of multiple queries)\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-i",
            "--idCol1",
            dest="idCol1",
            help=
            "Name of item ID column in input file 1 to join against the other input file."
        )
        parser.add_option(
            "-I",
            "--idCol2",
            dest="idCol2",
            help=
            "Name of item ID column in input file 1 to join against the other input file."
        )

        parser.add_option(
            "-s",
            "--scoreCol1",
            dest="scoreCol1",
            help=
            "Name of score column in input file 1 to sort items by, defining their rank order."
        )
        parser.add_option(
            "-d",
            "--descSort1",
            dest="descSort1",
            action="store_true",
            help=
            "If set, will sort input 1 by scoreCol1 in *descending* order.  (Important because ranking measures prioritize value of top ranks.)"
        )

        parser.add_option(
            "-S",
            "--scoreCol2",
            dest="scoreCol2",
            help=
            "Name of score column in input file 2 to sort items by, defining their rank order."
        )
        parser.add_option(
            "-D",
            "--descSort2",
            dest="descSort2",
            action="store_true",
            help=
            "If set, will sort input 2 by scoreCol2 in *descending* order.  (Important because ranking measures prioritize value of top ranks.)"
        )

        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()
        if len(args) > 0:
            summaryData = {
                "argv": argv
            }

            inputFile1 = stdOpen(args[0])
            inputFile2 = stdOpen(args[1])

            # Run the actual analysis
            resultDict = self(inputFile1, inputFile2, options)

            # Format the results for output; default (None) filename goes to stdout
            outputFilename = None
            if len(args) > 2:
                outputFilename = args[2]
            outputFile = stdOpen(outputFilename, "w")

            # Print comment line with arguments to allow for deconstruction later as well as extra results
            print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)
            formatter = TextResultsFormatter(outputFile)
            # Insert a header row, then the single result row beneath it
            headerCols = list(resultDict.keys())
            formatter.formatResultDict(
                RowItemModel(headerCols, headerCols), headerCols)
            formatter.formatResultDict(resultDict, headerCols)
        else:
            # Require at least the two input file arguments
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)