Example #1
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <sourceFile> [<outputFile>]\n"+\
                    "   <sourceFile>     Source file of text data from data warehouse\n"+\
                    "   <outputFile>    HTML report file with analysis of source\n"+\
                    "                       Leave blank or specify \"-\" to send to stdout.\n"
        parser = OptionParser(usage=usageStr)
        BaseTextAnalysis.addParserOptions(self, parser);
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: "+str.join(" ", argv))
        timer = time.time();
        if len(args) > 0:
            BaseTextAnalysis.parseOptions(self, options);
            
            sourceFile = stdOpen(args[0]);

            # Format the results for output
            outputFilename = None;
            if len(args) > 1:
                outputFilename = args[1];
            outputFile = stdOpen(outputFilename,"w");

            # Print comment line with arguments to allow for deconstruction later as well as extra results
            summaryData = {"argv": argv};
            print("<!-- %s -->" % json.dumps(summaryData), file=outputFile);

            # Run the actual analysis
            self(sourceFile, outputFile);
        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer;
        log.info("%.3f seconds to complete",timer);
def main_prepPatientItems(argv):
    prep = PreparePatientItems();
    for i in range(-9, 10):
        prep.main(["PreparePatientItems",
                   "-S", START_DATE_STR, "-E", END_DATE_STR,
                   "-p", str(DEMOGRAPHICS_CATEGORY_ID), "-c", str(ADMIT_DX_CATEGORY_ID),
                   "-Q", str(QUERY_TIME), "-V", str(VERIFY_TIME),
                   "-o", str(DEATH_ITEM_ID), "-t", "2592000",
                   "%s/patientIds.5year.%s.tab.gz" % (SOURCE_DATA_DIR, i),
                   "%s/firstItems.q%s.v%s.%s.%s.tab.gz" % (SOURCE_DATA_DIR, QUERY_TIME, VERIFY_TIME, BASE_YEAR, i)]);
    
    # Convert to (filtered) Bag of Words
    for i in range(-9, 10):
        prep.main(["PreparePatientItems", "-B", "qvo", "-X", EXCLUDE_CATEGORY_IDS_STR,
                   "%s/firstItems.q%s.v%s.%s.%d.tab.gz" % (SOURCE_DATA_DIR, QUERY_TIME, VERIFY_TIME, BASE_YEAR, i),
                   "%s/firstItems.q%s.v%s.%s.%d.filter.bow.gz" % (SOURCE_DATA_DIR, QUERY_TIME, VERIFY_TIME, BASE_YEAR, i)]);

    # Concatenate batch of files
    ofs = stdOpen("%s/firstItems.q%s.v%s.%s.1234567890.filter.bow.gz" % (SOURCE_DATA_DIR, QUERY_TIME, VERIFY_TIME, BASE_YEAR),"w");
    for i in [1,2,3,4,5,6,7,8,9,0]:
        ifs = stdOpen("%s/firstItems.q%s.v%s.%s.%d.filter.bow.gz" % (SOURCE_DATA_DIR,QUERY_TIME,VERIFY_TIME,BASE_YEAR,i) );
        ofs.write(ifs.read());
        ifs.close();
    ofs.close();
    
    # Merge the tab files' contents, dropping repeated comment lines and keeping only the first header row
    baseIds = [-1,-2,-3,-4,-5];
    ofs = stdOpen("%s/firstItems.q%s.v%s.%s.-12345.tab.gz" % (SOURCE_DATA_DIR,QUERY_TIME,VERIFY_TIME,BASE_YEAR),"w");
    isHeaderRowWritten = False;
    for baseId in baseIds:
        ifs = stdOpen("%s/firstItems.q%s.v%s.%s.%d.tab.gz" % (SOURCE_DATA_DIR,QUERY_TIME,VERIFY_TIME,BASE_YEAR,baseId) );
        for iLine, line in enumerate(ifs):
            if not line.startswith(COMMENT_TAG):    # Skip comment lines
                if line[0].isalpha():   # Starts with a letter/label, must be header row, not data
                    if isHeaderRowWritten:
                        continue;   # Skip text/header rows, except for the first one encountered
                    else:
                        isHeaderRowWritten = True;
                ofs.write(line);
        ifs.close();
    ofs.close();
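The JSON argv comment written into each report above is meant to be recoverable later ("to allow for deconstruction"); a minimal sketch of reading it back, assuming the header is the report's first line and uses the <!-- ... --> form printed in Example #1:

import json

def parseArgvHeader(reportFile):
    """Recover the argv summary from a report's leading <!-- ... --> comment line."""
    firstLine = reportFile.readline().strip()
    if firstLine.startswith("<!--") and firstLine.endswith("-->"):
        return json.loads(firstLine[len("<!--"):-len("-->")])
    return None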
Example #3
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog <inputFile> <outputFile>\n"+\
                    "   <inputFile>     Tab-delimited input file taken from schedule Excel file. Example data format as seen in test case examples. See support/extractExcelSheets.py for help on pulling out Excel sheets into tab-delimited data files.\n"+\
                    "   <outputFile>    File to output results to.  Designate '-' for stdout.";
        parser = OptionParser(usage=usageStr)
        parser.add_option("-i", "--providerIdFilename",  dest="providerIdFilename", help="Name of provider ID CSV file. If provided, then add column for prov_id based on resident first_name and last_name, match within first "+DEFAULT_INDEX_PREFIX_LENGTH+" characters, or generate ID value if no match found");
        parser.add_option("-y", "--baseYear",  dest="baseYear", help="Year expect dates to start in.");
        parser.add_option("-t", "--changeTime",  dest="changeTime", default=CHANGE_TIME, help="Hour of day that count as delimiter between rotations. Likely should NOT be midnight = 0, because night shifts span midnight. Default to 7 = 7am.");
        (options, args) = parser.parse_args(argv[1:])

        if len(args) >= 2 and options.baseYear:
            log.info("Starting: "+str.join(" ", argv))
            timer = time.time();

            baseYear = int(options.baseYear);

            if options.providerIdFilename is not None:
                providerReader = csv.DictReader(open(options.providerIdFilename));
                self.loadProviderModels( providerReader );

            inFile = stdOpen(args[0]);
            scheduleItems = self.parseScheduleItems(inFile, baseYear);

            outFile = stdOpen(args[1],"w");
            formatter = TextResultsFormatter(outFile);
            formatter.formatResultDicts(scheduleItems);
        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer;
        log.info("%.3f seconds to complete",timer);
def main_formatMergedTTests(argv):
    ifs = stdOpen(BASE_RESULT_DIR+CONCATENATE_FILENAME);
    ofs = stdOpen(BASE_RESULT_DIR+FILTERED_FILENAME, "w");

    summaryData = {"argv": argv};
    print(COMMENT_TAG, json.dumps(summaryData), file=ofs);

    outputCols = ["SortType","TopicCount","VerifyTime","Group1.precision.mean","Group1.recall.mean","Group1.normalprecision.mean","Group1.weightrecall.mean","Group1.roc_auc.mean","ttest_rel.precision","ttest_rel.recall","ttest_rel.weightrecall","ttest_rel.roc_auc","Group1.numqueryitems.mean","Group1.numverifyitems.mean","Group1.numrecommendeditems.mean","Group1.tp.mean"];
    formatter = TextResultsFormatter(ofs);
    formatter.formatTuple(outputCols);  # Output header row

    reader = TabDictReader(ifs);
    for row in reader:
        row["SortType"] = row["Group1._s"];

        # Extract out numerical data from filename text parameters
        row["TopicCount"] = None;
        if row["Group1._m"] != 'None':
            # Expecting model name strings of the form: "models/topicModel.first24hourItems.2013.1234567890.filter.bow.gz.64Topic.model"
            topicChunk = row["Group1._m"].split(".")[-2];   # Expect second to last period-delimited chunk to contain topic count
            topicChunk = topicChunk[:topicChunk.find("Topic")]; # Remove trailing Topic text
            row["TopicCount"] = int(topicChunk);

        # Expecting result file name argument of the form: "results/byOrderSets/01minutes/filteredResults.tab.gz"
        timeChunk = row["args[0]"].split("/")[-2];
        timeChunk = timeChunk[:timeChunk.find("minutes")];
        row["VerifyTime"] = int(timeChunk);

        formatter.formatResultDict(row, outputCols);

    ifs.close();
    ofs.close();
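A quick worked example of the filename parsing above, using the model name format quoted in the comment:

modelName = "models/topicModel.first24hourItems.2013.1234567890.filter.bow.gz.64Topic.model"
topicChunk = modelName.split(".")[-2]                    # "64Topic"
topicCount = int(topicChunk[:topicChunk.find("Topic")])  # 64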
Example #5
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <inputFile> [<outputFile>]\n"+\
                    "   <inputFile>    Validation file in prepared result file format use generated LDA models to predict items and compare against verify sets similar to RecommendationClassficationAnalysis. \n"+\
                    "   <outputFile>   Validation result stat summaries.\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option("-M", "--modelFile",  dest="modelFile", help="Name of the file to load an LDA or HDP model and topic word document counts from.");
        parser.add_option("-X", "--excludeCategoryIds",  dest="excludeCategoryIds", help="For recommendation, exclude / skip any items who fall under one of these comma-separated category Ids.");
        parser.add_option("-i", "--itemsPerCluster",  dest="itemsPerCluster", default=DEFAULT_TOPIC_ITEM_COUNT, help="Specify number of top topic items to consider when scoring recommendations.");
        parser.add_option("-m", "--minClusterWeight",  dest="minClusterWeight", default=DEFAULT_MIN_TOPIC_WEIGHT, help="When scoring recommendations, skip any topics with less than this relation weight (effectively scores as zero, but can avoid a lot of low yield calculations).");
        parser.add_option("-s", "--sortField",  dest="sortField", default=DEFAULT_SORT_FIELD, help="Score field to sort top recommendations by.  Default to posterior probabilty 'totelItemWeight', but can also select 'lift' = 'tfidf' = 'interest' for TF*IDF style score weighting.");
        parser.add_option("-r", "--numRecs",   dest="numRecs",  default=DEFAULT_RECOMMENDED_ITEM_COUNT, help="Number of orders / items to recommend for comparison against the verification set. Alternative set option numRecsByOrderSet to look for key order set usage and size.");
        parser.add_option("-O", "--numRecsByOrderSet",   dest="numRecsByOrderSet", action="store_true", help="If set, then look for an order_set_id column to find the key order set that triggered the evaluation time point to determine number of recommendations to consider.");
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: "+str.join(" ", argv))
        timer = time.time();
        if len(args) >= 1:
            query = AnalysisQuery();
            query.preparedPatientItemFile = stdOpen(args[0]);
            query.recommender = TopicModelRecommender(options.modelFile);
            query.baseRecQuery = RecommenderQuery();
            if options.excludeCategoryIds is not None:
                query.baseRecQuery.excludeCategoryIds = set();
                for categoryIdStr in options.excludeCategoryIds.split(","):
                    query.baseRecQuery.excludeCategoryIds.add(int(categoryIdStr));
            else:   # Default exclusions if none specified
                query.baseRecQuery.excludeCategoryIds = query.recommender.defaultExcludedClinicalItemCategoryIds();
                query.baseRecQuery.excludeItemIds = query.recommender.defaultExcludedClinicalItemIds();
            query.baseRecQuery.itemsPerCluster = int(options.itemsPerCluster);
            query.baseRecQuery.minClusterWeight = float(options.minClusterWeight);

            query.baseRecQuery.sortField = options.sortField;
            query.numRecommendations = int(options.numRecs);
            query.numRecsByOrderSet = options.numRecsByOrderSet;

            # Run the actual analysis
            analysisResults = self(query);

            # Format the results for output
            outputFilename = None;
            if len(args) > 1:
                outputFilename = args[1];
            outputFile = stdOpen(outputFilename,"w");

            # Print comment line with analysis arguments to allow for deconstruction later
            summaryData = {"argv": argv};
            print(COMMENT_TAG, json.dumps(summaryData), file=outputFile);

            formatter = TextResultsFormatter( outputFile );
            colNames = self.resultHeaders(query);
            formatter.formatTuple( colNames );  # Insert a mock record to get a header / label row
            formatter.formatResultDicts( analysisResults, colNames );
        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer;
        log.info("%.3f seconds to complete",timer);
Example #6
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <inputFile1> <inputFile2> ... <inputFileN>\n"+\
                    "   <inputFileX>    Tab-delimited file of data.  Initial comment lines will be scanned for list of argv parameters to add as data columns.\n"+\
                    "                   If only a single input is given, interpret this as an index file which lists the names of the other files to concatenate (e.g., obtained with dir * /b or ls).\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-o",
            "--outputFile",
            dest="outputFile",
            help=
            "Tab-delimited file matching concatenated contents of input files.  Specify \"-\" to send to stdout."
        )
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()
        if len(args) > 0:
            inputFiles = list()
            if len(args) > 1:
                for inputFilename in args:
                    inputFiles.append(stdOpen(inputFilename))
            else:  # len(args) == 1, single index file rather than list of all files on command-line
                indexFile = stdOpen(args[0])
                for line in indexFile:
                    inputFilename = line.strip()
                    inputFiles.append(stdOpen(inputFilename))

            # Format the results for output
            outputFile = stdOpen(options.outputFile, "w")

            # Print comment line with arguments to allow for deconstruction later as well as extra results
            summaryData = {
                "argv": argv
            }
            print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)

            # Tab-delimited output formatting
            formatter = TextResultsFormatter(outputFile)

            # Begin the file parsing so can at least get the total list of column headers
            rowGenerator = self(inputFiles)
            firstRow = next(rowGenerator)

            # Insert a mock record to get a header / label row
            colNames = self.resultHeaders()
            formatter.formatTuple(colNames)

            # Stream the concatenated data rows to the output to avoid storing all in memory
            formatter.formatResultDict(firstRow, colNames)
            for outputDict in rowGenerator:
                formatter.formatResultDict(outputDict, colNames)

        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
Example #7
def main(argv=None):
    timer = time.time()

    # Final columns to output to patient matrix
    colNames = list()

    patientById = parsePatientFile(stdOpen("patients.tab"), colNames)

    labsByBaseNameByPatientId = parseLabResultsFile(stdOpen("labs.tab"))
    addLabFeatures(labsByBaseNameByPatientId, patientById, colNames,
                   INDEX_ITEM_BASE_NAME, LAB_BASE_NAMES, LAB_PRE_TIME,
                   LAB_POST_TIME)

    log.info(
        "Record presence of items in terms of relative time to each item from index time"
    )
    itemTimesByPatientId = parseClinicalItemFile(stdOpen("admitDx.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames,
                            "ICD9.208-AdmitDx")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("problemListDx.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames,
                            "ICD9.208-ProblemListDx")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("feSO4Rx.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames,
                            "ironSO4")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("allEnteralIron.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames,
                            "ironEnteral")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("ironIV.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames,
                            "ironIV")

    itemTimesByPatientId = parseClinicalItemFile(
        stdOpen("outpatientIronRx.tab"),
        patientIdCol="pat_id",
        timeCol="ordering_date")
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames,
                            "ironOutpatient")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("transfusions.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames,
                            "RBCTransfusion")

    patientResults = filterPatients(patientById)

    log.info("Output feature matrix file with row per patient")
    featureMatrixFile = stdOpen("featureMatrix.lab14to1day.tab", "w")
    formatter = TextResultsFormatter(featureMatrixFile)
    formatter.formatResultDicts(patientResults, colNames, addHeaderRow=True)

    timer = time.time() - timer
    print("%.3f seconds to complete" % timer, file=sys.stderr)
Example #8
def main(argv):
    timer = time.time()

    infile = stdOpen(argv[1])
    outfile = stdOpen(argv[2], "w")

    summaryData = {
        "argv": argv
    }
    print(COMMENT_TAG, json.dumps(summaryData), file=outfile)

    df = pd.read_csv(infile, na_values=[NULL_STRING])
    # Use not-abnormal as outcome of interest. Should be same as all_result_normal, but some labs not labeled
    df["normal"] = 1 - df["abnormal"]

    # Prepare output dataframe skeleton
    resultDF = pd.DataFrame()
    nRows = len(df)
    floatNRows = float(nRows)  # Facilitate subsequent floating point division
    for iRow in range(nRows):
        topK = iRow + 1  # Top K items considered
        topKPercent = topK / floatNRows  # Top percentage of all items considered
        resultDF.at[iRow, "iRow"] = iRow
        resultDF.at[iRow, "Top K"] = topK
        resultDF.at[iRow, "Top K %"] = topKPercent

    for col in df.columns:
        if col not in labelCols and col not in resultCols:
            # Any leftover column should be a predicted test result / score, correlated with the outcome column
            scoreCol = col
            print(scoreCol, file=sys.stderr)
            scoreResultCol = scoreCol  #+".precisionAtK";
            if scoreResultCol.startswith("predictedTest."):
                scoreResultCol = scoreResultCol[len("predictedTest."):]  # Clean up (trim off) name prefixes
            df.sort_values(scoreCol, ascending=False, inplace=True)  # Descending sort by the score column

            countNormal = 0.0
            countAll = 0
            iRow = 0
            for index, row in df.iterrows():
                countAll += 1
                countNormal += row[outcomeCol]
                precisionAtK = countNormal / countAll
                #print >> sys.stderr, precisionAtK, row[[outcomeCol,scoreCol]];
                resultDF.at[iRow, scoreResultCol] = precisionAtK
                iRow += 1

    print("output", file=sys.stderr)
    resultDF.to_csv(outfile)

    return df
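The row loop above computes precision-at-K: the running fraction of normal outcomes among the top K scored items. A vectorized sketch of the same quantity, with illustrative data and a 0/1 outcome column assumed:

import numpy as np
import pandas as pd

scores = pd.Series([0.9, 0.8, 0.6, 0.4])
outcomes = pd.Series([1, 0, 1, 1])  # 1 = normal result

order = scores.sort_values(ascending=False).index  # rank rows by descending score
hits = outcomes.loc[order].to_numpy()
precisionAtK = np.cumsum(hits) / np.arange(1, len(hits) + 1)
# precisionAtK -> [1.0, 0.5, 0.667, 0.75]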
Example #9
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <inputFile1> <inputFile2> ... <inputFileN>\n"+\
                    "   <inputFileX>    Tab-delimited files of data, should have a key column with a unique identifier to merge across files.\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-c",
            "--keyList",
            dest="keyList",
            help=
            "Comma-separated list of column identifiers to find in the input files to know what to merge on."
        )
        parser.add_option(
            "-s",
            "--suffixList",
            dest="suffixList",
            help=
            "Comma-separated list of suffixes to add to non-key column names in common across merged files"
        )
        parser.add_option(
            "-o",
            "--outputFile",
            dest="outputFile",
            help=
            "Tab-delimited file containing merged contents of input files.  Specify \"-\" to send to stdout."
        )
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting %s: %s" % (os.getpid(), str.join(" ", argv)))
        timer = time.time()
        if len(args) > 1:
            inputFiles = list()
            for inputFilename in args:
                inputFiles.append(stdOpen(inputFilename))

            keyList = options.keyList.split(",")
            suffixList = options.suffixList.split(",")

            # Format the results for output
            outputFile = stdOpen(options.outputFile, "w")

            # Print comment line with arguments to allow for deconstruction later as well as extra results
            summaryData = {
                "argv": argv
            }
            print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)

            self(inputFiles, keyList, suffixList, outputFile)

        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
def main_formatResults(argv):
    ifs = stdOpen(BASE_RESULT_DIR + FILTERED_FILENAME)
    ofs = stdOpen(BASE_RESULT_DIR + FORMATTED_FILENAME, "w")

    summaryData = {
        "argv": argv
    }
    print(COMMENT_TAG, json.dumps(summaryData), file=ofs)

    outputCols = [
        "SortType", "TopicCount", "TrainTime", "VerifyTime", "precision",
        "recall", "normalprecision", "weightrecall", "roc_auc"
    ]
    formatter = TextResultsFormatter(ofs)
    formatter.formatTuple(outputCols)  # Output header row

    reader = TabDictReader(ifs)
    for row in reader:
        row["SortType"] = row["_s"]

        # Extract out numerical data from filename text parameters
        row["TopicCount"] = None
        row["TrainTime"] = None
        if row["_m"] != 'None':
            # Expecting model name strings of the form: "models/topicModel.firstItems.q14400.v14400.2013.1234567890.filter.bow.gz.16Topic.model"
            chunks = row["_m"].split(".")
            topicChunk = chunks[-2]
            # Expect second to last period-delimited chunk to contain topic count
            topicChunk = topicChunk[:topicChunk.find("Topic")]
            # Remove trailing Topic text
            row["TopicCount"] = int(topicChunk)

            for chunk in chunks:
                if chunk[0] == "q" and chunk[-1].isdigit():  # This should be the query time in seconds
                    queryTimeSeconds = int(chunk[1:])
                    queryTimeMinutes = queryTimeSeconds // 60
                    row["TrainTime"] = queryTimeMinutes

        # Expecting training file name argument of the form: "sourceData/first24hourOrderSets.2013.q86400.v14400.-12345.tab.gz"
        row["VerifyTime"] = None
        for chunk in row["args_0_"].split("."):
            if chunk[0] == "v" and chunk[-1].isdigit(
            ):  # This should be the verify time in seconds
                verifyTimeSeconds = int(chunk[1:])
                verifyTimeMinutes = verifyTimeSeconds / 60
                row["VerifyTime"] = verifyTimeMinutes

        formatter.formatResultDict(row, outputCols)

    ifs.close()
    ofs.close()
Example #11
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <trainFile> <testFile> [<outputFile>]\n"+\
                    "   <trainFile> Tab-delimited file, queryItemIdsJSON expected to be parseable into lists of query items as well as an outcome.X column\n"+\
                    "   <testFile> Same structure as trainFile, but with test cases to assess prediction scoring\n"+\
                    "   <outputFile>    Tab-delimited that can be used for ROC analysis with columns for outcome and predicted score\n"+\
                    ""
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-o",
            "--outcomeItemId",
            dest="outcomeItemId",
            help="Outcome item IDs to assess get prediction scores for")

        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()
        if len(args) >= 2:
            trainFile = stdOpen(args[0])
            testFile = stdOpen(args[1])

            outcomeId = int(options.outcomeItemId)

            # Run the actual analysis
            (featureMatrix, outcomeMatrix, queryIds,
             rowModels) = self.fileToMatrixes(trainFile, outcomeId)
            model = self.train(featureMatrix, outcomeMatrix)
            analysisResults = self.predict(testFile, model, queryIds,
                                           outcomeId)

            # Format the results for output
            outputFilename = None
            if len(args) > 2:
                outputFilename = args[2]
            outputFile = stdOpen(outputFilename, "w")

            # Print comment line with arguments to allow for deconstruction later as well as extra results
            print(COMMENT_TAG, json.dumps({"argv": argv}), file=outputFile)

            colNames = self.analysisHeaders(outcomeId)
            analysisResults.insert(0, RowItemModel(colNames, colNames))  # Insert a mock record to get a header / label row

            formatter = TextResultsFormatter(outputFile)
            formatter.formatResultDicts(analysisResults, colNames)

        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
Example #12
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <inputFile> [<outputFile>]\n"+\
                    "   <inputFile> Tab-delimited file, first two labeled columns expected to represent labeled outcome (0 and non-zero) and score/probability of outcome\n"+\
                    "   <outputFile>    Tab-delimited table specifying score histogram bin widths, total cases, predicted events, actual events\n"+\
                    "                       Leave blank or specify \"-\" to send to stdout.\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option("-b", "--bins",  dest="nBins",  default=10,    help="Number of bins to separate scores into, defaults to deciles (10)");
        parser.add_option("-f", "--figure",  dest="figure",  help="If set, will also try to auto-generate an example figure and store to a file here");

        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: "+str.join(" ", argv))
        timer = time.time();
        if len(args) >= 1:
            inputFilename = args[0];
            inputFile = stdOpen(inputFilename);
            
            # Run the actual analysis
            analysisResults = self(inputFile, int(options.nBins));
            
            (hlStat, degFreedom, hlP) = self.calculateHosmerLemeshow(analysisResults);
            
            # Generate plot figure
            if options.figure is not None:
                self.generateFigure(analysisResults, options.figure);

            # Format the results for output
            outputFilename = None;
            if len(args) > 1:
                outputFilename = args[1];
            outputFile = stdOpen(outputFilename,"w");
            
            # Print comment line with arguments to allow for deconstruction later as well as extra results
            print(COMMENT_TAG, json.dumps({"argv":argv, "P-HosmerLemeshow": hlP}), file=outputFile);

            colNames = self.analysisHeaders();
            analysisResults.insert(0, RowItemModel(colNames,colNames) );    # Insert a mock record to get a header / label row
            
            formatter = TextResultsFormatter( outputFile );
            formatter.formatResultDicts( analysisResults, colNames );

        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer;
        log.info("%.3f seconds to complete",timer);
Example #13
    def loadUpdateBufferFromFile(self, filename):
        updateBuffer = None;
        try:
            #print >> sys.stderr, filename
            log.info("Loading: %s" % filename);
            ifs = stdOpen(filename, "r")
            updateBuffer = json.load(ifs)
            updateBuffer["analyzedPatientItemIds"] = set(updateBuffer["analyzedPatientItemIds"])
            ifs.close()
        except IOError as exc:
            # Apparently could not find the named filename. See if instead it's a prefix
            #    for a series of enumerated files and then merge them into one mass buffer
            dirname = os.path.dirname(filename);
            if dirname == "": dirname = ".";    # Implicitly the current working directory
            basename = os.path.basename(filename);
            for nextFilename in os.listdir(dirname):
                if nextFilename.startswith(basename):
                    nextFilepath = os.path.join(dirname, nextFilename);
                    nextUpdateBuffer = self.loadUpdateBufferFromFile(nextFilepath);
                    if updateBuffer is None:    # First update buffer, use it as base
                        updateBuffer = nextUpdateBuffer;
                    else:    # Have existing update buffer. Just update its contents with the next one
                        updateBuffer = self.mergeBuffers(updateBuffer, nextUpdateBuffer);
                        del nextUpdateBuffer;   # Make sure memory gets reclaimed

        return updateBuffer;
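mergeBuffers is referenced above but not shown in this excerpt; a plausible sketch, assuming the buffer is a JSON-style dict holding the analyzedPatientItemIds set plus numeric counters (the real implementation may differ):

    def mergeBuffers(self, updateBuffer, nextUpdateBuffer):
        # Hypothetical merge: union the analyzed item ID sets and sum any numeric counters
        updateBuffer["analyzedPatientItemIds"].update(nextUpdateBuffer["analyzedPatientItemIds"])
        for key, value in nextUpdateBuffer.items():
            if key != "analyzedPatientItemIds" and isinstance(value, (int, float)):
                updateBuffer[key] = updateBuffer.get(key, 0) + value
        return updateBuffer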
Example #14
    def saveBufferToFile(self, filename, updateBuffer):
        ofs = stdOpen(filename, "w");
        updateBuffer["analyzedPatientItemIds"] = list(updateBuffer["analyzedPatientItemIds"]);
        json.dump(updateBuffer, ofs);
        ofs.close();

        # Start a fresh buffer to reflect incremental changes already persisted, so any new ones are recorded fresh.
        # Return it so the caller can rebind; reassigning the parameter alone would not affect the caller's reference.
        return self.makeUpdateBuffer(updateBuffer);
Example #15
def main_mergeTTestResults(argv):
    mod = ConcatenateDataFiles();
    fileListFile = stdOpen(BASE_RESULT_DIR+FILELIST_FILENAME, "w");
    for resultDir in RESULT_DIRS:
        print(resultDir+TTEST_FILENAME, file=fileListFile);
    fileListFile.close();
    subargv = ["ConcatenateDataFiles.py","-o", BASE_RESULT_DIR+CONCATENATE_FILENAME];
    subargv.append(BASE_RESULT_DIR+FILELIST_FILENAME);
    mod.main(subargv);
Example #16
def main(argv):
    timer = time.time()
    inFile = stdOpen(argv[1])
    outFile = stdOpen(argv[2], "w")
    df = pd.read_table(inFile, parse_dates=DATE_COLS)

    df = binarizeGender(df)
    df = binarizeRace(df, RACE_OPTIONS, DEFAULT_RACE)
    df = binarizePeriod(df)
    df["hasDrugScreens"] = (df["nDrugScreens"] > 0) * 1
    # Convert to binary outcome measure

    df.to_csv(outFile, sep="\t", index=False)

    elapsed = time.time() - timer
    print >> sys.stderr, "%s seconds to complete" % timedelta(
        0, round(elapsed))

    return df
Example #17
	def main(self, argv):
		"""Main method, callable from command line"""
		usageStr =  "usage: %prog [options] <patientIds>\n"+\
					"   <patientIds>    Comma-separated list of patient IDs to run the analysis on, or use option to specify a file.\n"
		parser = OptionParser(usage=usageStr)
		parser.add_option("-i", "--idFile", dest="idFile", help="If provided, look for patient IDs in then named file, one ID per line, in the format '/Users/Muthu/Desktop/JonathanChen/patientlist.txt'")
		parser.add_option("-s", "--startDate", dest="startDate", metavar="<startDate>",  help="Date string (e.g., 2011-12-15), must be provided, will start analysis on items occuring on or after this date.");
		parser.add_option("-e", "--endDate", dest="endDate", metavar="<endDate>",  help="Date string (e.g., 2011-12-15), must be provided, will stop analysis on items occuring before this date.");
		parser.add_option("-w", "--window", type="int", dest="window", metavar="<window>",  help="Window integer (e.g., 36), (unit is deltas, i.e. a window of 36 and a delta of 4 weeks means that after 36 x4 weeks, the data is decayed ~1/e ~ 0.37). More precisely, the window x delta is how long it will take for the data to decay to 38 percent of its original worth. Higher delta means it takes longer to decay. This number must be provided.");
		parser.add_option("-d", "--delta", type="int", dest="delta", metavar="<delta>",  help="Delta integer (e.g., 4), (unit of time is weeks, defaults to 4 weeks), define in what increments do you want to read in the data. After each increment/delta, it performs a decay.");
		parser.add_option("-a", "--associationsPerCommit", type="int", dest="associationsPerCommit", help="If provided, will commit incremental analysis results to the database when accrue this many association items.  Can help to avoid allowing accrual of too much buffered items whose runtime memory will exceed the 32bit 2GB program limit.")
		parser.add_option("-u", "--itemsPerUpdate", type="int", dest="itemsPerUpdate", help="If provided, when updating patient_item analyze_dates, will only update this many items at a time to avoid overloading MySQL query.")
		parser.add_option("-o", "--outputFile", dest="outputFile", help="If provided, send buffer to output file rather than commiting to database")
		(options, args) = parser.parse_args(argv[1:])

		decayAnalysisOptions = DecayAnalysisOptions()

		log.debug("starting process");

		#quit early if required parameters are missing
		if options.startDate is None or options.endDate is None or not options.window:
			parser.print_help()
			sys.exit(0)

		#set start and end dates, window length, and decay increment (delta)
		decayAnalysisOptions.startD = datetime.strptime(options.startDate, DATE_FORMAT) #makes a datetime object for the start and end date
		decayAnalysisOptions.endD = datetime.strptime(options.endDate, DATE_FORMAT)
		decayAnalysisOptions.windowLength = options.window #how many deltas in your window
		if options.delta is not None:
			decayAnalysisOptions.delta = timedelta(weeks=options.delta) #length of one decay increment
		if options.associationsPerCommit is not None:
			decayAnalysisOptions.associationsPerCommit = options.associationsPerCommit
		if options.itemsPerUpdate is not None:
			decayAnalysisOptions.itemsPerUpdate = options.itemsPerUpdate

		if options.outputFile is not None:
			decayAnalysisOptions.outputFile = options.outputFile

		#set patientIds based on either a file input or args
		decayAnalysisOptions.patientIds = list()
		if len(args) > 0:
			decayAnalysisOptions.patientIds.extend(args[0].split(","))
		if options.idFile is not None:
			idFile = stdOpen(options.idFile)
			for line in idFile:
				decayAnalysisOptions.patientIds.append(line.strip())
		

		#quit if no patient IDs were provided
		if len(decayAnalysisOptions.patientIds) == 0:
			parser.print_help()
			sys.exit(0)


		log.debug("global start and end date");
		log.debug(decayAnalysisOptions.startD, decayAnalysisOptions.endD, decayAnalysisOptions.windowLength);
		self.decayAnalyzePatientItems(decayAnalysisOptions)
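Given the decay semantics described in the -w help text (data decays to ~1/e of its weight after window deltas), the implied per-delta multiplier is exp(-1/window); a quick sanity check of that arithmetic:

import math

window = 36                      # deltas per decay window, as in the -w example
perDeltaFactor = math.exp(-1.0 / window)
print(perDeltaFactor ** window)  # ~0.3679, i.e. ~1/e after window deltas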
Example #18
def main_concatenate(argv):
    mod = ConcatenateDataFiles();
    for resultDir in RESULT_DIRS:
        fileListFile = stdOpen(resultDir+FILELIST_FILENAME, "w");
        for filename in os.listdir(resultDir):
            if filename.startswith(RESULT_BASENAME):  
                print(resultDir+filename, file=fileListFile);
        fileListFile.close();
        subargv = ["ConcatenateDataFiles.py","-o",resultDir+CONCATENATE_FILENAME];
        subargv.append(resultDir+FILELIST_FILENAME);
        mod.main(subargv);
Example #19
def parsePatientFile(patientFile, colNames):
    log.info("Parse patient file");
    patientFile = stdOpen("patients.tab");
    patientById = dict();
    for patient in TabDictReader(patientFile):
        patientId = int(patient["patient_id"]);
        patient["patient_id"] = patientId;
        patientById[patientId] = patient;

    colNames.extend(["patient_id","dialysis","surgery"]);
    return patientById;    
Example #20
    def loadDocCountByWordId(self, filename):
        """Given the name of a top topics file,
        load the section reporting the overall word document counts
        """
        docCountByWordId = dict()
        reader = TabDictReader(stdOpen(filename))
        for topicItem in reader:
            if topicItem["topic_id"] == NULL_STRING:  # All document section, not topic specific
                itemId = None
                if topicItem["item_id"] != NULL_STRING:
                    itemId = int(topicItem["item_id"])
                docCount = int(topicItem["score"])
                docCountByWordId[itemId] = docCount
        return docCountByWordId
Example #21
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <inputFile> <outputFile>\n"+\
                    "   <inputFile>    Tab-delimited file of relational data. Specify \"-\" to read from stdin.\n"+\
                    "   <ouputFile>    Tab-delimited file relational query data results.  Specify \"-\" to send to stdout.\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-q",
            "--sqlQuery",
            dest="sqlQuery",
            help=
            "SQL Query to execute on the input data file/table. Use default tablename '%s' in query."
            % DEFAULT_TABLENAME)
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()
        if len(args) > 1:
            inputFile = stdOpen(args[0])
            outputFile = stdOpen(args[1], "w")
            sqlQuery = options.sqlQuery

            # Print comment line with arguments to allow for deconstruction later as well as extra results
            summaryData = {
                "argv": argv
            }
            print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)

            # Primary execution to load inputFile, run query, then drop results into outputFile
            dataFrame = self(sqlQuery, inputFile, outputFile)
        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
Example #22
def main(argv):
    timer = time.time()

    patientById = queryPatients(stdOpen("patients.tab", "w"))
    #queryLabResults(stdOpen("labs.tab","w"), patientById); # Time intensive, ~20 minutes

    queryClinicalItems(stdOpen("transfusions.tab", "w"), (3648, ), patientById)
    # RBC Transfusions
    queryClinicalItems(stdOpen("problemListDx.tab", "w"),
                       (14568, 14606, 14847, 20636), patientById)
    # Iron Def Anemia Problem List
    queryClinicalItems(stdOpen("admitDx.tab", "w"), (17172, 20125, 21873),
                       patientById)
    # Iron Def Admission Diagnosis

    ##################################################
    # Iron prescription ID notes

    # Mostly oral supplements and vitamins, dominated by ferrous sulfate
    ironSulfateItemIds = (34, 1044, 1047, 1280, 1386)
    # Mostly first one for PO route, others are smattering of feeding tube route
    # Dominated by first PO Multi-Vitamin (2800 vs 90 for second)
    otherEnteralIronItemIds = (83, 208, 349, 732, 1188, 1376, 1460, 1707, 1768,
                               1996, 2000, 2085, 2140, 2162, 2322, 2569, 2855,
                               3124, 3130, 3234, 3241, 3242, 3305, 3309, 3367,
                               3380, 3384, 3414, 3532)
    allEnteralIronItemIds = set(ironSulfateItemIds).union(
        otherEnteralIronItemIds)
    # IV iron formulations
    ivIronClinicalItemIds = (893, 1129, 720, 1304, 1490, 3403)

    queryClinicalItems(stdOpen("feSO4Rx.tab", "w"), ironSulfateItemIds,
                       patientById)
    # FeSO4 (Enteral, primarily PO)
    queryClinicalItems(stdOpen("allEnteralIron.tab", "w"),
                       allEnteralIronItemIds, patientById)
    # All Enteral Iron formulations, including FeSO4, FeGluconate, and assorted MVI, etc.
    queryClinicalItems(stdOpen("ironIV.tab", "w"), ivIronClinicalItemIds,
                       patientById)
    # IV Iron (sucrose, dextran, gluconate, etc.)

    queryOutpatientIronRx(stdOpen("outpatientIronRx.tab", "w"), patientById)

    timer = time.time() - timer
    print >> sys.stderr, "%.3f seconds to complete" % timer
Example #23
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "Query for the clinical_item records that exist with the specified criteria\n"+\
                    "usage: %prog [options] [<outputFile>]\n"+\
                    "   <outputFile>    Results file. Leave blank or specify \"-\" to send to stdout.\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-i",
            "--itemPrefix",
            dest="itemPrefix",
            help=
            "Look for clinical_items whose description starts with this prefix."
        )
        parser.add_option(
            "-c",
            "--categoryNames",
            dest="categoryNames",
            help=
            "Comma separated list of clinical_item_category.descriptions to look for."
        )
        parser.add_option(
            "-p",
            "--pauseSeconds",
            dest="pauseSeconds",
            default="0",
            help="Number of seconds to pause between processing each record.")
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()
        if len(args) > 0:
            outputFile = stdOpen(args[0], "w")

            # Print comment line with arguments to allow for deconstruction later as well as extra results
            summaryData = {
                "argv": argv
            }
            print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)

            self.queryItems(options, outputFile)

        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
Example #24
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <patientIds>\n"+\
                    "   <patientIds>    Patient ID file, or comma-separated list of patient IDs.\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-s",
            "--itemIdSequence",
            dest="itemIdSequence",
            help=
            "Comma-separated sequence of item IDs to look for as representing the end of a triple of interest."
        )
        parser.add_option(
            "-v",
            "--virtualItemId",
            dest="virtualItemId",
            help=
            "ID of virtual clinical item to record against if find a specified triple."
        )
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()

        patientIds = set()
        patientIdsParam = args[0]
        try:
            # Try to open patient IDs as a file
            patientIdFile = stdOpen(patientIdsParam)
            patientIds.update(patientIdFile.read().split())
        except IOError:
            # Unable to open as a filename, then interpret as simple comma-separated list
            patientIds.update(patientIdsParam.split(","))

        itemIdSequence = [
            int(idStr) for idStr in options.itemIdSequence.split(",")
        ]
        virtualItemId = int(options.virtualItemId)

        self.analyzePatientItems(patientIds, itemIdSequence, virtualItemId)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
Example #25
    def transform_STRIDE_source_table(stride_source_table):
        # Get module for doing data conversion.
        transformer = ClinicalItemDataLoader.STRIDE_TABLE_TRANSFORMER_MAP[
            stride_source_table]

        # Build command.
        if stride_source_table == 'stride_patient':
            argv = ['python', '-m', transformer]
        elif stride_source_table == 'stride_preadmit_med':
            argv = ['python', '-m', transformer, '-m', '5', '-s', '2008-01-01']
        elif stride_source_table == 'stride_order_med':
            argv = [
                'python', '-m', transformer, '-m', '5', '-d', '5', '-s',
                '2008-01-01'
            ]
        elif stride_source_table == 'stride_treatment_team':
            argv = ['python', '-m', transformer, '-a', '-s', '2008-01-01']
        else:
            argv = ['python', '-m', transformer, '-a', '-s', '2008-01-01']

        # Call command.
        log_file = stdOpen('%s.log' % ('_'.join(argv)), 'w')
        subprocess.call(argv, stderr=log_file)
Example #26
    'FERRITIN': 'FERRITIN(FER)',
    'TBIL': 'TOTAL BILIRUBIN(TBIL)',
    'WBC': 'WBC(WBC)',
    'CR': 'CREATININE, SER/PLAS(CR)',
    'HAPTO': 'HAPTOGLOBIN(HAP)',
    'MCV': 'MCV(MCV)',
    'RETICAB': 'RETIC, ABS(RETABS)',
    'HGB': 'HGB(CALC), ISTAT',
    'YSTFRR': 'SOL TRANSFERR REC',
    'TRFSAT': 'TRANSFERRIN SAT',
    'FE': 'IRON, TOTAL'
}

timer = time.time()

featureMatrixFile = stdOpen("featureMatrix.tab")

log.info("Parse feature matrix file")
patientById = dict()
for patient in TabDictReader(featureMatrixFile):
    patientId = int(patient["patient_id"])
    patient["patient_id"] = patientId
    for labBaseName in labBaseNames:
        if patient[labBaseName] == NULL_STRING:
            patient[labBaseName] = None
        else:
            patient[labBaseName] = float(patient[labBaseName])
    patientById[patientId] = patient

log.info("Create plots against each metric against the index lab")
for labBaseName in labBaseNames:
Example #27
def main(argv=None):
    timer = time.time()

    extractor = DataExtractor()

    # Output file
    featureMatrixFile = stdOpen("featureMatrix.SepsisICU.encounters.tab.gz",
                                "w")

    # Final columns to output to patient matrix
    colNames = list()

    patientEpisodes = extractor.parsePatientEpisodeFile(
        stdOpen("patientEpisodes.tab"), colNames)
    #patientIds = set(columnFromModelList(patientEpisodes, "patient_id"));

    log.info("Expand to index dates based start and end dates")
    # But only want one entry per patient
    patientByIndexTimeById = extractor.generateDateRangeIndexTimes(
        "edAdmitTime",
        "dischargeTime",
        patientEpisodes,
        colNames,
        timeInterval=None)

    log.info("Populate flowsheet summary statistics")
    flowsheetByNameByPatientId = extractor.parseFlowsheetFile(
        stdOpen("Flowsheet.tab.gz"))
    extractor.addFlowsheetFeatures(patientByIndexTimeById,
                                   flowsheetByNameByPatientId, FLOWSHEET_NAMES,
                                   FLOWSHEET_PRE_TIME_DELTA,
                                   FLOWSHEET_POST_TIME_DELTA, colNames)

    log.info("Populate laboratory result summary statistics")
    labsByBaseNameByPatientId = extractor.parseLabResultsFile(
        stdOpen("LabResults.tab.gz"))
    extractor.addLabFeatures(patientByIndexTimeById, labsByBaseNameByPatientId,
                             LAB_BASE_NAMES, LAB_PRE_TIME_DELTA,
                             LAB_POST_TIME_DELTA, colNames)

    log.info("Populate IV Fluid accumulation")
    ivFluidsByPatientId = extractor.parseIVFluidFile(
        stdOpen("IsotonicIVFluids.tab.gz"))
    extractor.addIVFluidFeatures(patientByIndexTimeById, ivFluidsByPatientId,
                                 IVF_THRESHOLD_VOLUMES, IVF_CHECKPOINT_TIMES,
                                 colNames)

    log.info(
        "Record presence of items in terms of relative time to each item from index time"
    )
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("IVAntibiotic.tab")),
        patientByIndexTimeById, colNames, "IVAntibiotic")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("BloodCulture.tab")),
        patientByIndexTimeById, colNames, "BloodCulture")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RespViralPanel.tab")),
        patientByIndexTimeById, colNames, "RespViralPanel")

    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("AnyICULifeSupport.tab")),
        patientByIndexTimeById, colNames, "AnyICULifeSupport")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("AnyDNR.tab")),
        patientByIndexTimeById, colNames, "AnyDNR")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("AnyVasoactive.tab")),
        patientByIndexTimeById, colNames, "AnyVasoactive")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("AnyCRRT.tab")),
        patientByIndexTimeById, colNames, "AnyCRRT")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("AnyVentilator.tab")),
        patientByIndexTimeById, colNames, "AnyVentilator")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("ComfortCare.tab")),
        patientByIndexTimeById, colNames, "ComfortCare")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("PalliativeConsult.tab")),
        patientByIndexTimeById, colNames, "PalliativeConsult")

    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("Death.tab")),
        patientByIndexTimeById, colNames, "Death")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("Birth.tab")),
        patientByIndexTimeById, colNames, "Birth")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("Male.tab")),
        patientByIndexTimeById, colNames, "Male")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("Female.tab")),
        patientByIndexTimeById, colNames, "Female")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(
            stdOpen("RaceWhiteNonHispanicLatino.tab")), patientByIndexTimeById,
        colNames, "RaceWhiteNonHispanicLatino")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceAsian.tab")),
        patientByIndexTimeById, colNames, "RaceAsian")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(
            stdOpen("RaceWhiteHispanicLatino.tab")), patientByIndexTimeById,
        colNames, "RaceWhiteHispanicLatino")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceHispanicLatino.tab")),
        patientByIndexTimeById, colNames, "RaceHispanicLatino")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceUnknown.tab")),
        patientByIndexTimeById, colNames, "RaceUnknown")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceOther.tab")),
        patientByIndexTimeById, colNames, "RaceOther")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceBlack.tab")),
        patientByIndexTimeById, colNames, "RaceBlack")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RacePacificIslander.tab")),
        patientByIndexTimeById, colNames, "RacePacificIslander")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceNativeAmerican.tab")),
        patientByIndexTimeById, colNames, "RaceNativeAmerican")

    log.info(
        "Systematically scan for Charlson comorbidities and Treatment Team categories"
    )
    for filename in os.listdir("."):
        if filename.startswith(CHARLSON_PREFIX):
            diseaseName = filename
            if filename.endswith(".tab"):
                diseaseName = filename[:-len(".tab")]
            extractor.addClinicalItemFeatures(
                extractor.parseClinicalItemFile(stdOpen(filename)),
                patientByIndexTimeById, colNames, diseaseName)

        if filename.startswith(TREATMENT_TEAM_PREFIX):
            teamName = filename
            if filename.endswith(".tab"):
                teamName = filename[:-len(".tab")]
            extractor.addClinicalItemFeatures(
                extractor.parseClinicalItemFile(stdOpen(filename)),
                patientByIndexTimeById, colNames, teamName)

    log.info("Output feature matrix file with row per patient day")
    formatter = TextResultsFormatter(featureMatrixFile)
    formatter.formatTuple(colNames)
    for patientId, patientByIndexTime in patientByIndexTimeById.items():
        patientResults = patientByIndexTime.values()
        formatter.formatResultDicts(patientResults, colNames)

    timer = time.time() - timer
    print >> sys.stderr, "%.3f seconds to complete" % timer
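The long run of addClinicalItemFeatures calls above is deliberately explicit; a table-driven refactoring sketch (not from the original source) that would produce the same calls:

# (filename, label) pairs drive the same extractor calls as above
clinicalItemSpecs = [
    ("IVAntibiotic.tab", "IVAntibiotic"),
    ("BloodCulture.tab", "BloodCulture"),
    ("RespViralPanel.tab", "RespViralPanel"),
    # ... one entry per clinical item file listed above ...
]
for filename, label in clinicalItemSpecs:
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen(filename)),
        patientByIndexTimeById, colNames, label)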
Example #28
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <inputFile> <outputFile>\n"+\
                    "   <inputFile>    Tab-delimited file of data\n"+\
                    "   <ouputFile>    Tab-delimited file with relational table of t-test p-values for each sub-group pair.  Specify \"-\" to send to stdout.\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-l",
            "--labelCols",
            dest="labelCols",
            help=
            "Comma-separated list of the column headers to label data rows as belonging to different subgroups"
        )
        parser.add_option(
            "-v",
            "--valueCols",
            dest="valueCols",
            help=
            "Comma-separated list of the column headers for data values want to calculate statistics for"
        )
        parser.add_option(
            "-m",
            "--matchCols",
            dest="matchCols",
            help=
            "Comma-separated list of the column headers to match groups on, like row identifiers.  If not exists, then do independent t-tests rather than paired."
        )
        parser.add_option(
            "-b",
            "--baseLabels",
            dest="baseLabels",
            help=
            "Comma-separated list of values that the labelCols should have to represent which base method to compare all other methods to as a reference (otherwise do a full n^2 cartesian product of all combinations)."
        )
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()
        if len(args) > 1:
            inputFile = stdOpen(args[0])
            outputFile = stdOpen(args[1], "w")

            labelCols = options.labelCols.split(",")
            valueCols = options.valueCols.split(",")
            matchCols = None
            if options.matchCols:
                matchCols = options.matchCols.split(",")
            baseLabels = None
            if options.baseLabels:
                baseLabels = options.baseLabels.split(",")

            # Print comment line with arguments to allow for deconstruction later as well as extra results
            summaryData = {
                "argv": argv
            }
            print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)

            # Tab-delimited output formatting
            formatter = TextResultsFormatter(outputFile)

            # Prep generator first, so will be able to extract out relevant header columns
            rowGenerator = self(inputFile, labelCols, valueCols, matchCols,
                                baseLabels)

            # Insert a mock record to get a header / label row
            colNames = self.resultHeaders(labelCols, valueCols, matchCols)
            formatter.formatResultDict(RowItemModel(colNames, colNames),
                                       colNames)

            # Stream the concatenated data rows to the output to avoid storing all in memory
            for outputDict in rowGenerator:
                formatter.formatResultDict(outputDict, colNames)

        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
Example #29
    "Charlson.Diabetes.pre", "Charlson.DiabetesComplications.pre",
    "Charlson.HemiplegiaParaplegia.pre", "Charlson.LiverMild.pre",
    "Charlson.LiverModSevere.pre", "Charlson.Malignancy.pre",
    "Charlson.MalignancyMetastatic.pre", "Charlson.MI.pre",
    "Charlson.PepticUlcer.pre", "Charlson.PeripheralVascular.pre",
    "Charlson.Renal.pre", "Charlson.Rheumatic.pre", "self_pay", "PO2A.last",
    "Pulse.last", "NA.last", "CR.last", "HCT.last", "WBC.last", "BUN.last",
    "TBIL.last", "K.last", "Resp.last", "Temp.last", "Urine.last",
    "BP_Low_Diastolic.last", "BP_High_Systolic.last",
    "Glasgow.Coma.Scale.Score.last", "TT.Cardiology.pre", "TT.CCU.HF.pre",
    "TT.CCU.pre", "TT.HemeOnc.pre", "TT.Medicine.pre", "TT.MICU.pre",
    "TT.Neurology.pre", "TT.SICU.pre", "TT.SurgerySpecialty.pre",
    "TT.Transplant.pre", "TT.Trauma.pre", "self_pay"
]

ofs = stdOpen("simulatedData.ICUDNR.tab", "w")
formatter = TextResultsFormatter(ofs)
formatter.formatTuple(colNames)  # Header row

random.seed(987654321)  # Consistent seed for reproducibility
nPatients = 10000

# Random generator parameters
ageRange = [30, 80]
incomeRange = [20000, 200000]
incomeStep = 1000
femaleRate = 0.5

# Ranges on uniform distribution to assign race labels. Leave ~50% empty for default White race
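
The excerpt cuts off before the race-assignment code itself; below is a minimal self-contained sketch of the approach the comment describes (cumulative thresholds on a uniform draw, with draws beyond the last threshold falling through to the default White label). The specific thresholds and category names are assumptions for illustration, not values from the source.

import random

# Hypothetical cumulative thresholds on a uniform [0, 1) draw;
# draws beyond the last threshold fall through to the default label.
raceThresholds = [(0.10, "Black"), (0.25, "Asian"), (0.50, "Hispanic")]

def assignRace(rng):
    draw = rng.random()
    for threshold, label in raceThresholds:
        if draw < threshold:
            return label
    return "White"  # default for the remaining ~50% of draws

rng = random.Random(987654321)
print([assignRace(rng) for _ in range(5)])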
Example #30
0
def main(argv):
    """Main method, callable from command line"""
    usageStr =  "usage: %prog [options] <query> [<outputFile>]\n"+\
                "   <query>         Query to execute (probably enclosed in quotes (\"))\n"+\
                "   <outputFile>    If query yields a result set, then that will be output\n"+\
                "                       to the named file.  Specify \"-\" to send to stdout.\n"
    parser = OptionParser(usage=usageStr)
    parser.add_option("-c", "--incCols",    dest="incCols",     action="store_true",    help="If set when executing a SELECT statement, then a line will be added before the result set with the names of the data columns.")
    parser.add_option("-C", "--incCommand", dest="incCommand",  action="store_true",    help="If set when executing a SELECT statement, then add a comment header line with the command-line argv.")
    parser.add_option("-s", "--script",     dest="script",      action="store_true",    help="Interpret the first argument <query> as the name of a DB (SQL) script to run instead.  Use \"-\" to specify stdin.")
    parser.add_option("-i", "--input",      dest="input",       metavar="<inputFile>",  help="Open the named whitespace-delimted file and insert its contents into the database.  Use with -t option.  The remaining \"normal\" arguments are expected and will be taken as the ordered list of column names the file data is to be inserted under.  Alternatively, the first row of the file will be considered the column names.  Use \"-\" to specify stdin.")
    parser.add_option("-u", "--update",     dest="update",      metavar="<dataFile>",   help="Open the named whitespace-delimted file and update its contents into the database.  Use with -t and -n options.  The remaining \"normal\" arguments are expected and will be taken as the ordered list of column names the file data is to be updated under.  Alternatively, the first row of the file will be considered the column names.  Use \"-\" to specify stdin.");
    parser.add_option("-t", "--table",      dest="table",       metavar="<tableName>",  help="If inserting / updating a file with the -i or -u option, specify the name of the DB table to insert into")
    parser.add_option("-d", "--delim",      dest="delim",       metavar="<delimiter>",  help="If inserting / updating a file with the -i or -u  option, specify the character to delimit values by.  Default to \\t tabs, but can specify something else. Alternatively, this can be used to specify what delimiter to use when formatting query output.")
    parser.add_option("-n", "--nIdCols",    dest="nIdCols",     default="1",            help="If updating a file with the -u  option, assume that the first column is the ID column not to update into the database, but to identify the respective row to update.  If more than 1 column is needed, specify with this option.")
    parser.add_option("-o", "--output",     dest="output",      metavar="<outputFile>", help="If inserting a file with the -i option and want to get generated ID numbers from the inserted rows, specify this file to send them to.")
    parser.add_option("-e", "--skipErrors", dest="skipErrors",  action="store_true",    help="If inserting or updating a file or running a script with the -s option, keep running the remainder of the inserts or script commands even if one causes an exception.")
    parser.add_option("-f", "--dateColFormats", dest="dateColFormats",  metavar="<dateColFormats>",    help="If inserting a file, can specify columns that should be interpreted as date strings to be parsed into datetime objects.  Provide comma-separated list, and optional | separated Python date parsing format (e.g., 'MyDateTime1|%m/%d/%Y %H:%M:%S,MyDateTime2').  http://docs.python.org/library/datetime.html#strftime-strptime-behavior.")
    parser.add_option("-x", "--escapeStrings", dest="escapeStrings",  action="store_true",    help="If inserting a file, can set whether to run all input strings through escape filter to avoid special characters compromising inserts.")
    (options, args) = parser.parse_args(argv[1:])

    # Correct escape character delimiter
    if options.delim == "\\t":  options.delim = "\t";

    log.info("Starting: "+str.join(" ", argv))
    timer = time.time();
    if options.script and len(args) > 0:
        runDBScript( stdOpen(args[0],"r",sys.stdin), options.skipErrors )
    elif options.input is not None and options.table is not None:
        inputFile   = stdOpen(options.input,"r",sys.stdin)
        outputFile  = None
        if options.output is not None:
            outputFile = stdOpen(options.output,"w",sys.stdout)
        
        dateColFormats = None;
        if options.dateColFormats is not None:
            dateColFormats = dict();
            colDateFormatComponents = options.dateColFormats.split(",");
            for colDateFormatComponent in colDateFormatComponents:
                colFormatChunks = colDateFormatComponent.split("|");
                colName = colFormatChunks[0];
                formatStr = None;
                if len(colFormatChunks) > 1:
                    formatStr = colFormatChunks[1];
                dateColFormats[colName] = formatStr;
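            # For example (illustrative values mirroring the -f help text above):
            #   -f 'MyDateTime1|%m/%d/%Y %H:%M:%S,MyDateTime2'
            # would yield dateColFormats == {"MyDateTime1": "%m/%d/%Y %H:%M:%S",
            #                                "MyDateTime2": None}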
        
        # If reading from a file (not standard input stream), do an extra pass to get size estimate to facilitate progress tracker
        estInput = None;
        if not isStdFile(options.input):
            lineCountFile = stdOpen(options.input);
            estInput = fileLineCount(lineCountFile);

        nInserts = insertFile( inputFile, options.table, args, options.delim, outputFile, options.skipErrors, dateColFormats=dateColFormats, escapeStrings=options.escapeStrings, estInput=estInput );
        log.info("%d rows successfully inserted",nInserts)
    elif options.update is not None and options.table is not None:
        sourceFile  = stdOpen(options.update,"r",sys.stdin);
        nIdCols = int(options.nIdCols);
        nUpdates = updateFromFile( sourceFile, options.table, args, nIdCols, options.delim, options.skipErrors );
        log.info("%d row updates completed",nUpdates);
    elif len(args) > 0:
        outFile = "-"   # Default to stdout if no outputFile specified
        if len(args) > 1: outFile = args[1]
        outFile = stdOpen( outFile, "w", sys.stdout )
        
        if options.incCommand:
            summaryData = {"argv": argv};
            print >> outFile, COMMENT_TAG, json.dumps(summaryData);

        textFormatter = TextResultsFormatter(outFile, options.delim)

        results = execute( args[0], formatter=textFormatter, includeColumnNames=options.incCols )

        log.info("%d rows affected (or other return code)",results);
    else:
        parser.print_help()
        sys.exit(-1)

    timer = time.time() - timer;
    log.info("%.3f seconds to complete",timer);
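
As a closing usage note, the three branches of main() above could be exercised as follows; the script name, table name, file names, and SQL here are illustrative assumptions, not taken from the source.

# Hypothetical invocations of main(), one per branch above.
main(["DBUtil", "-s", "-e", "schema.sql"])                           # run a SQL script, skipping errors
main(["DBUtil", "-i", "patients.tab", "-t", "patient", "-o", "-"])   # insert a file, echo generated IDs to stdout
main(["DBUtil", "-c", "select * from patient", "results.tab"])       # query, with a column-name header line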