def main(self, argv):
    """Main method, callable from command line"""
    usageStr =  "usage: %prog [options] <sourceFile> [<outputFile>]\n"+\
                "   <sourceFile>    Source file of text data from data warehouse\n"+\
                "   <outputFile>    HTML report file with analysis of source\n"+\
                "                   Leave blank or specify \"-\" to send to stdout.\n"
    parser = OptionParser(usage=usageStr)
    BaseTextAnalysis.addParserOptions(self, parser);
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: "+str.join(" ", argv))
    timer = time.time();
    if len(args) > 0:
        BaseTextAnalysis.parseOptions(self, options);
        sourceFile = stdOpen(args[0]);

        # Format the results for output
        outputFilename = None;
        if len(args) > 1:
            outputFilename = args[1];
        outputFile = stdOpen(outputFilename,"w");

        # Print comment line with arguments to allow for deconstruction later as well as extra results
        summaryData = {"argv": argv};
        print("<!-- %s -->" % json.dumps(summaryData), file=outputFile);

        # Run the actual analysis
        self(sourceFile, outputFile);
    else:
        parser.print_help()
        sys.exit(-1)
    timer = time.time() - timer;
    log.info("%.3f seconds to complete",timer);
def main_prepPatientItems(argv):
    prep = PreparePatientItems();
    for i in xrange(-9,10):
        prep.main(["PreparePatientItems",
            "-S", START_DATE_STR, "-E", END_DATE_STR,
            "-p", str(DEMOGRAPHICS_CATEGORY_ID), "-c", str(ADMIT_DX_CATEGORY_ID),
            "-Q", str(QUERY_TIME), "-V", str(VERIFY_TIME),
            "-o", str(DEATH_ITEM_ID), "-t", "2592000",
            "%s/patientIds.5year.%s.tab.gz" % (SOURCE_DATA_DIR, i),
            "%s/firstItems.q%s.v%s.%s.%s.tab.gz" % (SOURCE_DATA_DIR, QUERY_TIME, VERIFY_TIME, BASE_YEAR, i),
        ]);

    # Convert to (filtered) Bag of Words
    for i in xrange(-9,10):
        prep.main(["PreparePatientItems", "-B", "qvo", "-X", EXCLUDE_CATEGORY_IDS_STR,
            "%s/firstItems.q%s.v%s.%s.%d.tab.gz" % (SOURCE_DATA_DIR,QUERY_TIME,VERIFY_TIME,BASE_YEAR,i),
            "%s/firstItems.q%s.v%s.%s.%d.filter.bow.gz" % (SOURCE_DATA_DIR,QUERY_TIME,VERIFY_TIME,BASE_YEAR,i),
        ]);

    # Concatenate batch of files
    ofs = stdOpen("%s/firstItems.q%s.v%s.%s.1234567890.filter.bow.gz" % (SOURCE_DATA_DIR, QUERY_TIME, VERIFY_TIME, BASE_YEAR), "w");
    for i in [1,2,3,4,5,6,7,8,9,0]:
        ifs = stdOpen("%s/firstItems.q%s.v%s.%s.%d.filter.bow.gz" % (SOURCE_DATA_DIR,QUERY_TIME,VERIFY_TIME,BASE_YEAR,i));
        ofs.write(ifs.read());
        ifs.close();
    ofs.close();

    # For comment and header row of csv files, drop repeats
    baseIds = [-1,-2,-3,-4,-5];
    ofs = stdOpen("%s/firstItems.q%s.v%s.%s.-12345.tab.gz" % (SOURCE_DATA_DIR,QUERY_TIME,VERIFY_TIME,BASE_YEAR), "w");
    isHeaderRowWritten = False;
    for baseId in baseIds:
        ifs = stdOpen("%s/firstItems.q%s.v%s.%s.%d.tab.gz" % (SOURCE_DATA_DIR,QUERY_TIME,VERIFY_TIME,BASE_YEAR,baseId));
        for iLine, line in enumerate(ifs):
            if not line.startswith(COMMENT_TAG):    # Skip comment lines
                if line[0].isalpha():   # Starts with a letter/label, must be header row, not data
                    if isHeaderRowWritten:
                        continue;   # Skip text/header rows, except for the first one encountered
                    else:
                        isHeaderRowWritten = True;
                ofs.write(line);
        ifs.close();
    ofs.close();
def main(self, argv):
    """Main method, callable from command line"""
    usageStr =  "usage: %prog <inputFile> <outputFile>\n"+\
                "   <inputFile>  Tab-delimited input file taken from schedule Excel file. Example data format as seen in test case examples. See support/extractExcelSheets.py for help on pulling out Excel sheets into tab-delimited data files.\n"+\
                "   <outputFile> File to output results to. Designate '-' for stdout.";
    parser = OptionParser(usage=usageStr)
    parser.add_option("-i", "--providerIdFilename", dest="providerIdFilename", help="Name of provider ID CSV file. If provided, then add column for prov_id based on resident first_name and last_name, match within first "+str(DEFAULT_INDEX_PREFIX_LENGTH)+" characters, or generate ID value if no match found");
    parser.add_option("-y", "--baseYear", dest="baseYear", help="Year expect dates to start in.");
    parser.add_option("-t", "--changeTime", dest="changeTime", default=CHANGE_TIME, help="Hour of day that counts as delimiter between rotations. Likely should NOT be midnight = 0, because night shifts span midnight. Default to 7 = 7am.");
    (options, args) = parser.parse_args(argv[1:])

    if len(args) >= 2 and options.baseYear:
        log.info("Starting: "+str.join(" ", argv))
        timer = time.time();

        baseYear = int(options.baseYear);

        if options.providerIdFilename is not None:
            providerReader = csv.DictReader(open(options.providerIdFilename));
            self.loadProviderModels( providerReader );

        inFile = stdOpen(args[0]);
        scheduleItems = self.parseScheduleItems(inFile, baseYear);

        outFile = stdOpen(args[1],"w");
        formatter = TextResultsFormatter(outFile);
        formatter.formatResultDicts(scheduleItems);
    else:
        parser.print_help()
        sys.exit(-1)
    timer = time.time() - timer;
    log.info("%.3f seconds to complete",timer);
def main_formatMergedTTests(argv):
    ifs = stdOpen(BASE_RESULT_DIR+CONCATENATE_FILENAME);
    ofs = stdOpen(BASE_RESULT_DIR+FILTERED_FILENAME, "w");

    summaryData = {"argv": argv};
    print >> ofs, COMMENT_TAG, json.dumps(summaryData);

    outputCols = ["SortType","TopicCount","VerifyTime","Group1.precision.mean","Group1.recall.mean","Group1.normalprecision.mean","Group1.weightrecall.mean","Group1.roc_auc.mean","ttest_rel.precision","ttest_rel.recall","ttest_rel.weightrecall","ttest_rel.roc_auc","Group1.numqueryitems.mean","Group1.numverifyitems.mean","Group1.numrecommendeditems.mean","Group1.tp.mean"];
    formatter = TextResultsFormatter(ofs);
    formatter.formatTuple(outputCols);  # Output header row

    reader = TabDictReader(ifs);
    for row in reader:
        row["SortType"] = row["Group1._s"];

        # Extract out numerical data from filename text parameters
        row["TopicCount"] = None;
        if row["Group1._m"] != 'None':
            # Expecting model name strings of the form: "models/topicModel.first24hourItems.2013.1234567890.filter.bow.gz.64Topic.model"
            topicChunk = row["Group1._m"].split(".")[-2];   # Expect second to last period-delimited chunk to contain topic count
            topicChunk = topicChunk[:topicChunk.find("Topic")]; # Remove trailing Topic text
            row["TopicCount"] = int(topicChunk);

        # Expecting result file name argument of the form: "results/byOrderSets/01minutes/filteredResults.tab.gz"
        timeChunk = row["args[0]"].split("/")[-2];
        timeChunk = timeChunk[:timeChunk.find("minutes")];
        row["VerifyTime"] = int(timeChunk);

        formatter.formatResultDict(row, outputCols);

    ifs.close();
    ofs.close();
def main(self, argv):
    """Main method, callable from command line"""
    usageStr =  "usage: %prog [options] <inputFile> [<outputFile>]\n"+\
                "   <inputFile>  Validation file in prepared result file format. Use generated LDA models to predict items and compare against verify sets, similar to RecommendationClassficationAnalysis.\n"+\
                "   <outputFile> Validation result stat summaries.\n"
    parser = OptionParser(usage=usageStr)
    parser.add_option("-M", "--modelFile", dest="modelFile", help="Name of the file to load an LDA or HDP model and topic word document counts from.");
    parser.add_option("-X", "--excludeCategoryIds", dest="excludeCategoryIds", help="For recommendation, exclude / skip any items which fall under one of these comma-separated category Ids.");
    parser.add_option("-i", "--itemsPerCluster", dest="itemsPerCluster", default=DEFAULT_TOPIC_ITEM_COUNT, help="Specify number of top topic items to consider when scoring recommendations.");
    parser.add_option("-m", "--minClusterWeight", dest="minClusterWeight", default=DEFAULT_MIN_TOPIC_WEIGHT, help="When scoring recommendations, skip any topics with less than this relation weight (effectively scores as zero, but can avoid a lot of low yield calculations).");
    parser.add_option("-s", "--sortField", dest="sortField", default=DEFAULT_SORT_FIELD, help="Score field to sort top recommendations by. Default to posterior probability 'totelItemWeight', but can also select 'lift' = 'tfidf' = 'interest' for TF*IDF style score weighting.");
    parser.add_option("-r", "--numRecs", dest="numRecs", default=DEFAULT_RECOMMENDED_ITEM_COUNT, help="Number of orders / items to recommend for comparison against the verification set. Alternative set option numRecsByOrderSet to look for key order set usage and size.");
    parser.add_option("-O", "--numRecsByOrderSet", dest="numRecsByOrderSet", action="store_true", help="If set, then look for an order_set_id column to find the key order set that triggered the evaluation time point to determine number of recommendations to consider.");
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: "+str.join(" ", argv))
    timer = time.time();
    if len(args) >= 1:
        query = AnalysisQuery();
        query.preparedPatientItemFile = stdOpen(args[0]);
        query.recommender = TopicModelRecommender(options.modelFile);
        query.baseRecQuery = RecommenderQuery();
        if options.excludeCategoryIds is not None:
            query.baseRecQuery.excludeCategoryIds = set();
            for categoryIdStr in options.excludeCategoryIds.split(","):
                query.baseRecQuery.excludeCategoryIds.add(int(categoryIdStr));
        else:   # Default exclusions if none specified
            query.baseRecQuery.excludeCategoryIds = query.recommender.defaultExcludedClinicalItemCategoryIds();
            query.baseRecQuery.excludeItemIds = query.recommender.defaultExcludedClinicalItemIds();
        query.baseRecQuery.itemsPerCluster = int(options.itemsPerCluster);
        query.baseRecQuery.minClusterWeight = float(options.minClusterWeight);
        query.baseRecQuery.sortField = options.sortField;
        query.numRecommendations = int(options.numRecs);
        query.numRecsByOrderSet = options.numRecsByOrderSet;

        # Run the actual analysis
        analysisResults = self(query);

        # Format the results for output
        outputFilename = None;
        if len(args) > 1:
            outputFilename = args[1];
        outputFile = stdOpen(outputFilename,"w");

        # Print comment line with analysis arguments to allow for deconstruction later
        summaryData = {"argv": argv};
        print(COMMENT_TAG, json.dumps(summaryData), file=outputFile);

        formatter = TextResultsFormatter( outputFile );
        colNames = self.resultHeaders(query);
        formatter.formatTuple( colNames );  # Output a header / label row
        formatter.formatResultDicts( analysisResults, colNames );
    else:
        parser.print_help()
        sys.exit(-1)
    timer = time.time() - timer;
    log.info("%.3f seconds to complete",timer);
def main(self, argv):
    """Main method, callable from command line"""
    usageStr =  "usage: %prog [options] <inputFile1> <inputFile2> ... <inputFileN>\n"+\
                "   <inputFileX>    Tab-delimited file of data. Initial comment lines will be scanned for list of argv parameters to add as data columns.\n"+\
                "                   If only a single input is given, interpret this as an index file which lists the names of the other files to concatenate (e.g., obtained with dir * /b or ls).\n"
    parser = OptionParser(usage=usageStr)
    parser.add_option("-o", "--outputFile", dest="outputFile", help="Tab-delimited file matching concatenated contents of input files. Specify \"-\" to send to stdout.")
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: " + str.join(" ", argv))
    timer = time.time()
    if len(args) > 0:
        inputFiles = list()
        if len(args) > 1:
            for inputFilename in args:
                inputFiles.append(stdOpen(inputFilename))
        else:   # len(args) == 1, Single index file rather than list of all files on command-line
            indexFile = stdOpen(args[0])
            for line in indexFile:
                inputFilename = line.strip()
                inputFiles.append(stdOpen(inputFilename))

        # Format the results for output
        outputFile = stdOpen(options.outputFile, "w")

        # Print comment line with arguments to allow for deconstruction later as well as extra results
        summaryData = {"argv": argv}
        print >> outputFile, COMMENT_TAG, json.dumps(summaryData)

        # Tab-delimited output formatting
        formatter = TextResultsFormatter(outputFile)

        # Begin the file parsing so can at least get the total list of column headers
        rowGenerator = self(inputFiles)
        firstRow = rowGenerator.next()

        # Output a header / label row
        colNames = self.resultHeaders()
        formatter.formatTuple(colNames)

        # Stream the concatenated data rows to the output to avoid storing all in memory
        formatter.formatResultDict(firstRow, colNames)
        for outputDict in rowGenerator:
            formatter.formatResultDict(outputDict, colNames)
    else:
        parser.print_help()
        sys.exit(-1)
    timer = time.time() - timer
    log.info("%.3f seconds to complete", timer)
def main(argv=None):
    timer = time.time()

    # Final columns to output to patient matrix
    colNames = list()

    patientById = parsePatientFile(stdOpen("patients.tab"), colNames)

    labsByBaseNameByPatientId = parseLabResultsFile(stdOpen("labs.tab"))
    addLabFeatures(labsByBaseNameByPatientId, patientById, colNames, INDEX_ITEM_BASE_NAME, LAB_BASE_NAMES, LAB_PRE_TIME, LAB_POST_TIME)

    log.info("Record presence of items in terms of relative time to each item from index time")
    itemTimesByPatientId = parseClinicalItemFile(stdOpen("admitDx.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames, "ICD9.208-AdmitDx")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("problemListDx.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames, "ICD9.208-ProblemListDx")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("feSO4Rx.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames, "ironSO4")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("allEnteralIron.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames, "ironEnteral")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("ironIV.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames, "ironIV")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("outpatientIronRx.tab"), patientIdCol="pat_id", timeCol="ordering_date")
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames, "ironOutpatient")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("transfusions.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames, "RBCTransfusion")

    patientResults = filterPatients(patientById)

    log.info("Output feature matrix file with row per patient")
    featureMatrixFile = stdOpen("featureMatrix.lab14to1day.tab", "w")
    formatter = TextResultsFormatter(featureMatrixFile)
    formatter.formatResultDicts(patientResults, colNames, addHeaderRow=True)

    timer = time.time() - timer
    print("%.3f seconds to complete" % timer, file=sys.stderr)
def main(argv):
    timer = time.time()

    infile = stdOpen(argv[1])
    outfile = stdOpen(argv[2], "w")

    summaryData = {"argv": argv}
    print >> outfile, COMMENT_TAG, json.dumps(summaryData)

    df = pd.read_csv(infile, na_values=[NULL_STRING])
    df["normal"] = 1 - df["abnormal"]   # Use not-abnormal as output of interest. Should be same as all_result_normal, but some labs not labeled

    # Prepare output dataframe skeleton
    resultDF = pd.DataFrame()
    nRows = len(df)
    floatNRows = float(nRows)   # Facilitate subsequent floating point division
    for iRow in xrange(nRows):
        topK = iRow + 1             # Top K items considered
        topKPercent = topK / floatNRows     # Top percentage of all items considered
        resultDF.set_value(iRow, "iRow", iRow)
        resultDF.set_value(iRow, "Top K", topK)
        resultDF.set_value(iRow, "Top K %", topKPercent)

    for col in df.columns:
        if col not in labelCols and col not in resultCols:
            # Any leftover should be a predicted test result / score, correlated with the outcome column
            scoreCol = col
            print >> sys.stderr, scoreCol
            scoreResultCol = scoreCol   #+".precisionAtK";
            if scoreResultCol.startswith("predictedTest."):
                scoreResultCol = scoreResultCol[len("predictedTest."):]     # Clean up (trim off) name prefixes

            df.sort(scoreCol, ascending=False, inplace=True)    # Descending sort by the score column

            countNormal = 0.0
            countAll = 0
            iRow = 0
            for index, row in df.iterrows():
                countAll += 1
                countNormal += row[outcomeCol]
                precisionAtK = countNormal / countAll
                #print >> sys.stderr, precisionAtK, row[[outcomeCol,scoreCol]];
                resultDF.set_value(iRow, scoreResultCol, precisionAtK)
                iRow += 1

    print >> sys.stderr, "output"
    resultDF.to_csv(outfile)

    return df
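# Minimal self-contained sketch (not part of the module above) of the precision-at-K idea the
# loop implements: sort rows by a score column descending, then for each cutoff K report the
# fraction of the top-K rows whose outcome is 1. The column names and values are hypothetical.
#
#   import pandas as pd
#
#   toyDF = pd.DataFrame({"score": [0.9, 0.8, 0.4, 0.2], "normal": [1, 0, 1, 1]})
#   toyDF = toyDF.sort_values("score", ascending=False)   # modern pandas equivalent of df.sort(...)
#   ranks = pd.Series(range(1, len(toyDF) + 1), index=toyDF.index)
#   precisionAtK = toyDF["normal"].cumsum() / ranks
#   # precisionAtK for K = 1..4 comes out to 1.0, 0.5, 0.667, 0.75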
def main(self, argv):
    """Main method, callable from command line"""
    usageStr =  "usage: %prog [options] <inputFile1> <inputFile2> ... <inputFileN>\n"+\
                "   <inputFileX>    Tab-delimited files of data, should have a key column with a unique identifier to merge across files.\n"
    parser = OptionParser(usage=usageStr)
    parser.add_option("-c", "--keyList", dest="keyList", help="Comma-separated list of column identifiers to find in the input files to know what to merge on.")
    parser.add_option("-s", "--suffixList", dest="suffixList", help="Comma-separated list of suffixes to add to non-key column names in common across merged files")
    parser.add_option("-o", "--outputFile", dest="outputFile", help="Tab-delimited file containing merged contents of input files. Specify \"-\" to send to stdout.")
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting %s: %s" % (os.getpid(), str.join(" ", argv)))
    timer = time.time()
    if len(args) > 1:
        inputFiles = list()
        for inputFilename in args:
            inputFiles.append(stdOpen(inputFilename))

        keyList = options.keyList.split(",")
        suffixList = options.suffixList.split(",")

        # Format the results for output
        outputFile = stdOpen(options.outputFile, "w")

        # Print comment line with arguments to allow for deconstruction later as well as extra results
        summaryData = {"argv": argv}
        print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)

        self(inputFiles, keyList, suffixList, outputFile)
    else:
        parser.print_help()
        sys.exit(-1)
    timer = time.time() - timer
    log.info("%.3f seconds to complete", timer)
def main_formatResults(argv):
    ifs = stdOpen(BASE_RESULT_DIR + FILTERED_FILENAME)
    ofs = stdOpen(BASE_RESULT_DIR + FORMATTED_FILENAME, "w")

    summaryData = {"argv": argv}
    print >> ofs, COMMENT_TAG, json.dumps(summaryData)

    outputCols = ["SortType", "TopicCount", "TrainTime", "VerifyTime", "precision", "recall", "normalprecision", "weightrecall", "roc_auc"]
    formatter = TextResultsFormatter(ofs)
    formatter.formatTuple(outputCols)   # Output header row

    reader = TabDictReader(ifs)
    for row in reader:
        row["SortType"] = row["_s"]

        # Extract out numerical data from filename text parameters
        row["TopicCount"] = None
        row["TrainTime"] = None
        if row["_m"] != 'None':
            # Expecting model name strings of the form: "models/topicModel.firstItems.q14400.v14400.2013.1234567890.filter.bow.gz.16Topic.model"
            chunks = row["_m"].split(".")
            topicChunk = chunks[-2]     # Expect second to last period-delimited chunk to contain topic count
            topicChunk = topicChunk[:topicChunk.find("Topic")]  # Remove trailing Topic text
            row["TopicCount"] = int(topicChunk)

            for chunk in chunks:
                if chunk[0] == "q" and chunk[-1].isdigit():     # This should be the query time in seconds
                    queryTimeSeconds = int(chunk[1:])
                    queryTimeMinutes = queryTimeSeconds / 60
                    row["TrainTime"] = queryTimeMinutes

        # Expecting training file name argument of the form: "sourceData/first24hourOrderSets.2013.q86400.v14400.-12345.tab.gz"
        row["VerifyTime"] = None
        for chunk in row["args_0_"].split("."):
            if chunk[0] == "v" and chunk[-1].isdigit():     # This should be the verify time in seconds
                verifyTimeSeconds = int(chunk[1:])
                verifyTimeMinutes = verifyTimeSeconds / 60
                row["VerifyTime"] = verifyTimeMinutes

        formatter.formatResultDict(row, outputCols)

    ifs.close()
    ofs.close()
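# Quick standalone check (illustrative only, not part of the module above) of the filename
# conventions described in the comments: the ".16Topic." chunk carries the topic count and the
# ".q14400." chunk carries the query/train time in seconds. The example name is the one quoted
# in the comment above.
#
#   modelName = "models/topicModel.firstItems.q14400.v14400.2013.1234567890.filter.bow.gz.16Topic.model"
#   chunks = modelName.split(".")
#   topicChunk = chunks[-2]
#   print(int(topicChunk[:topicChunk.find("Topic")]))   # 16 topics
#   for chunk in chunks:
#       if chunk[0] == "q" and chunk[-1].isdigit():
#           print(int(chunk[1:]) // 60)                 # 240 minutes of query/train time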
def main(self, argv):
    """Main method, callable from command line"""
    usageStr =  "usage: %prog [options] <trainFile> <testFile> [<outputFile>]\n"+\
                "   <trainFile>  Tab-delimited file, queryItemIdsJSON expected to be parseable into lists of query items as well as an outcome.X column\n"+\
                "   <testFile>   Same structure as trainFile, but with test cases to assess prediction scoring\n"+\
                "   <outputFile> Tab-delimited file that can be used for ROC analysis with columns for outcome and predicted score\n"+\
                ""
    parser = OptionParser(usage=usageStr)
    parser.add_option("-o", "--outcomeItemId", dest="outcomeItemId", help="Outcome item ID to assess / get prediction scores for")
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: " + str.join(" ", argv))
    timer = time.time()
    if len(args) >= 2:
        trainFile = stdOpen(args[0])
        testFile = stdOpen(args[1])
        outcomeId = int(options.outcomeItemId)

        # Run the actual analysis
        (featureMatrix, outcomeMatrix, queryIds, rowModels) = self.fileToMatrixes(trainFile, outcomeId)
        model = self.train(featureMatrix, outcomeMatrix)
        analysisResults = self.predict(testFile, model, queryIds, outcomeId)

        # Format the results for output
        outputFilename = None
        if len(args) > 2:
            outputFilename = args[2]
        outputFile = stdOpen(outputFilename, "w")

        # Print comment line with arguments to allow for deconstruction later as well as extra results
        print(COMMENT_TAG, json.dumps({"argv": argv}), file=outputFile)

        colNames = self.analysisHeaders(outcomeId)
        analysisResults.insert(0, RowItemModel(colNames, colNames))     # Insert a mock record to get a header / label row

        formatter = TextResultsFormatter(outputFile)
        formatter.formatResultDicts(analysisResults, colNames)
    else:
        parser.print_help()
        sys.exit(-1)
    timer = time.time() - timer
    log.info("%.3f seconds to complete", timer)
def main(self, argv):
    """Main method, callable from command line"""
    usageStr =  "usage: %prog [options] <inputFile> [<outputFile>]\n"+\
                "   <inputFile>  Tab-delimited file, first two labeled columns expected to represent labeled outcome (0 and non-zero) and score/probability of outcome\n"+\
                "   <outputFile> Tab-delimited table specifying score histogram bin widths, total cases, predicted events, actual events\n"+\
                "                Leave blank or specify \"-\" to send to stdout.\n"
    parser = OptionParser(usage=usageStr)
    parser.add_option("-b", "--bins", dest="nBins", default=10, help="Number of bins to separate scores into, defaults to deciles (10)");
    parser.add_option("-f", "--figure", dest="figure", help="If set, will also try to auto-generate an example figure and store to a file here");
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: "+str.join(" ", argv))
    timer = time.time();
    if len(args) > 0:
        inputFilename = args[0];
        inputFile = stdOpen(inputFilename);

        # Run the actual analysis
        analysisResults = self(inputFile, int(options.nBins));
        (hlStat, degFreedom, hlP) = self.calculateHosmerLemeshow(analysisResults);

        # Generate plot figure
        if options.figure is not None:
            self.generateFigure(analysisResults, options.figure);

        # Format the results for output
        outputFilename = None;
        if len(args) > 1:
            outputFilename = args[1];
        outputFile = stdOpen(outputFilename,"w");

        # Print comment line with arguments to allow for deconstruction later as well as extra results
        print >> outputFile, COMMENT_TAG, json.dumps({"argv":argv, "P-HosmerLemeshow": hlP});

        colNames = self.analysisHeaders();
        analysisResults.insert(0, RowItemModel(colNames,colNames) );    # Insert a mock record to get a header / label row

        formatter = TextResultsFormatter( outputFile );
        formatter.formatResultDicts( analysisResults, colNames );
    else:
        parser.print_help()
        sys.exit(-1)
    timer = time.time() - timer;
    log.info("%.3f seconds to complete",timer);
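# Hedged standalone sketch of how a Hosmer-Lemeshow statistic is typically derived from binned
# calibration results like those produced above: a chi-square sum of (observed - expected)
# event and non-event counts per score bin, with G-2 degrees of freedom. This is an assumed,
# illustrative implementation, not the module's own calculateHosmerLemeshow.
from scipy.stats import chi2

def hosmerLemeshowSketch(bins):
    """bins: list of (totalCases, predictedEvents, observedEvents) per score bin."""
    hlStat = 0.0
    for nTotal, nPredicted, nObserved in bins:
        expectedNonEvents = nTotal - nPredicted
        observedNonEvents = nTotal - nObserved
        hlStat += (nObserved - nPredicted) ** 2 / float(nPredicted)
        hlStat += (observedNonEvents - expectedNonEvents) ** 2 / float(expectedNonEvents)
    degFreedom = len(bins) - 2
    return (hlStat, degFreedom, chi2.sf(hlStat, degFreedom))   # Last value analogous to "P-HosmerLemeshow" above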
def loadUpdateBufferFromFile(self, filename):
    updateBuffer = None;
    try:
        #print >> sys.stderr, filename
        log.info("Loading: %s" % filename);
        ifs = stdOpen(filename, "r")
        updateBuffer = json.load(ifs)
        updateBuffer["analyzedPatientItemIds"] = set(updateBuffer["analyzedPatientItemIds"])
        ifs.close()
    except IOError as exc:
        # Apparently could not find the named filename. See if instead it's a prefix
        #   for a series of enumerated files and then merge them into one mass buffer
        dirname = os.path.dirname(filename);
        if dirname == "":
            dirname = ".";  # Implicitly the current working directory
        basename = os.path.basename(filename);
        for nextFilename in os.listdir(dirname):
            if nextFilename.startswith(basename):
                nextFilepath = os.path.join(dirname, nextFilename);
                nextUpdateBuffer = self.loadUpdateBufferFromFile(nextFilepath);
                if updateBuffer is None:    # First update buffer, use it as base
                    updateBuffer = nextUpdateBuffer;
                else:   # Have existing update buffer. Just update its contents with the next one
                    updateBuffer = self.mergeBuffers(updateBuffer, nextUpdateBuffer);
                    del nextUpdateBuffer;   # Make sure memory gets reclaimed
    return updateBuffer;
def saveBufferToFile(self, filename, updateBuffer):
    ofs = stdOpen(filename, "w");
    updateBuffer["analyzedPatientItemIds"] = list(updateBuffer["analyzedPatientItemIds"]);  # Sets are not JSON serializable, so convert to a list first
    json.dump(updateBuffer, ofs);
    ofs.close();
    # Wipe out buffer to reflect incremental changes done, so any new ones should be recorded fresh
    updateBuffer = self.makeUpdateBuffer(updateBuffer);
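# Small self-contained illustration (assumed example, not part of the module) of why the
# buffer's "analyzedPatientItemIds" set is converted to a list before json.dump and back to a
# set after json.load: the json module cannot serialize Python sets directly.
#
#   import json
#
#   buffer = {"analyzedPatientItemIds": {101, 102, 103}}
#   encoded = json.dumps({"analyzedPatientItemIds": list(buffer["analyzedPatientItemIds"])})
#   decoded = json.loads(encoded)
#   decoded["analyzedPatientItemIds"] = set(decoded["analyzedPatientItemIds"])  # Restore set semantics, as loadUpdateBufferFromFile does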
def main_mergeTTestResults(argv):
    mod = ConcatenateDataFiles();

    fileListFile = stdOpen(BASE_RESULT_DIR+FILELIST_FILENAME, "w");
    for resultDir in RESULT_DIRS:
        print(resultDir+TTEST_FILENAME, file=fileListFile);
    fileListFile.close();

    subargv = ["ConcatenateDataFiles.py", "-o", BASE_RESULT_DIR+CONCATENATE_FILENAME];
    subargv.append(BASE_RESULT_DIR+FILELIST_FILENAME);
    mod.main(subargv);
def main(argv):
    timer = time.time()

    inFile = stdOpen(argv[1])
    outFile = stdOpen(argv[2], "w")

    df = dataFrame = pd.read_table(inFile, parse_dates=DATE_COLS)

    df = binarizeGender(df)
    df = binarizeRace(df, RACE_OPTIONS, DEFAULT_RACE)
    df = binarizePeriod(df)
    df["hasDrugScreens"] = (df["nDrugScreens"] > 0) * 1     # Convert to binary outcome measure

    df.to_csv(outFile, sep="\t", index=False)

    elapsed = time.time() - timer
    print >> sys.stderr, "%s seconds to complete" % timedelta(0, round(elapsed))
    return df
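# Hedged sketch of the kind of one-hot binarization the helpers above (binarizeGender,
# binarizeRace, binarizePeriod) presumably perform; the helper name and column handling here are
# hypothetical and the real helpers may differ in detail.
import pandas as pd

def binarizeCategorical(df, column, options, default):
    values = df[column].where(df[column].isin(options), default)    # Collapse unexpected values into a default category
    dummies = pd.get_dummies(values, prefix=column).astype(int)     # One 0/1 indicator column per category
    return pd.concat([df.drop(columns=[column]), dummies], axis=1)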
def main(self, argv):
    """Main method, callable from command line"""
    usageStr =  "usage: %prog [options] <patientIds>\n"+\
                "   <patientIds>    Comma-separated list of patient IDs to run the analysis on, or use option to specify a file.\n"
    parser = OptionParser(usage=usageStr)
    parser.add_option("-i", "--idFile", dest="idFile", help="If provided, look for patient IDs in the named file, one ID per line, in the format '/Users/Muthu/Desktop/JonathanChen/patientlist.txt'")
    parser.add_option("-s", "--startDate", dest="startDate", metavar="<startDate>", help="Date string (e.g., 2011-12-15), must be provided, will start analysis on items occurring on or after this date.");
    parser.add_option("-e", "--endDate", dest="endDate", metavar="<endDate>", help="Date string (e.g., 2011-12-15), must be provided, will stop analysis on items occurring before this date.");
    parser.add_option("-w", "--window", type="int", dest="window", metavar="<window>", help="Window integer (e.g., 36), (unit is deltas, i.e. a window of 36 and a delta of 4 weeks means that after 36 x 4 weeks, the data is decayed ~1/e ~ 0.37). More precisely, the window x delta is how long it will take for the data to decay to 38 percent of its original worth. Higher delta means it takes longer to decay. This number must be provided.");
    parser.add_option("-d", "--delta", type="int", dest="delta", metavar="<delta>", help="Delta integer (e.g., 4), (unit of time is weeks, defaults to 4 weeks), define in what increments do you want to read in the data. After each increment/delta, it performs a decay.");
    parser.add_option("-a", "--associationsPerCommit", type="int", dest="associationsPerCommit", help="If provided, will commit incremental analysis results to the database when accrue this many association items. Can help to avoid allowing accrual of too much buffered items whose runtime memory will exceed the 32bit 2GB program limit.")
    parser.add_option("-u", "--itemsPerUpdate", type="int", dest="itemsPerUpdate", help="If provided, when updating patient_item analyze_dates, will only update this many items at a time to avoid overloading MySQL query.")
    parser.add_option("-o", "--outputFile", dest="outputFile", help="If provided, send buffer to output file rather than committing to database")
    (options, args) = parser.parse_args(argv[1:])

    decayAnalysisOptions = DecayAnalysisOptions()

    log.debug("starting process");

    # Set start and end dates, item length (delta), and decay rate
    decayAnalysisOptions.startD = datetime.strptime(options.startDate, DATE_FORMAT)     # Makes a datetime object for the start and end date
    decayAnalysisOptions.endD = datetime.strptime(options.endDate, DATE_FORMAT)
    decayAnalysisOptions.windowLength = options.window  # How many deltas in your window
    decayAnalysisOptions.delta = timedelta(weeks=options.delta)

    if options.associationsPerCommit is not None:
        decayAnalysisOptions.associationsPerCommit = options.associationsPerCommit

    if options.itemsPerUpdate is not None:
        decayAnalysisOptions.itemsPerUpdate = options.itemsPerUpdate

    if options.delta != None:
        decayAnalysisOptions.delta = timedelta(weeks=(options.delta))  # Length of one decay item

    if options.outputFile is not None:
        decayAnalysisOptions.outputFile = options.outputFile

    # Set patientIds based on either a file input or args
    decayAnalysisOptions.patientIds = list()
    if len(args) > 0:
        decayAnalysisOptions.patientIds.extend(args[0].split(","))
    if options.idFile is not None:
        idFile = stdOpen(options.idFile)
        for line in idFile:
            decayAnalysisOptions.patientIds.append(line.strip())

    # Quit if invalid parameters
    if decayAnalysisOptions.startD is None or decayAnalysisOptions.endD is None or options.window is None or options.window == 0 or decayAnalysisOptions.patientIds is None:
        parser.print_help()
        sys.exit(0)

    log.debug("global start and end date");
    log.debug(decayAnalysisOptions.startD, decayAnalysisOptions.endD, decayAnalysisOptions.windowLength);

    self.decayAnalyzePatientItems(decayAnalysisOptions)
def main_concatenate(argv):
    mod = ConcatenateDataFiles();
    for resultDir in RESULT_DIRS:
        fileListFile = stdOpen(resultDir+FILELIST_FILENAME, "w");
        for filename in os.listdir(resultDir):
            if filename.startswith(RESULT_BASENAME):
                print(resultDir+filename, file=fileListFile);
        fileListFile.close();

        subargv = ["ConcatenateDataFiles.py", "-o", resultDir+CONCATENATE_FILENAME];
        subargv.append(resultDir+FILELIST_FILENAME);
        mod.main(subargv);
def parsePatientFile(patientFile, colNames):
    log.info("Parse patient file");
    patientById = dict();
    for patient in TabDictReader(patientFile):
        patientId = int(patient["patient_id"]);
        patient["patient_id"] = patientId;
        patientById[patientId] = patient;
    colNames.extend(["patient_id","dialysis","surgery"]);
    return patientById;
def loadDocCountByWordId(self, filename):
    """Given the name of a top topics file, load the section reporting the overall word document counts."""
    docCountByWordId = dict()
    reader = TabDictReader(stdOpen(filename))
    for topicItem in reader:
        if topicItem["topic_id"] == NULL_STRING:    # All document section, not topic specific
            itemId = None
            if topicItem["item_id"] != NULL_STRING:
                itemId = int(topicItem["item_id"])
            docCount = int(topicItem["score"])
            docCountByWordId[itemId] = docCount
    return docCountByWordId
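# Illustrative sketch (hypothetical data, assumed column layout) of the top-topics rows
# loadDocCountByWordId consumes: tab-delimited records with topic_id, item_id, and score
# columns, where rows whose topic_id equals the null marker carry the overall document counts.
#
#   import csv, io
#
#   NULL_STRING_EXAMPLE = "None"
#   sampleText = "topic_id\titem_id\tscore\nNone\t42\t318\nNone\tNone\t5000\n0\t42\t0.75\n"
#   for topicItem in csv.DictReader(io.StringIO(sampleText), delimiter="\t"):
#       if topicItem["topic_id"] == NULL_STRING_EXAMPLE:
#           print(topicItem["item_id"], topicItem["score"])   # Only the document-count rows: (42, 318) and (None, 5000)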
def main(self, argv):
    """Main method, callable from command line"""
    usageStr =  "usage: %prog [options] <inputFile> <outputFile>\n"+\
                "   <inputFile>  Tab-delimited file of relational data. Specify \"-\" to read from stdin.\n"+\
                "   <outputFile> Tab-delimited file of relational query data results. Specify \"-\" to send to stdout.\n"
    parser = OptionParser(usage=usageStr)
    parser.add_option("-q", "--sqlQuery", dest="sqlQuery", help="SQL Query to execute on the input data file/table. Use default tablename '%s' in query." % DEFAULT_TABLENAME)
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: " + str.join(" ", argv))
    timer = time.time()
    if len(args) > 1:
        inputFile = stdOpen(args[0])
        outputFile = stdOpen(args[1], "w")
        sqlQuery = options.sqlQuery

        # Print comment line with arguments to allow for deconstruction later as well as extra results
        summaryData = {"argv": argv}
        print >> outputFile, COMMENT_TAG, json.dumps(summaryData)

        # Primary execution to load inputFile, run query, then drop results into outputFile
        dataFrame = self(sqlQuery, inputFile, outputFile)
    else:
        parser.print_help()
        sys.exit(-1)
    timer = time.time() - timer
    log.info("%.3f seconds to complete", timer)
def main(argv):
    timer = time.time()

    patientById = queryPatients(stdOpen("patients.tab", "w"))
    #queryLabResults(stdOpen("labs.tab","w"), patientById);     # Time intensive, ~20 minutes

    queryClinicalItems(stdOpen("transfusions.tab", "w"), (3648,), patientById)     # RBC Transfusions
    queryClinicalItems(stdOpen("problemListDx.tab", "w"), (14568, 14606, 14847, 20636), patientById)   # Iron Def Anemia Problem List
    queryClinicalItems(stdOpen("admitDx.tab", "w"), (17172, 20125, 21873), patientById)    # Iron Def Admission Diagnosis

    ##################################################
    # Iron prescription ID notes
    # Mostly oral supplements and vitamins, dominated by ferrous sulfate
    ironSulfateItemIds = (34, 1044, 1047, 1280, 1386)   # Mostly first one for PO route, others are smattering of feeding tube route
    # Dominated by first PO Multi-Vitamin (2800 vs 90 for second)
    otherEnteralIronItemIds = (83, 208, 349, 732, 1188, 1376, 1460, 1707, 1768, 1996, 2000, 2085, 2140, 2162, 2322, 2569, 2855, 3124, 3130, 3234, 3241, 3242, 3305, 3309, 3367, 3380, 3384, 3414, 3532)
    allEnteralIronItemIds = set(ironSulfateItemIds).union(otherEnteralIronItemIds)
    # IV iron formulations
    ivIronClinicalItemIds = (893, 1129, 720, 1304, 1490, 3403)

    queryClinicalItems(stdOpen("feSO4Rx.tab", "w"), ironSulfateItemIds, patientById)    # FeSO4 (Enteral, primarily PO)
    queryClinicalItems(stdOpen("allEnteralIron.tab", "w"), allEnteralIronItemIds, patientById)  # All Enteral Iron formulations, including FeSO4, FeGluconate, and assorted MVI, etc.
    queryClinicalItems(stdOpen("ironIV.tab", "w"), ivIronClinicalItemIds, patientById)  # IV Iron (sucrose, dextran, gluconate, etc.)

    queryOutpatientIronRx(stdOpen("outpatientIronRx.tab", "w"), patientById)

    timer = time.time() - timer
    print >> sys.stderr, "%.3f seconds to complete" % timer
def main(self, argv):
    """Main method, callable from command line"""
    usageStr =  "Query for the clinical_item records that exist with the specified criteria\n"+\
                "usage: %prog [options] [<outputFile>]\n"+\
                "   <outputFile> Results file. Leave blank or specify \"-\" to send to stdout.\n"
    parser = OptionParser(usage=usageStr)
    parser.add_option("-i", "--itemPrefix", dest="itemPrefix", help="Look for clinical_items whose description starts with this prefix.")
    parser.add_option("-c", "--categoryNames", dest="categoryNames", help="Comma-separated list of clinical_item_category.descriptions to look for.")
    parser.add_option("-p", "--pauseSeconds", dest="pauseSeconds", default="0", help="Number of seconds to pause between processing each record.")
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: " + str.join(" ", argv))
    timer = time.time()
    if len(args) > 0:
        outputFile = stdOpen(args[0], "w")

        # Print comment line with arguments to allow for deconstruction later as well as extra results
        summaryData = {"argv": argv}
        print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)

        self.queryItems(options, outputFile)
    else:
        parser.print_help()
        sys.exit(-1)
    timer = time.time() - timer
    log.info("%.3f seconds to complete", timer)
def main(self, argv):
    """Main method, callable from command line"""
    usageStr =  "usage: %prog [options] <patientIds>\n"+\
                "   <patientIds>    Patient ID file, or comma-separated list of patient IDs.\n"
    parser = OptionParser(usage=usageStr)
    parser.add_option("-s", "--itemIdSequence", dest="itemIdSequence", help="Comma-separated sequence of item IDs to look for as representing the end of a triple of interest.")
    parser.add_option("-v", "--virtualItemId", dest="virtualItemId", help="ID of virtual clinical item to record against if find a specified triple.")
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: " + str.join(" ", argv))
    timer = time.time()

    patientIds = set()
    patientIdsParam = args[0]
    try:
        # Try to open patient IDs as a file
        patientIdFile = stdOpen(patientIdsParam)
        patientIds.update(patientIdFile.read().split())
    except IOError:
        # Unable to open as a filename, then interpret as simple comma-separated list
        patientIds.update(patientIdsParam.split(","))

    itemIdSequence = [int(idStr) for idStr in options.itemIdSequence.split(",")]
    virtualItemId = int(options.virtualItemId)

    self.analyzePatientItems(patientIds, itemIdSequence, virtualItemId)

    timer = time.time() - timer
    log.info("%.3f seconds to complete", timer)
def transform_STRIDE_source_table(stride_source_table):
    # Get module for doing data conversion.
    transformer = ClinicalItemDataLoader.STRIDE_TABLE_TRANSFORMER_MAP[stride_source_table]

    # Build command.
    if stride_source_table == 'stride_patient':
        argv = ['python', '-m', transformer]
    elif stride_source_table == 'stride_preadmit_med':
        argv = ['python', '-m', transformer, '-m', '5', '-s', '2008-01-01']
    elif stride_source_table == 'stride_order_med':
        argv = ['python', '-m', transformer, '-m', '5', '-d', '5', '-s', '2008-01-01']
    elif stride_source_table == 'stride_treatment_team':
        argv = ['python', '-m', transformer, '-a', '-s', '2008-01-01']
    else:
        argv = ['python', '-m', transformer, '-a', '-s', '2008-01-01']

    # Call command.
    log_file = stdOpen('%s.log' % ('_'.join(argv)), 'w')
    subprocess.call(argv, stderr=log_file)
    'FERRITIN': 'FERRITIN(FER)',
    'TBIL': 'TOTAL BILIRUBIN(TBIL)',
    'WBC': 'WBC(WBC)',
    'CR': 'CREATININE, SER/PLAS(CR)',
    'HAPTO': 'HAPTOGLOBIN(HAP)',
    'MCV': 'MCV(MCV)',
    'RETICAB': 'RETIC, ABS(RETABS)',
    'HGB': 'HGB(CALC), ISTAT',
    'YSTFRR': 'SOL TRANSFERR REC',
    'TRFSAT': 'TRANSFERRIN SAT',
    'FE': 'IRON, TOTAL'
}

timer = time.time()

featureMatrixFile = stdOpen("featureMatrix.tab")

log.info("Parse feature matrix file")
patientById = dict()
for patient in TabDictReader(featureMatrixFile):
    patientId = int(patient["patient_id"])
    patient["patient_id"] = patientId
    for labBaseName in labBaseNames:
        if patient[labBaseName] == NULL_STRING:
            patient[labBaseName] = None
        else:
            patient[labBaseName] = float(patient[labBaseName])
    patientById[patientId] = patient

log.info("Create plots against each metric against the index lab")
for labBaseName in labBaseNames:
def main(argv=None):
    timer = time.time()

    extractor = DataExtractor()

    # Output file
    featureMatrixFile = stdOpen("featureMatrix.SepsisICU.encounters.tab.gz", "w")

    # Final columns to output to patient matrix
    colNames = list()

    patientEpisodes = extractor.parsePatientEpisodeFile(stdOpen("patientEpisodes.tab"), colNames)
    #patientIds = set(columnFromModelList(patientEpisodes, "patient_id"));

    log.info("Expand to index dates based start and end dates")
    # But only want one entry per patient
    patientByIndexTimeById = extractor.generateDateRangeIndexTimes("edAdmitTime", "dischargeTime", patientEpisodes, colNames, timeInterval=None)

    log.info("Populate flowsheet summary statistics")
    flowsheetByNameByPatientId = extractor.parseFlowsheetFile(stdOpen("Flowsheet.tab.gz"))
    extractor.addFlowsheetFeatures(patientByIndexTimeById, flowsheetByNameByPatientId, FLOWSHEET_NAMES, FLOWSHEET_PRE_TIME_DELTA, FLOWSHEET_POST_TIME_DELTA, colNames)

    log.info("Populate laboratory result summary statistics")
    labsByBaseNameByPatientId = extractor.parseLabResultsFile(stdOpen("LabResults.tab.gz"))
    extractor.addLabFeatures(patientByIndexTimeById, labsByBaseNameByPatientId, LAB_BASE_NAMES, LAB_PRE_TIME_DELTA, LAB_POST_TIME_DELTA, colNames)

    log.info("Populate IV Fluid accumulation")
    ivFluidsByPatientId = extractor.parseIVFluidFile(stdOpen("IsotonicIVFluids.tab.gz"))
    extractor.addIVFluidFeatures(patientByIndexTimeById, ivFluidsByPatientId, IVF_THRESHOLD_VOLUMES, IVF_CHECKPOINT_TIMES, colNames)

    log.info("Record presence of items in terms of relative time to each item from index time")
    extractor.addClinicalItemFeatures(extractor.parseClinicalItemFile(stdOpen("IVAntibiotic.tab")), patientByIndexTimeById, colNames, "IVAntibiotic")
    extractor.addClinicalItemFeatures(extractor.parseClinicalItemFile(stdOpen("BloodCulture.tab")), patientByIndexTimeById, colNames, "BloodCulture")
    extractor.addClinicalItemFeatures(extractor.parseClinicalItemFile(stdOpen("RespViralPanel.tab")), patientByIndexTimeById, colNames, "RespViralPanel")
    extractor.addClinicalItemFeatures(extractor.parseClinicalItemFile(stdOpen("AnyICULifeSupport.tab")), patientByIndexTimeById, colNames, "AnyICULifeSupport")
    extractor.addClinicalItemFeatures(extractor.parseClinicalItemFile(stdOpen("AnyDNR.tab")), patientByIndexTimeById, colNames, "AnyDNR")
    extractor.addClinicalItemFeatures(extractor.parseClinicalItemFile(stdOpen("AnyVasoactive.tab")), patientByIndexTimeById, colNames, "AnyVasoactive")
    extractor.addClinicalItemFeatures(extractor.parseClinicalItemFile(stdOpen("AnyCRRT.tab")), patientByIndexTimeById, colNames, "AnyCRRT")
    extractor.addClinicalItemFeatures(extractor.parseClinicalItemFile(stdOpen("AnyVentilator.tab")), patientByIndexTimeById, colNames, "AnyVentilator")
    extractor.addClinicalItemFeatures(extractor.parseClinicalItemFile(stdOpen("ComfortCare.tab")), patientByIndexTimeById, colNames, "ComfortCare")
    extractor.addClinicalItemFeatures(extractor.parseClinicalItemFile(stdOpen("PalliativeConsult.tab")), patientByIndexTimeById, colNames, "PalliativeConsult")
    extractor.addClinicalItemFeatures(extractor.parseClinicalItemFile(stdOpen("Death.tab")), patientByIndexTimeById, colNames, "Death")
    extractor.addClinicalItemFeatures(extractor.parseClinicalItemFile(stdOpen("Birth.tab")), patientByIndexTimeById, colNames, "Birth")
    extractor.addClinicalItemFeatures(extractor.parseClinicalItemFile(stdOpen("Male.tab")), patientByIndexTimeById, colNames, "Male")
    extractor.addClinicalItemFeatures(extractor.parseClinicalItemFile(stdOpen("Female.tab")), patientByIndexTimeById, colNames, "Female")
    extractor.addClinicalItemFeatures(extractor.parseClinicalItemFile(stdOpen("RaceWhiteNonHispanicLatino.tab")), patientByIndexTimeById, colNames, "RaceWhiteNonHispanicLatino")
    extractor.addClinicalItemFeatures(extractor.parseClinicalItemFile(stdOpen("RaceAsian.tab")), patientByIndexTimeById, colNames, "RaceAsian")
    extractor.addClinicalItemFeatures(extractor.parseClinicalItemFile(stdOpen("RaceWhiteHispanicLatino.tab")), patientByIndexTimeById, colNames, "RaceWhiteHispanicLatino")
    extractor.addClinicalItemFeatures(extractor.parseClinicalItemFile(stdOpen("RaceHispanicLatino.tab")), patientByIndexTimeById, colNames, "RaceHispanicLatino")
    extractor.addClinicalItemFeatures(extractor.parseClinicalItemFile(stdOpen("RaceUnknown.tab")), patientByIndexTimeById, colNames, "RaceUnknown")
    extractor.addClinicalItemFeatures(extractor.parseClinicalItemFile(stdOpen("RaceOther.tab")), patientByIndexTimeById, colNames, "RaceOther")
    extractor.addClinicalItemFeatures(extractor.parseClinicalItemFile(stdOpen("RaceBlack.tab")), patientByIndexTimeById, colNames, "RaceBlack")
    extractor.addClinicalItemFeatures(extractor.parseClinicalItemFile(stdOpen("RacePacificIslander.tab")), patientByIndexTimeById, colNames, "RacePacificIslander")
    extractor.addClinicalItemFeatures(extractor.parseClinicalItemFile(stdOpen("RaceNativeAmerican.tab")), patientByIndexTimeById, colNames, "RaceNativeAmerican")

    log.info("Systemically Scan for Charlson comorbidities and Treatment Team categories")
    for filename in os.listdir("."):
        if filename.startswith(CHARLSON_PREFIX):
            diseaseName = filename
            if filename.endswith(".tab"):
                diseaseName = filename[:-len(".tab")]
            extractor.addClinicalItemFeatures(extractor.parseClinicalItemFile(stdOpen(filename)), patientByIndexTimeById, colNames, diseaseName)

        if filename.startswith(TREATMENT_TEAM_PREFIX):
            teamName = filename
            if filename.endswith(".tab"):
                teamName = filename[:-len(".tab")]
            extractor.addClinicalItemFeatures(extractor.parseClinicalItemFile(stdOpen(filename)), patientByIndexTimeById, colNames, teamName)

    log.info("Output feature matrix file with row per patient day")
    formatter = TextResultsFormatter(featureMatrixFile)
    formatter.formatTuple(colNames)
    for patientId, patientByIndexTime in patientByIndexTimeById.iteritems():
        patientResults = patientByIndexTime.values()
        formatter.formatResultDicts(patientResults, colNames)

    timer = time.time() - timer
    print >> sys.stderr, "%.3f seconds to complete" % timer
def main(self, argv):
    """Main method, callable from command line"""
    usageStr =  "usage: %prog [options] <inputFile> <outputFile>\n"+\
                "   <inputFile>  Tab-delimited file of data\n"+\
                "   <outputFile> Tab-delimited file with relational table of t-test p-values for each sub-group pair. Specify \"-\" to send to stdout.\n"
    parser = OptionParser(usage=usageStr)
    parser.add_option("-l", "--labelCols", dest="labelCols", help="Comma-separated list of the column headers to label data rows as belonging to different subgroups")
    parser.add_option("-v", "--valueCols", dest="valueCols", help="Comma-separated list of the column headers for data values want to calculate statistics for")
    parser.add_option("-m", "--matchCols", dest="matchCols", help="Comma-separated list of the column headers to match groups on, like row identifiers. If not exists, then do independent t-tests rather than paired.")
    parser.add_option("-b", "--baseLabels", dest="baseLabels", help="Comma-separated list of values that the labelCols should have to represent which base method to compare all other methods to as a reference (otherwise do a full n^2 cartesian product of all combinations).")
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: " + str.join(" ", argv))
    timer = time.time()
    if len(args) > 1:
        inputFile = stdOpen(args[0])
        outputFile = stdOpen(args[1], "w")

        labelCols = options.labelCols.split(",")
        valueCols = options.valueCols.split(",")
        matchCols = None
        if options.matchCols:
            matchCols = options.matchCols.split(",")
        baseLabels = None
        if options.baseLabels:
            baseLabels = options.baseLabels.split(",")

        # Print comment line with arguments to allow for deconstruction later as well as extra results
        summaryData = {"argv": argv}
        print >> outputFile, COMMENT_TAG, json.dumps(summaryData)

        # Tab-delimited output formatting
        formatter = TextResultsFormatter(outputFile)

        # Prep generator first, so will be able to extract out relevant header columns
        rowGenerator = self(inputFile, labelCols, valueCols, matchCols, baseLabels)

        # Insert a mock record to get a header / label row
        colNames = self.resultHeaders(labelCols, valueCols, matchCols)
        formatter.formatResultDict(RowItemModel(colNames, colNames), colNames)

        # Stream the data rows to the output to avoid storing all in memory
        for outputDict in rowGenerator:
            formatter.formatResultDict(outputDict, colNames)
    else:
        parser.print_help()
        sys.exit(-1)
    timer = time.time() - timer
    log.info("%.3f seconds to complete", timer)
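# Hedged standalone sketch of the paired-comparison statistic this module reports (the
# "ttest_rel.*" columns seen in the result-formatting functions above): scipy's paired t-test
# on matched value columns from two label groups, falling back to an independent t-test when
# no matchCols are available. The numbers below are invented purely for illustration.
from scipy.stats import ttest_rel, ttest_ind

group1Precision = [0.42, 0.38, 0.51, 0.47]  # e.g., one value per matched case for method 1
group2Precision = [0.40, 0.35, 0.49, 0.44]  # same matched cases for method 2
tStatPaired, pValuePaired = ttest_rel(group1Precision, group2Precision)     # Paired, when matchCols exist
tStatInd, pValueInd = ttest_ind(group1Precision, group2Precision)           # Independent, otherwise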
"Charlson.Diabetes.pre", "Charlson.DiabetesComplications.pre", "Charlson.HemiplegiaParaplegia.pre", "Charlson.LiverMild.pre", "Charlson.LiverModSevere.pre", "Charlson.Malignancy.pre", "Charlson.MalignancyMetastatic.pre", "Charlson.MI.pre", "Charlson.PepticUlcer.pre", "Charlson.PeripheralVascular.pre", "Charlson.Renal.pre", "Charlson.Rheumatic.pre", "self_pay", "PO2A.last", "Pulse.last", "NA.last", "CR.last", "HCT.last", "WBC.last", "BUN.last", "TBIL.last", "K.last", "Resp.last", "Temp.last", "Urine.last", "BP_Low_Diastolic.last", "BP_High_Systolic.last", "Glasgow.Coma.Scale.Score.last", "TT.Cardiology.pre", "TT.CCU.HF.pre", "TT.CCU.pre", "TT.HemeOnc.pre", "TT.Medicine.pre", "TT.MICU.pre", "TT.Neurology.pre", "TT.SICU.pre", "TT.SurgerySpecialty.pre", "TT.Transplant.pre", "TT.Trauma.pre", "self_pay" ] ofs = stdOpen("simulatedData.ICUDNR.tab", "w") formatter = TextResultsFormatter(ofs) formatter.formatTuple(colNames) # Header row random.seed(987654321) # Consistent seed for reproducibility nPatients = 10000 # Random generator parameters ageRange = [30, 80] incomeRange = [20000, 200000] incomeStep = 1000 femaleRate = 0.5 # Ranges on uniform distribution to assign race labels. Leave ~50% empty for default White race
def main(argv):
    """Main method, callable from command line"""
    usageStr =  "usage: %prog [options] <query> [<outputFile>]\n"+\
                "   <query>      Query to execute (probably enclosed in quotes (\"))\n"+\
                "   <outputFile> If query yields a result set, then that will be output\n"+\
                "                to the named file. Specify \"-\" to send to stdout.\n"
    parser = OptionParser(usage=usageStr)
    parser.add_option("-c", "--incCols", dest="incCols", action="store_true", help="If set when executing a SELECT statement, then a line will be added before the result set with the names of the data columns.")
    parser.add_option("-C", "--incCommand", dest="incCommand", action="store_true", help="If set when executing a SELECT statement, then add a comment header line with the command-line argv.")
    parser.add_option("-s", "--script", dest="script", action="store_true", help="Interpret the first argument <query> as the name of a DB (SQL) script to run instead. Use \"-\" to specify stdin.")
    parser.add_option("-i", "--input", dest="input", metavar="<inputFile>", help="Open the named whitespace-delimited file and insert its contents into the database. Use with -t option. The remaining \"normal\" arguments are expected and will be taken as the ordered list of column names the file data is to be inserted under. Alternatively, the first row of the file will be considered the column names. Use \"-\" to specify stdin.")
    parser.add_option("-u", "--update", dest="update", metavar="<dataFile>", help="Open the named whitespace-delimited file and update its contents into the database. Use with -t and -n options. The remaining \"normal\" arguments are expected and will be taken as the ordered list of column names the file data is to be updated under. Alternatively, the first row of the file will be considered the column names. Use \"-\" to specify stdin.");
    parser.add_option("-t", "--table", dest="table", metavar="<tableName>", help="If inserting / updating a file with the -i or -u option, specify the name of the DB table to insert into")
    parser.add_option("-d", "--delim", dest="delim", metavar="<delimiter>", help="If inserting / updating a file with the -i or -u option, specify the character to delimit values by. Default to \\t tabs, but can specify something else. Alternatively, this can be used to specify what delimiter to use when formatting query output.")
    parser.add_option("-n", "--nIdCols", dest="nIdCols", default="1", help="If updating a file with the -u option, assume that the first column is the ID column not to update into the database, but to identify the respective row to update. If more than 1 column is needed, specify with this option.")
    parser.add_option("-o", "--output", dest="output", metavar="<outputFile>", help="If inserting a file with the -i option and want to get generated ID numbers from the inserted rows, specify this file to send them to.")
    parser.add_option("-e", "--skipErrors", dest="skipErrors", action="store_true", help="If inserting or updating a file or running a script with the -s option, keep running the remainder of the inserts or script commands even if one causes an exception.")
    parser.add_option("-f", "--dateColFormats", dest="dateColFormats", metavar="<dateColFormats>", help="If inserting a file, can specify columns that should be interpreted as date strings to be parsed into datetime objects. Provide comma-separated list, and optional | separated Python date parsing format (e.g., 'MyDateTime1|%m/%d/%Y %H:%M:%S,MyDateTime2'). http://docs.python.org/library/datetime.html#strftime-strptime-behavior.")
    parser.add_option("-x", "--escapeStrings", dest="escapeStrings", action="store_true", help="If inserting a file, can set whether to run all input strings through escape filter to avoid special characters compromising inserts.")
    (options, args) = parser.parse_args(argv[1:])

    # Correct escape character delimiter
    if options.delim == "\\t":
        options.delim = "\t";

    log.info("Starting: "+str.join(" ", argv))
    timer = time.time();

    if options.script and len(args) > 0:
        runDBScript( stdOpen(args[0],"r",sys.stdin), options.skipErrors )
    elif options.input is not None and options.table is not None:
        inputFile = stdOpen(options.input,"r",sys.stdin)
        outputFile = None
        if options.output != None:
            outputFile = stdOpen(options.output,"w",sys.stdout)

        dateColFormats = None;
        if options.dateColFormats is not None:
            dateColFormats = dict();
            colDateFormatComponents = options.dateColFormats.split(",");
            for colDateFormatComponent in colDateFormatComponents:
                colFormatChunks = colDateFormatComponent.split("|");
                colName = colFormatChunks[0];
                formatStr = None;
                if len(colFormatChunks) > 1:
                    formatStr = colFormatChunks[1];
                dateColFormats[colName] = formatStr;

        # If reading from a file (not standard input stream), do an extra pass to get size estimate to facilitate progress tracker
        estInput = None;
        if not isStdFile(options.input):
            lineCountFile = stdOpen(options.input);
            estInput = fileLineCount(lineCountFile);

        nInserts = insertFile( inputFile, options.table, args, options.delim, outputFile, options.skipErrors, dateColFormats=dateColFormats, escapeStrings=options.escapeStrings, estInput=estInput );
        log.info("%d rows successfully inserted",nInserts)
    elif options.update is not None and options.table is not None:
        sourceFile = stdOpen(options.update,"r",sys.stdin);
        nIdCols = int(options.nIdCols);
        nUpdates = updateFromFile( sourceFile, options.table, args, nIdCols, options.delim, options.skipErrors );
        log.info("%d row updates completed",nUpdates);
    elif len(args) > 0:
        outFile = "-"   # Default to stdout if no outputFile specified
        if len(args) > 1:
            outFile = args[1]
        outFile = stdOpen( outFile, "w", sys.stdout )

        if options.incCommand:
            summaryData = {"argv": argv};
            print >> outFile, COMMENT_TAG, json.dumps(summaryData);

        textFormatter = TextResultsFormatter(outFile, options.delim)
        results = execute( args[0], formatter=textFormatter, includeColumnNames=options.incCols )
        log.info("%d rows affected (or other return code)",results);
    else:
        parser.print_help()
        sys.exit(-1)

    timer = time.time() - timer;
    log.info("%.3f seconds to complete",timer);