def test_numRecsByOrderSet(self):
    # Designate number of recommendations indirectly via linked order set id
    DBUtil.execute("update clinical_item set default_recommend = 0 where clinical_item_id = -8")  # Disable default recommend on one item to shift results

    colNames = ["patient_id", "TP", "FN", "FP", "recall", "precision", "F1-score", "weightRecall", "weightPrecision", "ROC-AUC"]
    expectedResults = [RowItemModel([-11111, 2, 0, 3, 1.0, 0.4, 0.571, 1.0, 0.3178, 0.4167], colNames)]

    # Do through fabricated prepared file intermediary
    sys.stdout = StringIO()
    argv = ["PreparePatientItems.py", "-q", "2", "-v", "3", "0,-11111", "-"]
    self.preparer.main(argv)
    preparedDataFile = StringIO(sys.stdout.getvalue())

    # Artificially add a key order set ID for the fabricated data
    modFile = StringIO()
    formatter = TextResultsFormatter(modFile)
    dataCols = None
    for i, dataRow in enumerate(TabDictReader(preparedDataFile)):
        dataRow["order_set_id"] = TEST_ORDERSET_ID
        if i <= 0:
            dataCols = list(dataRow.keys())
            formatter.formatTuple(dataCols)  # Insert a mock record to get a header / label row
        formatter.formatResultDict(dataRow, dataCols)
    preparedDataFile = StringIO(modFile.getvalue())

    sys.stdin = preparedDataFile  # Read prepared data file from redirected stdin
    sys.stdout = StringIO()
    #argv = ["RecommendationClassificationAnalysis.py","-P","-r","5","-m","0","-R","ItemAssociationRecommender",'-',"-"]
    argv = ["RecommendationClassificationAnalysis.py", "-P", "--numRecsByOrderSet", "-m", "0", "-R", "ItemAssociationRecommender", "-", "-"]
    self.analyzer.main(argv)
    textOutput = StringIO(sys.stdout.getvalue())

    self.assertEqualStatResultsTextOutput(expectedResults, textOutput, colNames)
def main_formatMergedTTests(argv):
    ifs = stdOpen(BASE_RESULT_DIR + CONCATENATE_FILENAME)
    ofs = stdOpen(BASE_RESULT_DIR + FILTERED_FILENAME, "w")

    summaryData = {"argv": argv}
    print(COMMENT_TAG, json.dumps(summaryData), file=ofs)

    outputCols = \
        [   "SortType", "TopicCount", "VerifyTime",
            "Group1.precision.mean", "Group1.recall.mean", "Group1.normalprecision.mean", "Group1.weightrecall.mean", "Group1.roc_auc.mean",
            "ttest_rel.precision", "ttest_rel.recall", "ttest_rel.weightrecall", "ttest_rel.roc_auc",
            "Group1.numqueryitems.mean", "Group1.numverifyitems.mean", "Group1.numrecommendeditems.mean", "Group1.tp.mean",
        ]
    formatter = TextResultsFormatter(ofs)
    formatter.formatTuple(outputCols)  # Output header row

    reader = TabDictReader(ifs)
    for row in reader:
        row["SortType"] = row["Group1._s"]

        # Extract numerical data out of filename text parameters
        row["TopicCount"] = None
        if row["Group1._m"] != 'None':
            # Expecting model name strings of the form: "models/topicModel.first24hourItems.2013.1234567890.filter.bow.gz.64Topic.model"
            topicChunk = row["Group1._m"].split(".")[-2]  # Expect the second-to-last period-delimited chunk to contain the topic count
            topicChunk = topicChunk[:topicChunk.find("Topic")]  # Remove trailing "Topic" text
            row["TopicCount"] = int(topicChunk)

        # Expecting result file name argument of the form: "results/byOrderSets/01minutes/filteredResults.tab.gz"
        timeChunk = row["args[0]"].split("/")[-2]
        timeChunk = timeChunk[:timeChunk.find("minutes")]
        row["VerifyTime"] = int(timeChunk)

        formatter.formatResultDict(row, outputCols)

    ifs.close()
    ofs.close()
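# A minimal, self-contained sketch of the filename parsing above, runnable on its own.
# Both example names are hypothetical but follow the patterns documented in the comments.
modelName = "models/topicModel.first24hourItems.2013.1234567890.filter.bow.gz.64Topic.model"
topicChunk = modelName.split(".")[-2]                    # "64Topic"
topicCount = int(topicChunk[:topicChunk.find("Topic")])  # 64

resultPath = "results/byOrderSets/01minutes/filteredResults.tab.gz"
timeChunk = resultPath.split("/")[-2]                    # "01minutes"
verifyTime = int(timeChunk[:timeChunk.find("minutes")])  # 1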
def parseLabResultsFile(labFile):
    log.info("Parse lab results file")
    prog = ProgressDots()
    labsByBaseNameByPatientId = dict()  # Dictionary of dictionaries of lists of result items
    for labResult in TabDictReader(labFile):
        if labResult["ord_num_value"] is not None and labResult["ord_num_value"] != NULL_STRING:
            patientId = int(labResult["pat_id"])
            labBaseName = labResult["base_name"]
            resultValue = float(labResult["ord_num_value"])
            resultTime = DBUtil.parseDateValue(labResult["result_time"])

            if resultValue < LAB_SENTINEL_VALUE:  # Skip apparent placeholder values
                labResult["pat_id"] = labResult["patient_id"] = patientId
                labResult["ord_num_value"] = resultValue
                labResult["result_time"] = resultTime

                if patientId not in labsByBaseNameByPatientId:
                    labsByBaseNameByPatientId[patientId] = dict()
                if labBaseName not in labsByBaseNameByPatientId[patientId]:
                    labsByBaseNameByPatientId[patientId][labBaseName] = list()
                labsByBaseNameByPatientId[patientId][labBaseName].append(labResult)
        prog.update()
    prog.printStatus()
    return labsByBaseNameByPatientId
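# Hedged usage sketch for parseLabResultsFile: feed it a small tab-delimited file via
# StringIO and walk the nested {patientId: {base_name: [labResult, ...]}} structure.
# Assumes LAB_SENTINEL_VALUE is a large placeholder cutoff well above real results;
# the sample rows below are illustrative, not real data.
from io import StringIO

sampleLabFile = StringIO(
    "pat_id\tbase_name\tord_num_value\tresult_time\n"
    "-123\tNA\t140\t2010-01-01 08:00:00\n"
    "-123\tNA\t143\t2010-01-02 08:00:00\n"
)
labsByBaseNameByPatientId = parseLabResultsFile(sampleLabFile)
for patientId, labsByBaseName in labsByBaseNameByPatientId.items():
    for baseName, labResults in labsByBaseName.items():
        print(patientId, baseName, [lab["ord_num_value"] for lab in labResults])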
def loadDocCountByWordId(self, filename):
    """Given the name of a top topics file, load the section reporting
    the overall word document counts.
    """
    docCountByWordId = dict()
    reader = TabDictReader(stdOpen(filename))
    for topicItem in reader:
        if topicItem["topic_id"] == NULL_STRING:  # All-documents section, not topic specific
            itemId = None
            if topicItem["item_id"] != NULL_STRING:
                itemId = int(topicItem["item_id"])
            docCount = int(topicItem["score"])
            docCountByWordId[itemId] = docCount
    reader.close()
    return docCountByWordId
def parseScoreModelsFromFile(self, inputFile, colOutcome=None, scoreCols=None):
    """Structured variant of the above. Assume named columns and just return
    combined dictionary / RowItemModels.
    """
    scoreModels = list()
    for scoreModel in TabDictReader(inputFile):
        # Data parsing for any named columns
        if colOutcome is not None:
            outcome = OUTCOME_PRESENT
            if scoreModel[colOutcome] in NEGATIVE_OUTCOME_STRS:
                outcome = OUTCOME_ABSENT
            scoreModel[colOutcome] = outcome

        # Temporary hack to get P-Fisher-NegLog into the dataset
        import math
        if scoreCols is not None and "P-Fisher-NegLog" in scoreCols:
            p = float(scoreModel["P-Fisher"])
            logP = -sys.float_info.max
            if p > 0.0:
                logP = math.log(p, 10)
            if float(scoreModel["OR"]) > 1.0:  # Parse before comparing; TabDictReader yields strings
                logP *= -1
            scoreModel["P-Fisher-NegLog"] = logP

        if scoreCols is not None:
            for colScore in scoreCols:
                scoreModel[colScore] = float(scoreModel[colScore])

        scoreModels.append(scoreModel)
    return scoreModels
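# Small worked example of the signed log transform above: log10 of the Fisher p-value,
# sign-flipped when the odds ratio indicates a positive association, so larger positive
# values mean stronger positive evidence. (Interpretation inferred from the code, not documented.)
import math

p = 0.001
logP = math.log(p, 10)  # Approximately -3.0
OR = 2.5                # Odds ratio > 1, so flip sign to mark the positive direction
if OR > 1.0:
    logP *= -1
assert abs(logP - 3.0) < 1e-9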
def test_merge(self):
    # Simulate data files
    inputFiles = \
        [   # JSON style header
            StringIO \
            ("""# {"argv": ["medinfo\\\\Score.py", "-R", "ItemAssociationRecommender", "-c", "135", "-Q", "14400", "-V", "86400", "-r", "10", "-a", "unweighted", "177340", "test.out"]}
id\toutcome\tscore
1\t0\t0.01
2\t0\t0.02
3\t1\t0.13
"""),
            # Simple list style header
            StringIO \
            ("""# ["medinfo\\\\Score.py", "-R", "ItemAssociationRecommender", "-c", "135", "-Q", "14400", "-V", "86400", "-r", "10", "-a", "weighted", "-s", "PPV", "177340", "test2.out"]
id\toutcome\tscore2
1\t0\t0.15
3\t1\t0.31
4\t1\t0.23
"""),
            # Extra comment
            StringIO \
            ("""# Generic extra comment + True/False option
# ["medinfo\\\\Score.py", "-X", "-R", "ItemAssociationRecommender", "-c", "135", "-Q", "14400", "-V", "86400", "-r", "10", "-a", "weighted", "-s", "prevalence", "141", "test3.out"]
id\toutcome\tscore2
5\t2\t0.15
1\t0\t0.22
3\t1\t0.42
"""),
            # No header comment
            StringIO \
            ("""id\toutcome\tscore
5\t2\t0.25
6\t3\t0.52
4\t1\t0.82
"""),
        ]

    # Call application
    keyCol = ["id", "outcome"]  # Key columns to expect to be the same
    suffixList = [".A", ".B", ".C", ".D"]  # Force suffixes to be added to all other columns
    colNames = ["id", "outcome", "score.A", "score2.B", "score2.C", "score.D"]
    outFile = StringIO()
    self.analyzer(inputFiles, keyCol, suffixList, outFile)

    # Read output back into structured objects, and validate matches expected
    testResults = list(TabDictReader(StringIO(outFile.getvalue())))
    expectedResults = \
        [   dict(zip(colNames, ["1", "0", "0.01", "0.15", "0.22", "nan"])),
            dict(zip(colNames, ["2", "0", "0.02", "nan", "nan", "nan"])),
            dict(zip(colNames, ["3", "1", "0.13", "0.31", "0.42", "nan"])),
            dict(zip(colNames, ["4", "1", "nan", "0.23", "nan", "0.82"])),
            dict(zip(colNames, ["5", "2", "nan", "nan", "0.15", "0.25"])),
            dict(zip(colNames, ["6", "3", "nan", "nan", "nan", "0.52"])),
        ]
    self.assertEqualList(expectedResults, testResults)
def test_TabDictReader(self):
    """Verify expected results when reading from different delimited file examples,
    particularly cases of messed up quoting or internal delimiter characters.
    """
    inFileStr = \
"""# Test comment line
order_proc_id,"pat_id",pat_enc_csn_id,ordering_date,"order_type",proc_id,"proc_code","description","display_name","cpt_code","proc_cat_name","order_class","authrzing_prov_id","abnormal_yn","lab_status","order_status",quantity,"future_or_stand",standing_exp_date,standing_occurs,stand_orig_occur,"radiology_status",proc_bgn_time,proc_end_time,order_inst,"stand_interval","discrete_interval",instantiated_time,order_time,result_time,proc_start_time,problem_list_id,proc_ending_time,chng_order_proc_id,last_stand_perf_dt,last_stand_perf_tm,parent_ce_order_id,"ordering_mode"
3488535,"7229924684871",444976,10/12/2009 00:00,"Nursing",472897,"NUR1018","MONITOR INTAKE AND OUTPUT","Monitor Intake And Output","NUR1018","NURSING - ASSESSMENT","Hospital Performed","376355","","","Sent",1,"",,,,"",,,10/12/2009 00:17,"","",10/12/2009 00:17,10/12/2009 00:17,,10/12/2009 04:00,,10/12/2009 00:00,,,,,"Inpatient"
4530091,"11715476458129",417026,11/19/2009 00:00,"Nursing",498171,"NUR1940","NURSING COMMUNICATION","Give patient "Bedside Insulin Pump Flow Sheet" to document insulin delivery, BG and carbohydrates","NUR1940","NURSING - TREATMENT","Hospital Performed","355432","","","",1,"S",,,1,"",,,11/19/2009 11:55,"CONTINUOUS","",,11/19/2009 11:55,,11/19/2009 12:00,,,,11/19/2009 00:00,,,"Inpatient"
5905631,"10720939760322",387975,01/16/2010 00:00,"Nursing",473324,"NUR1182","SIGN ABOVE BED 'DO NOT REPOSITION NG'","Sign above bed"Do not reposition NG"","NUR1182","NURSING - DRAINS AND TUBES","Hospital Performed","314969","","","Canceled",1,"S",,,1,"",,,01/16/2010 18:31,"CONTINUOUS","",,01/16/2010 18:31,,01/16/2010 18:45,,,,01/16/2010 00:00,,,"Inpatient"
"""
    inFile = StringIO(inFileStr)
    reader = TabDictReader(inFile, delimiter=",")
    parsedData = list(reader)  # Convert to an in-memory list

    expectedData = \
        [   {"order_proc_id": "3488535", "display_name": "Monitor Intake And Output"},
            {"order_proc_id": "4530091", "display_name": "Give patient \"Bedside Insulin Pump Flow Sheet\" to document insulin delivery, BG and carbohydrates"},
            {"order_proc_id": "5905631", "display_name": "Sign above bed\"Do not reposition NG\""},
        ]
    targetKeys = list(expectedData[0].keys())  # Check a subset of values for simplicity
    self.assertEqualDictList(expectedData, parsedData, targetKeys)

    # Another test on messed up order_med end double quote
    inFileStr = \
"""order_med_id,pat_id,pat_enc_csn_id,ordering_date,ORDER_CLASS_C,order_class_name,MEDICATION_ID,description,QUANTITY,REFILLS,start_taking_time,order_end_time,end_taking_time,RSN_FOR_DISCON_C,rsn_for_discon,MED_PRESC_PROV_ID,DISPLAY_NAME,ORDER_PRIORITY_C,order_priority,MED_ROUTE_C,med_route,discon_time,CHNG_ORDER_MED_ID,HV_DISCR_FREQ_ID,freq_name,discrete_frequency,HV_DISCRETE_DOSE,HV_DOSE_UNIT_C,hv_dose_unit,ORDER_STATUS_C,order_status,AUTHRZING_PROV_ID,ORD_PROV_ID,MIN_DISCRETE_DOSE,MAX_DISCRETE_DOSE,DOSE_UNIT_C,dose_unit,PAT_LOC_ID,department_name,MODIFY_TRACK_C,modify_track,ACT_ORDER_C,active_order,LASTDOSE,AMB_MED_DISP_NAME,REFILLS_REMAINING,RESUME_STATUS_C,resume_status,ORDERING_MODE_C,ordering_mode,MED_DIS_DISP_QTY,MED_DIS_DISP_UNIT_C,dispense_unit,number_of_doses,doses_remaining,min_rate,max_rate,rate_unit_c,rate_unit,min_duration,max_duration,med_duration_unit_c,duration_unit_name,min_volume,max_volume,volume_unit_c,volume_unit,calc_volume_yn,calc_min_dose,calc_max_dose,calc_dose_unit_c,calc_dose_unit,admin_min_dose,admin_max_dose,admin_dose_unit_c,admin_dose_unit
4880261,-11815067487752,418519,09/16/2009 11:49,8,"Fax",2567,"DOCUSATE SODIUM 50 MG/5 ML PO LIQD","","",09/16/2009 12:00,09/17/2009 21:26,09/18/2009 04:26,,"",360247,"docusate (COLACE) capsule 250 mg",,"",15,"Oral",09/18/2009 04:26,,"200006","2 TIMES DAILY","BID","250",3,"mg",9,"Discontinued",360247,377971,250,,3,"mg",2000262,"E2-ICU","","",3,"Discontinued Medication","","",,,"",2,"Inpatient",,,"",,,,,,"",,,,"",,,,"","Y",250,,3,"mg",1,,5003,"Cap"
5226027,-5331130233402,455118,06/26/2011 07:56,1,"Normal",8751,"WARFARIN 5 MG PO TABS","","",06/26/2011 18:00,06/27/2011 07:44,06/27/2011 14:44,,"",345517,"warfarin (COUMADIN) tablet 5 mg "Pharmacy Protocol"",,"",15,"Oral",06/27/2011 14:44,385146643,"200023","DAILY","QPM","5",3,"mg",9,"Discontinued",345517,372188,5,,3,"mg",2000273,"D3","2","MODIFIED",3,"Discontinued Medication","","",,,"",2,"Inpatient",,,"",,,,,,"",,,,"",,,,"","",5,,3,"mg",1,,5002,"Tab"
3579366,4662062643677,385175,04/18/2013 14:09,60,"E-Prescribe",89742,"NPH INSULIN HUMAN RECOMB 100 UNIT/ML SC SUSP","10 mL","2",04/18/2013 00:00,09/14/2013 00:00,09/14/2013 22:05,,"",314742,"",6,"Routine",18,"Subcutaneous",09/14/2013 22:05,417486280,"200009","2 TIMES DAILY WITH MEALS","BID with Meals","10",5,"Units",2,"Sent",314742,396419,10,,5,"Units",2000253,"C3","1","REORDERED",1,"Active Medication","8/10/2013","insulin NPH 100 unit/mL injection",2,2,"Sent",1,"Outpatient",10,1,"mL",,,,,,"",,,,"",,,,"","",10,,5,"Units",10,,5,"Units"
"""
    inFile = StringIO(inFileStr)
    reader = TabDictReader(inFile, delimiter=",")
    parsedData = list(reader)  # Convert to an in-memory list

    expectedData = \
        [   {"order_med_id": "4880261", "DISPLAY_NAME": "docusate (COLACE) capsule 250 mg"},
            {"order_med_id": "5226027", "DISPLAY_NAME": "warfarin (COUMADIN) tablet 5 mg \"Pharmacy Protocol\""},
            {"order_med_id": "3579366", "DISPLAY_NAME": ""},
        ]
    targetKeys = list(expectedData[0].keys())  # Check a subset of values for simplicity
    self.assertEqualDictList(expectedData, parsedData, targetKeys)
def assertEqualRecommendedDataStatsTextOutput(self, expectedData, textOutput, headers):
    """Run assertEqualGeneral on the key components of the contents of the
    recommendation data. In this case, we do want to verify actual
    score / stat values match.
    """
    recommendedData = list()
    for dataRow in TabDictReader(textOutput):
        for key, value in dataRow.items():
            if key in headers:
                dataRow[key] = float(value)  # Parse into numerical values for comparison
        recommendedData.append(dataRow)
    self.assertEqualRecommendedDataStats(expectedData, recommendedData, headers)
def parsePatientFile(patientFile, colNames):
    log.info("Parse patient file")
    patientById = dict()
    for patient in TabDictReader(patientFile):
        patientId = int(patient["patient_id"])
        patient["patient_id"] = patientId
        patientById[patientId] = patient
    colNames.extend(["patient_id", "dialysis", "surgery"])
    return patientById
def __call__(self, scoreFile, outcomeFile, linkCol, outcomeLabel, valueMin, valueMax, generateHeader=False):
    """Return a generator over dictionary objects representing the same contents
    as the scoreFile, but with an added entry / column, named outcomeLabel,
    reflecting the outcome information in the outcomeFile.
    Link the two together based on a common linkCol. Look for values in the
    outcomeFile; any whose value column is within [valueMin, valueMax] will be
    labeled with a positive outcome of +1, all else labeled 0.
    """
    scoreReader = TabDictReader(scoreFile)
    outcomeReader = TabDictReader(outcomeFile)

    # Find positive outcome labels by presence in the outcome reader.
    # If multiple exist for a link item ID, count any being positive as overall positive.
    outcomeByLinkId = dict()
    for outcomeDict in outcomeReader:
        linkId = outcomeDict[linkCol]
        value = float(outcomeDict["value"])
        if valueMin <= value <= valueMax:
            outcomeByLinkId[linkId] = OUTCOME_PRESENT

    # Now copy through the core score data, but adding the outcome column
    for scoreDict in scoreReader:
        linkId = scoreDict[linkCol]
        scoreDict[outcomeLabel] = OUTCOME_ABSENT
        if linkId in outcomeByLinkId:
            scoreDict[outcomeLabel] = OUTCOME_PRESENT

        if generateHeader:
            headerDict = RowItemModel(list(scoreDict.keys()), list(scoreDict.keys()))
            yield headerDict
            generateHeader = False  # Only need for the first row

        yield scoreDict
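# Hedged usage sketch for the labeling generator above. "OutcomeLabeler" is a hypothetical
# name for whatever class hosts this __call__; the file contents are illustrative.
# Rows whose outcome value falls within [valueMin, valueMax] get outcomeLabel = +1, others 0.
from io import StringIO

scoreFile = StringIO("itemId\tscore\n1\t0.5\n2\t0.9\n")
outcomeFile = StringIO("itemId\tvalue\n1\t7.2\n")

labeler = OutcomeLabeler()  # Hypothetical instantiation
for row in labeler(scoreFile, outcomeFile, linkCol="itemId", outcomeLabel="outcome",
                   valueMin=5.0, valueMax=10.0, generateHeader=True):
    print(row)  # First yield is a header row; then item 1 labeled present, item 2 absent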
def main_formatResults(argv):
    ifs = stdOpen(BASE_RESULT_DIR + FILTERED_FILENAME)
    ofs = stdOpen(BASE_RESULT_DIR + FORMATTED_FILENAME, "w")

    summaryData = {"argv": argv}
    print(COMMENT_TAG, json.dumps(summaryData), file=ofs)

    outputCols = ["SortType", "TopicCount", "TrainTime", "VerifyTime", "precision", "recall", "normalprecision", "weightrecall", "roc_auc"]
    formatter = TextResultsFormatter(ofs)
    formatter.formatTuple(outputCols)  # Output header row

    reader = TabDictReader(ifs)
    for row in reader:
        row["SortType"] = row["_s"]

        # Extract numerical data out of filename text parameters
        row["TopicCount"] = None
        row["TrainTime"] = None
        if row["_m"] != 'None':
            # Expecting model name strings of the form: "models/topicModel.firstItems.q14400.v14400.2013.1234567890.filter.bow.gz.16Topic.model"
            chunks = row["_m"].split(".")
            topicChunk = chunks[-2]  # Expect the second-to-last period-delimited chunk to contain the topic count
            topicChunk = topicChunk[:topicChunk.find("Topic")]  # Remove trailing "Topic" text
            row["TopicCount"] = int(topicChunk)

            for chunk in chunks:
                if chunk[0] == "q" and chunk[-1].isdigit():  # This should be the query time in seconds
                    queryTimeSeconds = int(chunk[1:])
                    queryTimeMinutes = queryTimeSeconds // 60
                    row["TrainTime"] = queryTimeMinutes

        # Expecting training file name argument of the form: "sourceData/first24hourOrderSets.2013.q86400.v14400.-12345.tab.gz"
        row["VerifyTime"] = None
        for chunk in row["args_0_"].split("."):
            if chunk[0] == "v" and chunk[-1].isdigit():  # This should be the verify time in seconds
                verifyTimeSeconds = int(chunk[1:])
                verifyTimeMinutes = verifyTimeSeconds // 60
                row["VerifyTime"] = verifyTimeMinutes

        formatter.formatResultDict(row, outputCols)

    ifs.close()
    ofs.close()
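# Isolated sketch of the q/v chunk parsing above on a hypothetical filename, with a guard
# against empty chunks (the inline version assumes every chunk is non-empty).
name = "sourceData/first24hourOrderSets.2013.q86400.v14400.-12345.tab.gz"
for chunk in name.split("."):
    if chunk and chunk[0] == "v" and chunk[-1].isdigit():
        print(int(chunk[1:]) // 60)  # 14400 seconds -> 240 minutes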
def assertExpectedTopItems(self, expectedDocCountByWordId, model, topTopicFile):
    # With a randomized optimization algorithm, cannot depend on stable test results
    # with each run. Instead, make sure results are internally consistent
    # and that the raw count data is consistent.

    # Values from model topic parameters
    scoreByItemIdByTopicId = dict()
    for (topicId, topicItems) in self.instance.enumerateTopics(model, ITEMS_PER_TOPIC):
        scoreByItemIdByTopicId[topicId] = dict()
        for (itemId, score) in topicItems:
            scoreByItemIdByTopicId[topicId][itemId] = score

    # Add expected word document counts under the "None" topic
    scoreByItemIdByTopicId[None] = expectedDocCountByWordId

    # Verify top topic files match
    topScoreByItemIdByTopicId = dict()
    itemsChecked = 0
    reader = TabDictReader(topTopicFile)
    for topicItem in reader:
        topicId = None
        if topicItem["topic_id"] != NULL_STRING:
            topicId = int(topicItem["topic_id"])
        itemId = None
        if topicItem["item_id"] != NULL_STRING:
            itemId = int(topicItem["item_id"])
        score = float(topicItem["score"])
        tfidf = float(topicItem["tfidf"])

        expectedTFIDF = 0.0
        if itemId in expectedDocCountByWordId and expectedDocCountByWordId[itemId] > 0:
            expectedTFIDF = score * expectedDocCountByWordId[None] / expectedDocCountByWordId[itemId]
        #print(topicId, itemId, score, tfidf, expectedDocCountByWordId[itemId], file=sys.stderr)
        self.assertAlmostEqual(expectedTFIDF, tfidf, places=5)

        if topicId not in topScoreByItemIdByTopicId:
            topScoreByItemIdByTopicId[topicId] = dict()
        topScoreByItemIdByTopicId[topicId][itemId] = score
        itemsChecked += 1
    self.assertTrue(itemsChecked > 0)  # Make sure an actual test happened

    for topicId, topScoreByItemId in topScoreByItemIdByTopicId.items():
        scoreByItemId = scoreByItemIdByTopicId[topicId]
        self.assertAlmostEqualsDict(topScoreByItemId, scoreByItemId, places=5)
def parseClinicalItemFile(itemFile, patientIdCol="patient_id", timeCol="item_date"):
    prog = ProgressDots()
    itemTimesByPatientId = dict()
    for itemData in TabDictReader(itemFile):
        patientId = int(itemData[patientIdCol])
        itemTime = DBUtil.parseDateValue(itemData[timeCol])

        itemData[patientIdCol] = patientId
        itemData[timeCol] = itemTime

        if patientId not in itemTimesByPatientId:
            itemTimesByPatientId[patientId] = list()
        itemTimesByPatientId[patientId].append(itemTime)
        prog.update()
    prog.printStatus()
    return itemTimesByPatientId
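# Hedged usage sketch: parseClinicalItemFile returns {patientId: [itemTime, ...]},
# which makes per-patient time lookups one-liners. Sample rows are illustrative and
# assume DBUtil.parseDateValue accepts this timestamp format.
from io import StringIO

sampleItemFile = StringIO(
    "patient_id\titem_date\n"
    "-456\t2011-03-01 12:00:00\n"
    "-456\t2011-03-02 09:30:00\n"
)
itemTimesByPatientId = parseClinicalItemFile(sampleItemFile)
firstTimeByPatientId = {patientId: min(times) for patientId, times in itemTimesByPatientId.items()}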
def __call__(self, inputFiles):
    """Return a generator over dictionary objects representing the concatenated
    contents of the input files after adding and accounting for argv parameter columns.
    """
    # Consolidate a master set of all column headers to use
    self.colNames = list()  # Keep track of consistent order found
    colSet = set()  # Keep track of unique items

    # Pull out any header comments that may represent an argv list to parse.
    # Pull out the header row with column labels for each input file.
    argvDicts = list()  # Dictionary for each input file, keyed by argv parameter name with respective value
    readers = list()  # TabDictReaders from which header columns can be accessed as fieldnames
    for inputFile in inputFiles:
        reader = TabDictReader(inputFile)
        readers.append(reader)

        for col in reader.fieldnames:
            if col not in colSet:
                colSet.add(col)
                self.colNames.append(col)

        argvDict = self.extract_argvDict(reader.commentLines)  # Must be called after reader.fieldnames so initial text parsing will start
        argvDicts.append(argvDict)
        for col in argvDict.keys():
            if col not in colSet:
                colSet.add(col)
                self.colNames.append(col)

    prog = ProgressDots(50, 1, "Files")
    # Now generate each file in succession, but "outer-joined" to include the master column header list
    for argvDict, reader in zip(argvDicts, readers):
        for resultDict in reader:
            resultDict.update(argvDict)
            for col in self.colNames:
                if col not in resultDict:
                    resultDict[col] = None
            yield resultDict
        prog.update()
        'HAPTO': 'HAPTOGLOBIN(HAP)',
        'MCV': 'MCV(MCV)',
        'RETICAB': 'RETIC, ABS(RETABS)',
        'HGB': 'HGB(CALC), ISTAT',
        'YSTFRR': 'SOL TRANSFERR REC',
        'TRFSAT': 'TRANSFERRIN SAT',
        'FE': 'IRON, TOTAL',
    }

timer = time.time()

featureMatrixFile = stdOpen("featureMatrix.tab")

log.info("Parse feature matrix file")
patientById = dict()
for patient in TabDictReader(featureMatrixFile):
    patientId = int(patient["patient_id"])
    patient["patient_id"] = patientId
    for labBaseName in labBaseNames:
        if patient[labBaseName] == NULL_STRING:
            patient[labBaseName] = None
        else:
            patient[labBaseName] = float(patient[labBaseName])
    patientById[patientId] = patient

log.info("Create plots of each metric against the index lab")
for labBaseName in labBaseNames:
    # Construct independent (x) and dependent (y) vectors from available feature data
    yList = list()
    xList = list()
    for patient in patientById.values():
"""Given 2D Table of values, spit out "melted" long-relational form to feed into antibiogramData.js""" import sys, os from medinfo.common.Const import NULL_STRING from medinfo.common.Util import stdOpen from medinfo.db.ResultsFormatter import TabDictReader, TextResultsFormatter ifs = stdOpen(sys.argv[1]) # Input tab delimited file ofs = stdOpen(sys.argv[2], "w") # "-" for stdout reader = TabDictReader(ifs) formatter = TextResultsFormatter(ofs) for row in reader: bug = row["Bug"] for key in reader.fieldnames: value = row[key] if key != "Bug" and value and value != NULL_STRING: formatter.formatTuple([value, bug, key])
def __call__(self, inputFile, labelCols, valueCols, matchCols, baseLabels=None):
    prog = ProgressDots()

    self.labelCols = labelCols
    self.valueCols = valueCols
    self.matchCols = matchCols
    self.baseLabels = baseLabels

    labelModelByLabelKey = dict()
    dataByLabelKey = dict()

    reader = TabDictReader(inputFile)
    for rowModel in reader:
        labelKey = list()
        labelModel = dict()
        for labelCol in self.labelCols:
            labelModel[labelCol] = rowModel[labelCol]
            labelKey.append(rowModel[labelCol])
        labelKey = tuple(labelKey)  # Change to an immutable object that can be hashed

        # Copy just the items of interest
        valueModel = {}
        if self.matchCols:
            for matchCol in self.matchCols:
                valueModel[matchCol] = rowModel[matchCol]
        for valueCol in self.valueCols:
            try:
                valueModel[valueCol] = float(rowModel[valueCol])
            except ValueError:  # Maybe a None string; could not parse into a number
                valueModel[valueCol] = None

        if labelKey not in dataByLabelKey:
            labelModelByLabelKey[labelKey] = labelModel
            dataByLabelKey[labelKey] = list()
        dataByLabelKey[labelKey].append(valueModel)
        prog.update()
    # prog.printStatus();

    # Another pass to ensure data is consistently sorted within each group to allow later paired t-tests
    if self.matchCols:
        for labelKey, data in dataByLabelKey.items():
            data.sort(key=RowItemFieldComparator(self.matchCols))  # Assumes RowItemFieldComparator yields a sort key for the match columns

    # See if looking for only one set of base-labeled data to compare the rest against
    baseLabelKey = None
    if self.baseLabels is not None:
        baseLabelKey = tuple(self.baseLabels)

    # Result pass to compare all group pair-wise combinations
    prog = ProgressDots()
    for labelKey0, data0 in dataByLabelKey.items():
        prefix0 = "Group0."
        labelModel0 = labelModelByLabelKey[labelKey0]
        if baseLabelKey is not None and labelKey0 != baseLabelKey:
            continue  # Skip entries where the base label does not match the specified key

        for labelKey1, data1 in dataByLabelKey.items():
            prefix1 = "Group1."
            labelModel1 = labelModelByLabelKey[labelKey1]

            result = dict()
            for labelCol in self.labelCols:
                result[prefix0 + labelCol] = labelModel0[labelCol]
                result[prefix1 + labelCol] = labelModel1[labelCol]

            for valueCol in self.valueCols:
                # Pull out the value column for each data group. Previously sorted by match
                # cols to allow paired t-testing. Skip any value pairs with non-numeric / None values.
                values0 = list()
                values1 = list()
                for dataItem0, dataItem1 in zip(data0, data1):
                    if dataItem0[valueCol] is not None and dataItem1[valueCol] is not None:
                        values0.append(dataItem0[valueCol])
                        values1.append(dataItem1[valueCol])

                for summaryFunction in SUMMARY_FUNCTIONS:
                    result[prefix0 + valueCol + "." + summaryFunction.__name__] = summaryFunction(values0)
                    result[prefix1 + valueCol + "." + summaryFunction.__name__] = summaryFunction(values1)

                for compTest in COMPARISON_TESTS:
                    (t, p) = compTest(values0, values1)
                    if np.isnan(p):
                        p = None  # Use a more generic expression for NaN / null value
                    result[compTest.__name__ + "." + valueCol] = p

            yield result
            prog.update()
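# Hedged sketch of the pair-wise comparison core. COMPARISON_TESTS is assumed to hold
# scipy-style paired tests such as scipy.stats.ttest_rel that return (statistic, p-value).
# The matched sort above is what makes the paired test valid: values0[i] and values1[i]
# must describe the same match-key item. The numbers here are illustrative.
import numpy as np
from scipy import stats

values0 = [0.70, 0.65, 0.80]  # e.g., Group0 precision per matched item
values1 = [0.72, 0.66, 0.83]  # Group1 precision for the same items, in the same order
t, p = stats.ttest_rel(values0, values1)
if np.isnan(p):
    p = None  # Same NaN -> None normalization as the result pass above
print(t, p)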