def test_numRecsByOrderSet(self):
        # Designate number of recommendations indirectly via linked order set id 

        DBUtil.execute("update clinical_item set default_recommend = 0 where clinical_item_id = -8");   # Disable default recommend on one item to shift results

        colNames = ["patient_id", "TP", "FN", "FP",  "recall", "precision", "F1-score", "weightRecall","weightPrecision", "ROC-AUC"];
        expectedResults = [ RowItemModel([-11111, 2, 0, 3, 1.0, 0.4, 0.571,  1.0, 0.3178, 0.4167], colNames ) ];

        # Do through fabricated prepared file intermediary
        sys.stdout = StringIO();    
        argv = ["PreparePatientItems.py","-q","2","-v","3",'0,-11111',"-"];
        self.preparer.main(argv);
        preparedDataFile = StringIO(sys.stdout.getvalue());
        
        # Artificially add a key order set ID for the fabricated data
        modFile = StringIO();
        formatter = TextResultsFormatter(modFile);
        dataCols = None;
        for i, dataRow in enumerate(TabDictReader(preparedDataFile)):
            dataRow["order_set_id"] = TEST_ORDERSET_ID;
            if i <= 0:
                dataCols = list(dataRow.keys());
                formatter.formatTuple(dataCols);    # Insert a mock record to get a header / label row
            formatter.formatResultDict(dataRow, dataCols);
        preparedDataFile = StringIO(modFile.getvalue());

        sys.stdin = preparedDataFile;   # Read prepared data file from redirected stdin
        sys.stdout = StringIO();
        #argv = ["RecommendationClassificationAnalysis.py","-P","-r","5","-m","0","-R","ItemAssociationRecommender",'-',"-"];
        argv = ["RecommendationClassificationAnalysis.py","-P","--numRecsByOrderSet","-m","0","-R","ItemAssociationRecommender",'-',"-"];
        self.analyzer.main(argv);
        textOutput = StringIO(sys.stdout.getvalue());
        self.assertEqualStatResultsTextOutput(expectedResults, textOutput, colNames);
Example 2
def main_formatMergedTTests(argv):
    ifs = stdOpen(BASE_RESULT_DIR+CONCATENATE_FILENAME);
    ofs = stdOpen(BASE_RESULT_DIR+FILTERED_FILENAME, "w");

    summaryData = {"argv": argv};
    print(COMMENT_TAG, json.dumps(summaryData), file=ofs);

    outputCols = ["SortType","TopicCount","VerifyTime","Group1.precision.mean","Group1.recall.mean","Group1.normalprecision.mean","Group1.weightrecall.mean","Group1.roc_auc.mean","ttest_rel.precision","ttest_rel.recall","ttest_rel.weightrecall","ttest_rel.roc_auc","Group1.numqueryitems.mean","Group1.numverifyitems.mean","Group1.numrecommendeditems.mean","Group1.tp.mean"];
    formatter = TextResultsFormatter(ofs);
    formatter.formatTuple(outputCols);  # Output header row

    reader = TabDictReader(ifs);
    for row in reader:
        row["SortType"] = row["Group1._s"];

        # Extract out numerical data from filename text parameters
        row["TopicCount"] = None;
        if row["Group1._m"] != 'None':
            # Expecting model name strings of the form: "models/topicModel.first24hourItems.2013.1234567890.filter.bow.gz.64Topic.model"
            topicChunk = row["Group1._m"].split(".")[-2];   # Expect second to last period-delimited chunk to contain topic count
            topicChunk = topicChunk[:topicChunk.find("Topic")]; # Remove trailing Topic text
            row["TopicCount"] = int(topicChunk);

        # Expecting result file name argument of the form: "results/byOrderSets/01minutes/filteredResults.tab.gz"
        timeChunk = row["args[0]"].split("/")[-2];
        timeChunk = timeChunk[:timeChunk.find("minutes")];
        row["VerifyTime"] = int(timeChunk);

        formatter.formatResultDict(row, outputCols);

    ifs.close();
    ofs.close();
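
The filename parsing above leans on the naming conventions described in the comments; a quick standalone check of both extractions (sample names taken from those comments):

modelName = "models/topicModel.first24hourItems.2013.1234567890.filter.bow.gz.64Topic.model"
topicChunk = modelName.split(".")[-2]                    # "64Topic"
assert int(topicChunk[:topicChunk.find("Topic")]) == 64  # TopicCount

resultPath = "results/byOrderSets/01minutes/filteredResults.tab.gz"
timeChunk = resultPath.split("/")[-2]                    # "01minutes"
assert int(timeChunk[:timeChunk.find("minutes")]) == 1   # VerifyTime
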
Example 3
def parseLabResultsFile(labFile):
    log.info("Parse lab results file");
    prog = ProgressDots();
    labsByBaseNameByPatientId = dict(); # Dictionary of dictionaries of lists of result items
    for labResult in TabDictReader(labFile):
        if labResult["ord_num_value"] is not None and labResult["ord_num_value"] != NULL_STRING:
            patientId = int(labResult["pat_id"]);
            labBaseName = labResult["base_name"];
            resultValue = float(labResult["ord_num_value"]);
            resultTime = DBUtil.parseDateValue(labResult["result_time"]);

            if resultValue < LAB_SENTINEL_VALUE:    # Skip apparent placeholder values
                labResult["pat_id"] = labResult["patient_id"] = patientId;
                labResult["ord_num_value"] = resultValue;
                labResult["result_time"] = resultTime;

                if patientId not in labsByBaseNameByPatientId:
                    labsByBaseNameByPatientId[patientId] = dict();
                if labBaseName not in labsByBaseNameByPatientId[patientId]:
                    labsByBaseNameByPatientId[patientId][labBaseName] = list();
                labsByBaseNameByPatientId[patientId][labBaseName].append( labResult );

        prog.update();
    prog.printStatus();
    return labsByBaseNameByPatientId;
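
A simplified sketch of the grouping structure this builds (date parsing and id rewriting omitted), with csv.DictReader standing in for TabDictReader and a hypothetical sentinel threshold:

import csv, io

LAB_SENTINEL_VALUE = 9999999    # hypothetical placeholder threshold
labFile = io.StringIO(
    "pat_id\tbase_name\tord_num_value\tresult_time\n"
    "-123\tNA\t140\t2009-10-12 00:17\n"
    "-123\tNA\t9999999\t2009-10-13 08:00\n"     # placeholder value, should be skipped
)
labsByBaseNameByPatientId = dict()
for labResult in csv.DictReader(labFile, delimiter="\t"):
    resultValue = float(labResult["ord_num_value"])
    if resultValue < LAB_SENTINEL_VALUE:    # Skip apparent placeholder values
        labsByBaseNameByPatientId.setdefault(int(labResult["pat_id"]), dict()) \
            .setdefault(labResult["base_name"], list()).append(labResult)
print(labsByBaseNameByPatientId)    # {-123: {'NA': [<the 140 result row>]}}
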
Example 4
    def loadDocCountByWordId(self, filename):
        """Given the name of a top topics file,
        load the section reporting the overall word document counts
        """
        docCountByWordId = dict()
        reader = TabDictReader(stdOpen(filename))
        for topicItem in reader:
            if topicItem["topic_id"] == NULL_STRING:  # All document section, not topic specific
                itemId = None
                if topicItem["item_id"] != NULL_STRING:
                    itemId = int(topicItem["item_id"])
                docCount = int(topicItem["score"])
                docCountByWordId[itemId] = docCount
        reader.close()
        return docCountByWordId
Example 5
    def parseScoreModelsFromFile(self,
                                 inputFile,
                                 colOutcome=None,
                                 scoreCols=None):
        """Structured variant of above.  Assume named columns and just return combined dictionary / RowItemModels
        """
        scoreModels = list()
        for scoreModel in TabDictReader(inputFile):
            # Data parsing for any named columns
            if colOutcome is not None:
                outcome = OUTCOME_PRESENT
                if scoreModel[colOutcome] in NEGATIVE_OUTCOME_STRS:
                    outcome = OUTCOME_ABSENT
                scoreModel[colOutcome] = outcome

            # Temporary hack to get P-Fisher-NegLog into dataset
            import math
            if scoreCols is not None and "P-Fisher-NegLog" in scoreCols:
                p = float(scoreModel["P-Fisher"])
                logP = -sys.float_info.max
                if p > 0.0:
                    logP = math.log(p, 10)
                if scoreModel["OR"] > 1.0:
                    logP *= -1
                scoreModel["P-Fisher-NegLog"] = logP

            if scoreCols is not None:
                for colScore in scoreCols:
                    scoreModel[colScore] = float(scoreModel[colScore])

            scoreModels.append(scoreModel)
        return scoreModels
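
A worked example of the P-Fisher-NegLog transform above: take log10 of the p-value and flip the sign when the odds ratio indicates enrichment, so smaller p-values map to larger positive scores (numbers hypothetical):

import math

p, oddsRatio = 1e-3, 2.5
logP = math.log(p, 10)  # -3.0
if oddsRatio > 1.0:
    logP *= -1
print(logP)             # ~3.0; would stay near -3.0 for oddsRatio <= 1
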
Example 6
    def test_merge(self):
        # Simulate data files
        inputFiles = \
            [   # JSON style header
                StringIO \
                ("""# {"argv": ["medinfo\\\\Score.py", "-R", "ItemAssociationRecommender", "-c", "135", "-Q", "14400", "-V", "86400", "-r", "10", "-a", "unweighted", "177340", "test.out"]}
id\toutcome\tscore
1\t0\t0.01
2\t0\t0.02
3\t1\t0.13
"""),
                # Simple list style header
                StringIO \
                ("""# ["medinfo\\\\Score.py", "-R", "ItemAssociationRecommender", "-c", "135", "-Q", "14400", "-V", "86400", "-r", "10", "-a", "weighted", "-s", "PPV", "177340", "test2.out"]
id\toutcome\tscore2
1\t0\t0.15
3\t1\t0.31
4\t1\t0.23
"""),
                # Extra comment
                StringIO \
                ("""# Generic extra comment + True/False option
# ["medinfo\\\\Score.py", "-X", "-R", "ItemAssociationRecommender", "-c", "135", "-Q", "14400", "-V", "86400", "-r", "10", "-a", "weighted", "-s", "prevalence", "141", "test3.out"]
id\toutcome\tscore2
5\t2\t0.15
1\t0\t0.22
3\t1\t0.42
"""),
                # No header comment
                StringIO \
                ("""id\toutcome\tscore
5\t2\t0.25
6\t3\t0.52
4\t1\t0.82
"""),
            ]

        # Call application
        keyCol = ["id", "outcome"]
        # Key columns to expect to be the same
        suffixList = [".A", ".B", ".C", ".D"]
        # Force suffixes to be added to all other columns
        colNames = [
            "id", "outcome", "score.A", "score2.B", "score2.C", "score.D"
        ]
        outFile = StringIO()
        self.analyzer(inputFiles, keyCol, suffixList, outFile)

        # Read output back into structured objects, and validate matches expected
        testResults = list(TabDictReader(StringIO(outFile.getvalue())))
        expectedResults = \
            [   dict( zip(colNames, ["1","0","0.01","0.15","0.22", "nan"]) ),
                dict( zip(colNames, ["2","0","0.02", "nan", "nan", "nan"]) ),
                dict( zip(colNames, ["3","1","0.13","0.31","0.42", "nan"]) ),
                dict( zip(colNames, ["4","1", "nan","0.23", "nan","0.82"]) ),
                dict( zip(colNames, ["5","2", "nan", "nan","0.15","0.25"]) ),
                dict( zip(colNames, ["6","3", "nan", "nan", "nan","0.52"]) ),
            ]
        self.assertEqualList(expectedResults, testResults)
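
For reference, the outer-join behavior being tested can be sketched with pandas (an analogy, not the module's actual implementation): rows are matched on the key columns and missing cells come out as NaN, mirroring the "nan" strings expected above.

import pandas as pd

a = pd.DataFrame({"id": [1, 2, 3], "outcome": [0, 0, 1], "score": [0.01, 0.02, 0.13]})
b = pd.DataFrame({"id": [1, 3, 4], "outcome": [0, 1, 1], "score2": [0.15, 0.31, 0.23]})
print(a.merge(b, on=["id", "outcome"], how="outer"))  # unmatched cells appear as NaN
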
Example 7
    def test_TabDictReader(self):
        """Verify expected results when reading from different delimited file examples,
        particularly cases of messed up quoting or internal delimiter characters."""

        inFileStr = \
            """# Test comment line
            order_proc_id,"pat_id",pat_enc_csn_id,ordering_date,"order_type",proc_id,"proc_code","description","display_name","cpt_code","proc_cat_name","order_class","authrzing_prov_id","abnormal_yn","lab_status","order_status",quantity,"future_or_stand",standing_exp_date,standing_occurs,stand_orig_occur,"radiology_status",proc_bgn_time,proc_end_time,order_inst,"stand_interval","discrete_interval",instantiated_time,order_time,result_time,proc_start_time,problem_list_id,proc_ending_time,chng_order_proc_id,last_stand_perf_dt,last_stand_perf_tm,parent_ce_order_id,"ordering_mode"
            3488535,"7229924684871",444976,10/12/2009 00:00,"Nursing",472897,"NUR1018","MONITOR INTAKE AND OUTPUT","Monitor Intake And Output","NUR1018","NURSING - ASSESSMENT","Hospital Performed","376355","","","Sent",1,"",,,,"",,,10/12/2009 00:17,"","",10/12/2009 00:17,10/12/2009 00:17,,10/12/2009 04:00,,10/12/2009 00:00,,,,,"Inpatient"
            4530091,"11715476458129",417026,11/19/2009 00:00,"Nursing",498171,"NUR1940","NURSING COMMUNICATION","Give patient "Bedside Insulin Pump Flow Sheet" to document insulin delivery, BG and carbohydrates","NUR1940","NURSING - TREATMENT","Hospital Performed","355432","","","",1,"S",,,1,"",,,11/19/2009 11:55,"CONTINUOUS","",,11/19/2009 11:55,,11/19/2009 12:00,,,,11/19/2009 00:00,,,"Inpatient"
            5905631,"10720939760322",387975,01/16/2010 00:00,"Nursing",473324,"NUR1182","SIGN ABOVE BED 'DO NOT REPOSITION NG'","Sign above bed"Do not reposition NG"","NUR1182","NURSING - DRAINS AND TUBES","Hospital Performed","314969","","","Canceled",1,"S",,,1,"",,,01/16/2010 18:31,"CONTINUOUS","",,01/16/2010 18:31,,01/16/2010 18:45,,,,01/16/2010 00:00,,,"Inpatient"
            """
        inFile = StringIO(inFileStr)
        reader = TabDictReader(inFile, delimiter=",")
        parsedData = list(reader)  # Convert to an in-memory list
        expectedData = \
            [   {"order_proc_id":"3488535", "display_name":"Monitor Intake And Output"},
                {"order_proc_id":"4530091", "display_name":"Give patient \"Bedside Insulin Pump Flow Sheet\" to document insulin delivery, BG and carbohydrates"},
                {"order_proc_id":"5905631", "display_name":"Sign above bed\"Do not reposition NG\""},
            ]
        targetKeys = list(expectedData[0].keys())  # Check a subset of values for simplicity
        self.assertEqualDictList(expectedData, parsedData, targetKeys)

        # Another test on messed up order_med end double quote
        inFileStr = \
            """order_med_id,pat_id,pat_enc_csn_id,ordering_date,ORDER_CLASS_C,order_class_name,MEDICATION_ID,description,QUANTITY,REFILLS,start_taking_time,order_end_time,end_taking_time,RSN_FOR_DISCON_C,rsn_for_discon,MED_PRESC_PROV_ID,DISPLAY_NAME,ORDER_PRIORITY_C,order_priority,MED_ROUTE_C,med_route,discon_time,CHNG_ORDER_MED_ID,HV_DISCR_FREQ_ID,freq_name,discrete_frequency,HV_DISCRETE_DOSE,HV_DOSE_UNIT_C,hv_dose_unit,ORDER_STATUS_C,order_status,AUTHRZING_PROV_ID,ORD_PROV_ID,MIN_DISCRETE_DOSE,MAX_DISCRETE_DOSE,DOSE_UNIT_C,dose_unit,PAT_LOC_ID,department_name,MODIFY_TRACK_C,modify_track,ACT_ORDER_C,active_order,LASTDOSE,AMB_MED_DISP_NAME,REFILLS_REMAINING,RESUME_STATUS_C,resume_status,ORDERING_MODE_C,ordering_mode,MED_DIS_DISP_QTY,MED_DIS_DISP_UNIT_C,dispense_unit,number_of_doses,doses_remaining,min_rate,max_rate,rate_unit_c,rate_unit,min_duration,max_duration,med_duration_unit_c,duration_unit_name,min_volume,max_volume,volume_unit_c,volume_unit,calc_volume_yn,calc_min_dose,calc_max_dose,calc_dose_unit_c,calc_dose_unit,admin_min_dose,admin_max_dose,admin_dose_unit_c,admin_dose_unit
4880261,-11815067487752,418519,09/16/2009 11:49,8,"Fax",2567,"DOCUSATE SODIUM 50 MG/5 ML PO LIQD","","",09/16/2009 12:00,09/17/2009 21:26,09/18/2009 04:26,,"",360247,"docusate (COLACE) capsule 250 mg",,"",15,"Oral",09/18/2009 04:26,,"200006","2 TIMES DAILY","BID","250",3,"mg",9,"Discontinued",360247,377971,250,,3,"mg",2000262,"E2-ICU","","",3,"Discontinued Medication","","",,,"",2,"Inpatient",,,"",,,,,,"",,,,"",,,,"","Y",250,,3,"mg",1,,5003,"Cap"
5226027,-5331130233402,455118,06/26/2011 07:56,1,"Normal",8751,"WARFARIN 5 MG PO TABS","","",06/26/2011 18:00,06/27/2011 07:44,06/27/2011 14:44,,"",345517,"warfarin (COUMADIN) tablet 5 mg "Pharmacy Protocol"",,"",15,"Oral",06/27/2011 14:44,385146643,"200023","DAILY","QPM","5",3,"mg",9,"Discontinued",345517,372188,5,,3,"mg",2000273,"D3","2","MODIFIED",3,"Discontinued Medication","","",,,"",2,"Inpatient",,,"",,,,,,"",,,,"",,,,"","",5,,3,"mg",1,,5002,"Tab"
3579366,4662062643677,385175,04/18/2013 14:09,60,"E-Prescribe",89742,"NPH INSULIN HUMAN RECOMB 100 UNIT/ML SC SUSP","10 mL","2",04/18/2013 00:00,09/14/2013 00:00,09/14/2013 22:05,,"",314742,"",6,"Routine",18,"Subcutaneous",09/14/2013 22:05,417486280,"200009","2 TIMES DAILY WITH MEALS","BID with Meals","10",5,"Units",2,"Sent",314742,396419,10,,5,"Units",2000253,"C3","1","REORDERED",1,"Active Medication","8/10/2013","insulin NPH 100 unit/mL injection",2,2,"Sent",1,"Outpatient",10,1,"mL",,,,,,"",,,,"",,,,"","",10,,5,"Units",10,,5,"Units"
"""
        inFile = StringIO(inFileStr)
        reader = TabDictReader(inFile, delimiter=",")
        parsedData = list(reader)  # Convert to an in-memory list
        expectedData = \
            [   {"order_med_id":"4880261", "DISPLAY_NAME":"docusate (COLACE) capsule 250 mg"},
                {"order_med_id":"5226027", "DISPLAY_NAME":"warfarin (COUMADIN) tablet 5 mg \"Pharmacy Protocol\""},
                {"order_med_id":"3579366", "DISPLAY_NAME":""},
            ]
        targetKeys = list(expectedData[0].keys())  # Check a subset of values for simplicity
        self.assertEqualDictList(expectedData, parsedData, targetKeys)
Example 8
    def assertEqualRecommendedDataStatsTextOutput(self, expectedData, textOutput, headers):
        """Run assertEqualGeneral on the key components of the contents of the recommendation data.
        In this case, we do want to verify actual score / stat values match.
        """
        recommendedData = list();
        for dataRow in TabDictReader(textOutput):
            for key, value in dataRow.items():
                if key in headers:
                    dataRow[key] = float(value);    # Parse into numerical values for comparison
            recommendedData.append(dataRow);
        self.assertEqualRecommendedDataStats( expectedData, recommendedData, headers );
Example 9
def parsePatientFile(patientFile, colNames):
    log.info("Parse patient file");
    patientFile = stdOpen("patients.tab");
    patientById = dict();
    for patient in TabDictReader(patientFile):
        patientId = int(patient["patient_id"]);
        patient["patient_id"] = patientId;
        patientById[patientId] = patient;

    colNames.extend(["patient_id","dialysis","surgery"]);
    return patientById;    
Example 10
    def __call__(self,
                 scoreFile,
                 outcomeFile,
                 linkCol,
                 outcomeLabel,
                 valueMin,
                 valueMax,
                 generateHeader=False):
        """Return generator over dictionary objects representing
        the same contents as the scoreFile, but with added entry / column
        to reflect the outcome information in the outcomeFile, named outcomeLabel.
        Link the two together based on a common linkCol.
        Look for values in the outcomeFile; any row whose value column is within
        [valueMin, valueMax] will be labeled with a positive outcome of +1, all else labeled 0.
        """
        scoreReader = TabDictReader(scoreFile)
        outcomeReader = TabDictReader(outcomeFile)

        # Find positive outcome labels by presence in outcome reader.
        #   If multiple rows exist for a link item ID, any positive one counts as overall positive.
        outcomeByLinkId = dict()
        for outcomeDict in outcomeReader:
            linkId = outcomeDict[linkCol]
            value = float(outcomeDict["value"])
            if valueMin <= value and value <= valueMax:
                outcomeByLinkId[linkId] = OUTCOME_PRESENT

        # Now copy through core score data, but adding outcome column
        for scoreDict in scoreReader:
            linkId = scoreDict[linkCol]
            scoreDict[outcomeLabel] = OUTCOME_ABSENT
            if linkId in outcomeByLinkId:
                scoreDict[outcomeLabel] = OUTCOME_PRESENT

            if generateHeader:
                headerDict = RowItemModel(list(scoreDict.keys()), list(scoreDict.keys()))
                yield headerDict
                generateHeader = False  # Only need first row
            yield scoreDict
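
A hypothetical invocation of the generator above (the owning class is not shown here, so its name is assumed):

from io import StringIO

scoreFile = StringIO("patient_id\tscore\n1\t0.4\n2\t0.7\n")
outcomeFile = StringIO("patient_id\tvalue\n2\t5.0\n")
labeler = OutcomeLabeler()  # hypothetical name for the class defining __call__ above
for row in labeler(scoreFile, outcomeFile, "patient_id", "outcome", 1.0, 10.0, generateHeader=True):
    print(row)  # header row first, then score rows; patient 2 gets outcome 1, patient 1 gets 0
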
Example 11
def main_formatResults(argv):
    ifs = stdOpen(BASE_RESULT_DIR + FILTERED_FILENAME)
    ofs = stdOpen(BASE_RESULT_DIR + FORMATTED_FILENAME, "w")

    summaryData = {"argv": argv}
    print(COMMENT_TAG, json.dumps(summaryData), file=ofs)

    outputCols = [
        "SortType", "TopicCount", "TrainTime", "VerifyTime", "precision",
        "recall", "normalprecision", "weightrecall", "roc_auc"
    ]
    formatter = TextResultsFormatter(ofs)
    formatter.formatTuple(outputCols)  # Output header row

    reader = TabDictReader(ifs)
    for row in reader:
        row["SortType"] = row["_s"]

        # Extract out numerical data from filename text parameters
        row["TopicCount"] = None
        row["TrainTime"] = None
        if row["_m"] != 'None':
            # Expecting model name strings of the form: "models/topicModel.firstItems.q14400.v14400.2013.1234567890.filter.bow.gz.16Topic.model"
            chunks = row["_m"].split(".")
            topicChunk = chunks[-2]  # Expect second to last period-delimited chunk to contain topic count
            topicChunk = topicChunk[:topicChunk.find("Topic")]  # Remove trailing Topic text
            row["TopicCount"] = int(topicChunk)

            for chunk in chunks:
                if chunk[0] == "q" and chunk[-1].isdigit():  # This should be the query time in seconds
                    queryTimeSeconds = int(chunk[1:])
                    queryTimeMinutes = queryTimeSeconds // 60  # Integer minutes
                    row["TrainTime"] = queryTimeMinutes

        # Expecting training file name argument of the form: "sourceData/first24hourOrderSets.2013.q86400.v14400.-12345.tab.gz"
        row["VerifyTime"] = None
        for chunk in row["args_0_"].split("."):
            if chunk[0] == "v" and chunk[-1].isdigit(
            ):  # This should be the verify time in seconds
                verifyTimeSeconds = int(chunk[1:])
                verifyTimeMinutes = verifyTimeSeconds / 60
                row["VerifyTime"] = verifyTimeMinutes

        formatter.formatResultDict(row, outputCols)

    ifs.close()
    ofs.close()
Example 12
    def assertExpectedTopItems(self, expectedDocCountByWordId, model,
                               topTopicFile):
        # With a randomized optimization algorithm, cannot depend on stable
        #   test results with each run.  Instead make sure results are internally
        #   consistent, and that raw count data is consistent.

        # Values from model topic parameters
        scoreByItemIdByTopicId = dict()
        for (topicId, topicItems) in self.instance.enumerateTopics(
                model, ITEMS_PER_TOPIC):
            scoreByItemIdByTopicId[topicId] = dict()
            for (itemId, score) in topicItems:
                scoreByItemIdByTopicId[topicId][itemId] = score
        # Add expected word document counts under the "None" topic
        scoreByItemIdByTopicId[None] = expectedDocCountByWordId

        # Verify Top Topic Files match
        topScoreByItemIdByTopicId = dict()
        itemsChecked = 0
        reader = TabDictReader(topTopicFile)
        for topicItem in reader:
            topicId = None
            if topicItem["topic_id"] != NULL_STRING:
                topicId = int(topicItem["topic_id"])
            itemId = None
            if topicItem["item_id"] != NULL_STRING:
                itemId = int(topicItem["item_id"])
            score = float(topicItem["score"])
            tfidf = float(topicItem["tfidf"])

            expectedTFIDF = 0.0
            if itemId in expectedDocCountByWordId and expectedDocCountByWordId[itemId] > 0:
                expectedTFIDF = score * expectedDocCountByWordId[None] / expectedDocCountByWordId[itemId]

            #print >> sys.stderr, topicId, itemId, score, tfidf, expectedDocCountByWordId[itemId]
            self.assertAlmostEqual(expectedTFIDF, tfidf, places=5)

            if topicId not in topScoreByItemIdByTopicId:
                topScoreByItemIdByTopicId[topicId] = dict()
            topScoreByItemIdByTopicId[topicId][itemId] = score
            itemsChecked += 1
        self.assertTrue(itemsChecked > 0)  # Make sure an actual test happened

        for topicId, topScoreByItemId in topScoreByItemIdByTopicId.items():
            scoreByItemId = scoreByItemIdByTopicId[topicId]
            self.assertAlmostEqualsDict(topScoreByItemId,
                                        scoreByItemId,
                                        places=5)
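
Numeric spot-check of the TF-IDF expectation asserted above, score * totalDocCount / wordDocCount (values hypothetical):

score, totalDocCount, wordDocCount = 0.02, 1000, 40
expectedTFIDF = score * totalDocCount / wordDocCount
print(expectedTFIDF)    # 0.5
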
Example 13
def parseClinicalItemFile(itemFile, patientIdCol="patient_id", timeCol="item_date"):
    prog = ProgressDots();
    itemTimesByPatientId = dict();
    for itemData in TabDictReader(itemFile):
        patientId = int(itemData[patientIdCol]);
        itemTime = DBUtil.parseDateValue(itemData[timeCol]);

        itemData[patientIdCol] = patientId;
        itemData[timeCol] = itemTime;

        if patientId not in itemTimesByPatientId:
            itemTimesByPatientId[patientId] = list();
        itemTimesByPatientId[patientId].append( itemTime );

        prog.update();
    prog.printStatus();

    return itemTimesByPatientId;
Example 14
    def __call__(self, inputFiles):
        """Return generator over dictionary objects representing
        the concatenated contents of the input files after adding and accounting for argv parameter columns.
        """
        # Consolidate a master set of all column headers to use
        self.colNames = list()  # Keep track of consistent order found
        colSet = set()  # Keep track of unique items

        # Pull out any header comments that may represent an argv list to parse,
        #   and the header row with column labels for each input file
        argvDicts = list()  # One dictionary per input file, keyed by argv parameter name with respective value
        readers = list()  # TabDictReaders from which header columns can be accessed as fieldnames
        for inputFile in inputFiles:
            reader = TabDictReader(inputFile)
            readers.append(reader)
            for col in reader.fieldnames:
                if col not in colSet:
                    colSet.add(col)
                    self.colNames.append(col)

            argvDict = self.extract_argvDict(reader.commentLines)  # Must be called after reader.fieldnames so initial text parsing will start
            argvDicts.append(argvDict)
            for col in argvDict.keys():
                if col not in colSet:
                    colSet.add(col)
                    self.colNames.append(col)

        prog = ProgressDots(50, 1, "Files")

        # Now generate each file in succession, but "outer-joined" to include the master column header list
        for argvDict, reader in zip(argvDicts, readers):
            for resultDict in reader:
                resultDict.update(argvDict)
                for col in self.colNames:
                    if col not in resultDict:
                        resultDict[col] = None
                yield resultDict
            prog.update()
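
A sketch of driving the concatenating generator above (instance name assumed): each file contributes its rows, padded with None for columns it lacks, plus columns recovered from its argv header comment.

from io import StringIO

fileA = StringIO('# {"argv": ["Score.py", "-r", "10"]}\nid\tscore\n1\t0.5\n')
fileB = StringIO('# {"argv": ["Score.py", "-r", "20"]}\nid\trecall\n1\t0.9\n')
concatenator = FileConcatenator()   # hypothetical name for the class defining __call__ above
for row in concatenator([fileA, fileB]):
    print(row)  # e.g. {"id": "1", "score": "0.5", "-r": "10", "recall": None}
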
Example 15
    'HAPTO': 'HAPTOGLOBIN(HAP)',
    'MCV': 'MCV(MCV)',
    'RETICAB': 'RETIC, ABS(RETABS)',
    'HGB': 'HGB(CALC), ISTAT',
    'YSTFRR': 'SOL TRANSFERR REC',
    'TRFSAT': 'TRANSFERRIN SAT',
    'FE': 'IRON, TOTAL'
}

timer = time.time()

featureMatrixFile = stdOpen("featureMatrix.tab")

log.info("Parse feature matrix file")
patientById = dict()
for patient in TabDictReader(featureMatrixFile):
    patientId = int(patient["patient_id"])
    patient["patient_id"] = patientId
    for labBaseName in labBaseNames:
        if patient[labBaseName] == NULL_STRING:
            patient[labBaseName] = None
        else:
            patient[labBaseName] = float(patient[labBaseName])
    patientById[patientId] = patient

log.info("Create plots against each metric against the index lab")
for labBaseName in labBaseNames:
    # Construct independent (x) and dependent (y) vectors from available feature data
    yList = list()
    xList = list()
    for patient in patientById.values():
Example 16
"""Given 2D Table of values, spit out "melted" long-relational form to feed into antibiogramData.js"""

import sys, os
from medinfo.common.Const import NULL_STRING
from medinfo.common.Util import stdOpen
from medinfo.db.ResultsFormatter import TabDictReader, TextResultsFormatter

ifs = stdOpen(sys.argv[1])  # Input tab-delimited file
ofs = stdOpen(sys.argv[2], "w")  # "-" for stdout

reader = TabDictReader(ifs)
formatter = TextResultsFormatter(ofs)
for row in reader:
    bug = row["Bug"]
    for key in reader.fieldnames:
        value = row[key]
        if key != "Bug" and value and value != NULL_STRING:
            formatter.formatTuple([value, bug, key])
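
A tiny illustration of the melt, with csv.DictReader standing in for TabDictReader (data hypothetical): every non-empty cell becomes a (value, bug, antibiotic) row.

import csv, io

table = io.StringIO("Bug\tPenicillin\tVancomycin\nS.aureus\t60\t\nE.coli\t0\t90\n")
reader = csv.DictReader(table, delimiter="\t")
for row in reader:
    for key in reader.fieldnames:
        if key != "Bug" and row[key]:
            print(row[key], row["Bug"], key, sep="\t")
# 60    S.aureus    Penicillin
# 0     E.coli      Penicillin
# 90    E.coli      Vancomycin
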
Example 17
    def __call__(self,
                 inputFile,
                 labelCols,
                 valueCols,
                 matchCols,
                 baseLabels=None):
        prog = ProgressDots()

        self.labelCols = labelCols
        self.valueCols = valueCols
        self.matchCols = matchCols
        self.baseLabels = baseLabels

        labelModelByLabelKey = dict()
        dataByLabelKey = dict()

        reader = TabDictReader(inputFile)
        for rowModel in reader:
            labelKey = list()
            labelModel = dict()
            for labelCol in self.labelCols:
                labelModel[labelCol] = rowModel[labelCol]
                labelKey.append(rowModel[labelCol])
            labelKey = tuple(labelKey)  # Change to immutable object that can be hashed

            # Copy just items of interest
            valueModel = {}
            if self.matchCols:
                for matchCol in self.matchCols:
                    valueModel[matchCol] = rowModel[matchCol]
            for valueCol in self.valueCols:
                try:
                    valueModel[valueCol] = float(rowModel[valueCol])
                except ValueError:  # Maybe None string, could not parse into a number
                    valueModel[valueCol] = None

            if labelKey not in dataByLabelKey:
                labelModelByLabelKey[labelKey] = labelModel
                dataByLabelKey[labelKey] = list()
            dataByLabelKey[labelKey].append(valueModel)

            prog.update()

        # prog.printStatus();

        # Another pass to ensure data is consistently sorted in each group to allow later paired t-tests
        if self.matchCols:
            for labelKey, data in dataByLabelKey.items():
                data.sort(key=functools.cmp_to_key(RowItemFieldComparator(self.matchCols)))  # assuming a cmp-style comparator, adapted for Python 3 via functools

        # See if looking for only one set of base labeled data to compare the rest against
        baseLabelKey = None
        if self.baseLabels is not None:
            baseLabelKey = tuple(self.baseLabels)

        # Result pass to compare all group pair-wise combinations
        prog = ProgressDots()
        for labelKey0, data0 in dataByLabelKey.items():
            prefix0 = "Group0."
            labelModel0 = labelModelByLabelKey[labelKey0]

            if baseLabelKey is not None and labelKey0 != baseLabelKey:
                continue  # Skip entries where the base label does not match specified key

            for labelKey1, data1 in dataByLabelKey.items():
                prefix1 = "Group1."
                labelModel1 = labelModelByLabelKey[labelKey1]

                result = dict()
                for labelCol in self.labelCols:
                    result[prefix0 + labelCol] = labelModel0[labelCol]
                    result[prefix1 + labelCol] = labelModel1[labelCol]

                for valueCol in self.valueCols:
                    # Pull out value column for each data group; previously sorted by match col to allow paired t-testing
                    # Skip any value pairs if non-numeric / None value
                    values0 = list()
                    values1 = list()
                    for dataItem0, dataItem1 in zip(data0, data1):
                        if dataItem0[valueCol] is not None and dataItem1[valueCol] is not None:
                            values0.append(dataItem0[valueCol])
                            values1.append(dataItem1[valueCol])

                    for summaryFunction in SUMMARY_FUNCTIONS:
                        result[prefix0 + valueCol + "." +
                               summaryFunction.__name__] = summaryFunction(
                                   values0)
                        result[prefix1 + valueCol + "." +
                               summaryFunction.__name__] = summaryFunction(
                                   values1)

                    for compTest in COMPARISON_TESTS:
                        (t, p) = compTest(values0, values1)
                        if np.isnan(p):
                            p = None  # Use more generic expression for NaN / null value
                        result[compTest.__name__ + "." + valueCol] = p

                yield result

                prog.update()
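
The ttest_rel.* result columns produced here suggest COMPARISON_TESTS includes SciPy-style paired tests, which is why each group's values are kept in matched order; a minimal standalone example (data hypothetical):

from scipy.stats import ttest_rel

values0 = [0.70, 0.65, 0.80, 0.75]
values1 = [0.72, 0.69, 0.85, 0.78]
t, p = ttest_rel(values0, values1)
print(t, p)  # a small p suggests a systematic paired difference between the groups
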