def main_buildTopicModel(argv):
    bowInputFilename = SOURCE_DATA_DIR + INPUT_FILENAME
    mod = TopicModel()
    for numTopics in numTopicsOptions:
        subargv = ["TopicModel", "-n", str(numTopics)]
        subargv.extend([
            bowInputFilename,
            MODEL_DIR + "/topicModel." + os.path.basename(bowInputFilename) + ".%dTopic.model" % (numTopics),
        ])
        mod.main(subargv)
    return mod.model
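# Illustrative note (not part of the original module): with hypothetical settings such as
#   SOURCE_DATA_DIR = "data/", INPUT_FILENAME = "patientItems.bow",
#   MODEL_DIR = "models", numTopicsOptions = [8, 32],
# the loop above would train one model per topic count and write files named
#   models/topicModel.patientItems.bow.8Topic.model
#   models/topicModel.patientItems.bow.32Topic.model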
def main_quickTest(argv):
    modelFilename = argv[1]
    modeler = TopicModel()

    timer = time.time()
    (model, docCountByWordId) = modeler.loadModelAndDocCounts(modelFilename)
    timer = time.time() - timer
    log.info("%.2f seconds to load", timer)

    timer = time.time()
    weightByItemIdByTopicId = modeler.generateWeightByItemIdByTopicId(model, 100)
    timer = time.time() - timer
    log.info("%.2f seconds to generate weights", timer)

    for i in xrange(3):
        prog = ProgressDots()
        for (topicId, weightByItemId) in weightByItemIdByTopicId.iteritems():
            for (itemId, itemWeight) in weightByItemId.iteritems():
                prog.update()
        prog.printStatus()
class TopicModelRecommender(BaseItemRecommender):
    """Implementation class for item (e.g., order) recommendation based on topic models
    (LDA Latent Dirichlet Allocation or HDP Hierarchical Dirichlet Process).
    """

    def __init__(self, model, docCountByWordId=None):
        """Initialize module with prior generated model and word document counts
        from TopicModel module.
        """
        BaseItemRecommender.__init__(self)
        self.modeler = TopicModel()  # Utility instance to run off of

        if docCountByWordId:
            # Specified both options
            self.model = model
            self.docCountByWordId = docCountByWordId
        else:
            # If only the first one specified, interpret it as a base filename to load the objects from
            filename = model
            (self.model, self.docCountByWordId) = self.modeler.loadModelAndDocCounts(filename)

        # Cached lookup data. Don't repeat work for serial queries
        self.itemsById = None
        self.categoryIdByItemId = None
        self.candidateItemIds = None
        self.weightByItemIdByTopicId = None

    def initItemLookups(self, query):
        self.itemsById = DBUtil.loadTableAsDict("clinical_item")
        self.categoryIdByItemId = dict()
        for itemId, item in self.itemsById.iteritems():
            self.categoryIdByItemId[itemId] = item["clinical_item_category_id"]

        self.candidateItemIds = set()
        emptyQuerySet = set()
        for itemId in self.docCountByWordId.keys():
            if self.isItemRecommendable(itemId, emptyQuerySet, query, self.categoryIdByItemId):
                self.candidateItemIds.add(itemId)

    def __call__(self, query):
        # Given query items, use model to find related topics with relationship scores

        # Load item category lookup information
        if self.itemsById is None:
            self.initItemLookups(query)

        # Load model weight parameters once to save time on serial queries
        if self.weightByItemIdByTopicId is None:
            self.weightByItemIdByTopicId = self.modeler.generateWeightByItemIdByTopicId(self.model, query.itemsPerCluster)

        # Adapt query into bag-of-words format
        queryItemCountById = query.queryItemIds
        if not isinstance(queryItemCountById, dict):
            # Not a dictionary, probably a one dimensional list/set, then just add counts of 1
            itemIds = queryItemCountById
            queryItemCountById = dict()
            for itemId in itemIds:
                queryItemCountById[itemId] = 1
        observedIds = set()
        queryBag = list(self.modeler.itemCountByIdToBagOfWords(queryItemCountById, observedIds, self.itemsById, query.excludeCategoryIds))

        # Primary model execute. Apply to query to generate scored relationship to each "topic"
        topicWeights = self.model[queryBag]
        weightByTopicId = dict()
        for (topicId, topicWeight) in topicWeights:
            weightByTopicId[topicId] = topicWeight

        # Composite scores for (recommendable) items by taking weighted average across the top items for each topic
        recScoreByItemId = dict()
        for itemId in self.candidateItemIds:
            if self.isItemRecommendable(itemId, queryItemCountById, query, self.categoryIdByItemId):
                recScoreByItemId[itemId] = 0.0

        for topicId, topicWeight in weightByTopicId.iteritems():
            if topicWeight > query.minClusterWeight:  # Ignore topics with tiny contribution
                weightByItemId = self.weightByItemIdByTopicId[topicId]
                for itemId in recScoreByItemId.keys():
                    itemWeight = 0.0
                    if itemId in weightByItemId:
                        itemWeight = weightByItemId[itemId]
                    recScoreByItemId[itemId] += topicWeight * itemWeight

        # Build item score models into a list to sort by score
        recommendedData = list()
        for itemId, totalItemWeight in recScoreByItemId.iteritems():
            tfidf = 0.0
            if itemId in self.docCountByWordId and self.docCountByWordId[itemId] > 0.0:
                # Scale TF*IDF score based on baseline document counts to prioritize disproportionately common items
                tfidf = totalItemWeight * self.docCountByWordId[None] / self.docCountByWordId[itemId]
            itemModel = \
                {
                    "totalItemWeight": totalItemWeight,
                    "tf": totalItemWeight,
                    "PPV": totalItemWeight,
                    "P(item|query)": totalItemWeight,
                    "P(B|A)": totalItemWeight,
                    "tfidf": tfidf,
                    "lift": tfidf,
                    "interest": tfidf,
                    "P(item|query)/P(item)": tfidf,
                    "P(B|A)/P(B)": tfidf,
                    "clinical_item_id": itemId,
                    "weightByTopicId": weightByTopicId,
                    "numSelectedTopics": len(weightByTopicId),  # Duplicate for each item, but persist here to enable retrieval by caller
                }
            itemModel["score"] = itemModel[query.sortField]
            recommendedData.append(itemModel)

        recommendedData.sort(RowItemFieldComparator("score"), reverse=True)
        return recommendedData

    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <queryStr> [<outputFile>]\n"+\
                    "   <queryStr>    Query string to specify what recommendation items to retrieve.\n"+\
                    "                 Refer to RecommenderQuery or HTML example code for elaboration of options\n"+\
                    "                 Expect formatting like a URL query string: queryItemIds=1,2&resultCount=10&sortField=conditionalFreq&filterField0=baselineFreq<0.01...\n"+\
                    "                 The sortField and filterFields will be used to determine what numerical / score columns to display\n"+\
                    "   <outputFile>  Tab-delimited table of recommender results.\n"+\
                    "                 Leave blank or specify \"-\" to send to stdout.\n"
        parser = OptionParser(usage=usageStr)
        (options, args) = parser.parse_args(argv[1:])
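# Illustrative usage sketch (not part of the original module). A minimal example,
# assuming a previously trained model file exists (the filename and item IDs below
# are hypothetical) and that a RecommenderQuery is in scope, can be constructed with
# no arguments, and is configured through the attributes that __call__ reads above.
def example_topicModelRecommenderUsage():
    recommender = TopicModelRecommender("models/topicModel.patientItems.bow.32Topic.model")
    query = RecommenderQuery()
    query.queryItemIds = set([1, 5, 8])   # Hypothetical clinical_item_ids to base recommendations on
    query.excludeCategoryIds = set()      # No category-level exclusions
    query.itemsPerCluster = 100           # Number of top items to track per topic
    query.minClusterWeight = 0.01         # Ignore topics with tiny contribution to the query
    query.sortField = "tfidf"             # Any of the itemModel score keys built above
    recommendedData = recommender(query)  # List of itemModel dicts, sorted descending by "score"
    for itemModel in recommendedData[:10]:
        print itemModel["clinical_item_id"], itemModel["score"]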
class TestTopicModel(DBTestCase):
    def setUp(self):
        """Prepare state for test cases"""
        DBTestCase.setUp(self)

        log.info("Populate the database with test data")
        from stride.clinical_item.ClinicalItemDataLoader import ClinicalItemDataLoader
        ClinicalItemDataLoader.build_clinical_item_psql_schemata()

        self.clinicalItemCategoryIdStrList = list()
        headers = ["clinical_item_category_id", "source_table"]
        dataModels = \
            [
                RowItemModel([-1, "Labs"], headers),
                RowItemModel([-2, "Imaging"], headers),
                RowItemModel([-3, "Meds"], headers),
                RowItemModel([-4, "Nursing"], headers),
                RowItemModel([-5, "Problems"], headers),
                RowItemModel([-6, "Lab Results"], headers),
            ]
        for dataModel in dataModels:
            (dataItemId, isNew) = DBUtil.findOrInsertItem("clinical_item_category", dataModel)
            self.clinicalItemCategoryIdStrList.append(str(dataItemId))

        headers = ["clinical_item_id", "clinical_item_category_id", "name", "analysis_status"]
        dataModels = \
            [
                RowItemModel([1, -1, "CBC", 1], headers),
                RowItemModel([2, -1, "BMP", 0], headers),  # Clear analysis status, so this will be ignored unless changed
                RowItemModel([3, -1, "Hepatic Panel", 1], headers),
                RowItemModel([4, -1, "Cardiac Enzymes", 1], headers),
                RowItemModel([5, -2, "CXR", 1], headers),
                RowItemModel([6, -2, "RUQ Ultrasound", 1], headers),
                RowItemModel([7, -2, "CT Abdomen/Pelvis", 1], headers),
                RowItemModel([8, -2, "CT PE Thorax", 1], headers),
                RowItemModel([9, -3, "Acetaminophen", 1], headers),
                RowItemModel([10, -3, "Carvedilol", 1], headers),
                RowItemModel([11, -3, "Enoxaparin", 1], headers),
                RowItemModel([12, -3, "Warfarin", 1], headers),
                RowItemModel([13, -3, "Ceftriaxone", 1], headers),
                RowItemModel([14, -4, "Foley Catheter", 1], headers),
                RowItemModel([15, -4, "Strict I&O", 1], headers),
                RowItemModel([16, -4, "Fall Precautions", 1], headers),
            ]
        for dataModel in dataModels:
            (dataItemId, isNew) = DBUtil.findOrInsertItem("clinical_item", dataModel)

        # Input file contents in Bag-of-Words format
        # Specifically avoid the use of items 6 or 7 in the training data
        self.inputBOWFileStr = \
            """[[1,1],[2,2],[3,1],[4,4],[5,10],[8,5]]
[[3,4],[4,4],[9,3],[10,2],[12,6],[13,3],[15,5],[16,8]]
[[1,1],[2,2],[3,1],[4,4],[5,10],[8,5],[9,1],[10,2],[11,1],[12,4],[13,10],[14,1],[15,3],[16,5]]
[[1,4],[2,9],[9,1],[10,2],[11,7],[12,4],[13,2],[16,6]]
[[4,3],[5,31],[8,5],[12,6],[13,8],[16,5]]
"""

        self.instance = TopicModel()  # Instance to test on

    def tearDown(self):
        """Restore state from any setUp or test steps"""
        log.info("Purge test records from the database")
        DBUtil.execute("delete from clinical_item where clinical_item_category_id < 0")
        DBUtil.execute("delete from clinical_item_category where clinical_item_category_id in (%s)" % str.join(",", self.clinicalItemCategoryIdStrList))

        for filename in os.listdir("."):
            if filename.startswith(TEST_FILE_PREFIX) or filename.startswith("HDP" + TEST_FILE_PREFIX):
                os.remove(filename)

        DBTestCase.tearDown(self)

    def test_topicModel(self):
        # Run the modeling analysis against the mock test data above and verify expected stats afterwards
        numTopics = 3
        sys.stdin = StringIO(self.inputBOWFileStr)
        subargv = ["TopicModel", "-n", str(numTopics), "-i", str(ITEMS_PER_TOPIC), "-", TEST_FILE_PREFIX]
        self.instance.main(subargv)

        model = self.instance.loadModel(TEST_FILE_PREFIX)
        topTopicFile = open(self.instance.topTopicFilename(TEST_FILE_PREFIX))

        expectedDocCountByWordId = \
            {1: 3, 2: 3, 3: 3, 4: 4, 5: 3, None: 5, 9: 3, 10: 3, 11: 2, 12: 4, 13: 4, 14: 1, 15: 2, 16: 4, 8: 3}
        self.assertExpectedTopItems(expectedDocCountByWordId, model, topTopicFile)

        # Do again but with HDP non-parametric model
        numTopics = 0
        sys.stdin = StringIO(self.inputBOWFileStr)
        subargv = ["TopicModel", "-n", str(numTopics), "-i", str(ITEMS_PER_TOPIC), "-", "HDP" + TEST_FILE_PREFIX]
        self.instance.main(subargv)

        model = self.instance.loadModel("HDP" + TEST_FILE_PREFIX)
        topTopicFile = open(self.instance.topTopicFilename("HDP" + TEST_FILE_PREFIX))

        expectedDocCountByWordId = \
            {1: 3, 2: 3, 3: 3, 4: 4, 5: 3, None: 5, 9: 3, 10: 3, 11: 2, 12: 4, 13: 4, 14: 1, 15: 2, 16: 4, 8: 3}
        self.assertExpectedTopItems(expectedDocCountByWordId, model, topTopicFile)

    def assertExpectedTopItems(self, expectedDocCountByWordId, model, topTopicFile):
        # With a randomized optimization algorithm, cannot depend on stable test results
        #   with each run. Instead make sure results are internally consistent,
        #   and that raw count data is consistent.

        # Values from model topic parameters
        scoreByItemIdByTopicId = dict()
        for (topicId, topicItems) in self.instance.enumerateTopics(model, ITEMS_PER_TOPIC):
            scoreByItemIdByTopicId[topicId] = dict()
            for (itemId, score) in topicItems:
                scoreByItemIdByTopicId[topicId][itemId] = score
        # Add expected word document counts under the "None" topic
        scoreByItemIdByTopicId[None] = expectedDocCountByWordId

        # Verify Top Topic Files match
        topScoreByItemIdByTopicId = dict()
        itemsChecked = 0
        reader = TabDictReader(topTopicFile)
        for topicItem in reader:
            topicId = None
            if topicItem["topic_id"] != NULL_STRING:
                topicId = int(topicItem["topic_id"])
            itemId = None
            if topicItem["item_id"] != NULL_STRING:
                itemId = int(topicItem["item_id"])
            score = float(topicItem["score"])
            tfidf = float(topicItem["tfidf"])

            expectedTFIDF = 0.0
            if itemId in expectedDocCountByWordId and expectedDocCountByWordId[itemId] > 0:
                expectedTFIDF = score * expectedDocCountByWordId[None] / expectedDocCountByWordId[itemId]
            #print >> sys.stderr, topicId, itemId, score, tfidf, expectedDocCountByWordId[itemId]
            self.assertAlmostEqual(expectedTFIDF, tfidf, places=5)

            if topicId not in topScoreByItemIdByTopicId:
                topScoreByItemIdByTopicId[topicId] = dict()
            topScoreByItemIdByTopicId[topicId][itemId] = score
            itemsChecked += 1
        self.assertTrue(itemsChecked > 0)  # Make sure an actual test happened

        for topicId, topScoreByItemId in topScoreByItemIdByTopicId.items():
            scoreByItemId = scoreByItemIdByTopicId[topicId]
            self.assertAlmostEqualsDict(topScoreByItemId, scoreByItemId, places=5)
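# Worked example of the TF*IDF-style scaling verified above, using the expected counts
# from setUp (expectedDocCountByWordId[None] = 5 total training documents; the topic
# score of 0.2 is hypothetical):
#   item 14 (Foley Catheter, document count 1):  expectedTFIDF = 0.2 * 5 / 1 = 1.0
#   item 4  (Cardiac Enzymes, document count 4): expectedTFIDF = 0.2 * 5 / 4 = 0.25
# Items that are rare in the overall corpus but prominent within a topic are boosted,
# consistent with the "disproportionately common" scaling comment in the recommender.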