def computeTF(self, recompute = True, keysToTokenize = ("articleText",), keyToDisplay = "title"):
    """Compute and store the term frequencies (TF) of documents in the database.

    For each selected document, the values stored under ``keysToTokenize``
    are concatenated (each followed by a newline) and passed to
    ``Text.tokenize``.  The number of tokens is saved under ``'numTokens'``,
    the result of ``Text.get_term_freq`` under ``'tf'``, and the updated
    document is handed to ``self.addDocument``.

    Args:
        recompute: If true (the default), process every entry obtained by
            iterating ``self.db``, except design documents.  If false,
            process only documents that have no ``'tf'`` field, selected by
            passing a JavaScript map function to ``self.db.query``.
        keysToTokenize: Iterable of document keys whose text is tokenized.
        keyToDisplay: Document key whose value is shown in the debug log.

    Note:
        A document lacking ``keyToDisplay`` or one of ``keysToTokenize``
        is not handled here; the lookup error (presumably ``KeyError``)
        propagates to the caller.
    """
    if recompute:
        results = self.db
    else:
        # Map function that emits only the documents without a 'tf' field.
        ifMap = """function(doc) { if (!('tf' in doc)) emit(doc._id, null); }"""
        results = self.db.query(ifMap)

    self.logger.debug("Computing TF")

    for result in results:
        # When recomputing, each entry is used directly as a document id
        # (a string); query results instead carry the id under "key".
        # Only the string probe lives in the try block so that an
        # AttributeError raised by the database lookup is not mistaken
        # for "this is a query row".
        try:
            isDesignDoc = result.startswith("_design/")
        except AttributeError:
            docId = result["key"]
        else:
            # Design documents are identified by their id prefix; a plain
            # substring test would also skip ids such as "web_design_tips".
            if isDesignDoc:
                continue
            docId = result
        doc = self.db[docId]

        self.logger.debug("Computing TF: Working on \"%s\"" % doc[keyToDisplay])

        # Gather the text to tokenize, one field per line.
        textToTokenize = "".join(doc[key] + "\n" for key in keysToTokenize)

        tokens = Text.tokenize(textToTokenize)
        doc['numTokens'] = len(tokens)
        doc['tf'] = Text.get_term_freq(tokens)
        self.addDocument(doc)
def getPPCInfo(self, text):
    """Look up the PPC database entries for the tokens of ``text``.

    ``text`` is split with ``Text.tokenize`` and every token is used as a
    key into ``self.db``.  Tokens without a database entry are ignored.

    Args:
        text: The string to tokenize and look up.

    Returns:
        A list of the entries found, in token order.  A token occurring
        more than once is looked up, and thus returned, once per
        occurrence.
    """
    found = []
    for word in Text.tokenize(text):
        try:
            entry = self.db[word]
        except couchdb.client.ResourceNotFound:
            # Nothing stored under this token; move on to the next one.
            pass
        else:
            found.append(entry)
    return found