Ejemplo n.º 1
0
    def computeTF(self, recompute = True, keysToTokenize = ("articleText",), keyToDisplay = "title"):
        """Compute and store the term frequencies (TF) of documents in the database.

        For each selected document, the values stored under every key listed
        in ``keysToTokenize`` are concatenated (each followed by a newline)
        and tokenized with ``Text.tokenize``.  The token count is written to
        the document's ``numTokens`` field, the result of
        ``Text.get_term_freq`` to its ``tf`` field, and the document is then
        passed to ``self.addDocument``.

        Args:
            recompute: When true, every ID obtained by iterating ``self.db``
                is processed, except design documents.  When false, only the
                rows returned by a query for documents without a ``tf``
                field are processed.
            keysToTokenize: Iterable of document keys whose values supply the
                text to tokenize.
            keyToDisplay: Document key whose value is shown in the debug log
                line for each processed document.
        """

        # Map function selecting the documents that have no 'tf' field yet.
        ifMap = """function(doc) { 
        if (!('tf' in doc)) 
            emit(doc._id, null); 
        }"""

        if recompute:
            results = self.db
        else:
            results = self.db.query(ifMap)

        self.logger.debug("Computing TF")
        for result in results:
            # Iterating the database yields plain ID strings, while a query
            # yields row objects without string methods.  Only the probe is
            # inside the try, so an AttributeError raised by the database
            # lookup itself is not mistaken for "this is a view row".
            try:
                isDesignDoc = result.startswith("_design/")
            except AttributeError:
                # A view row: the document ID was emitted as the row key.
                docId = result["key"]
            else:
                # Design documents are identified by the "_design/" ID
                # prefix; an ID that merely contains "_design" is a regular
                # document and must still be processed.
                if isDesignDoc:
                    continue
                docId = result
            doc = self.db[docId]

            self.logger.debug("Computing TF: Working on \"%s\"", doc[keyToDisplay])
            # Gather the text to tokenize, one field per line.
            textToTokenize = "".join(doc[key] + "\n" for key in keysToTokenize)
            tokens = Text.tokenize(textToTokenize)
            doc['numTokens'] = len(tokens)
            doc['tf'] = Text.get_term_freq(tokens)
            self.addDocument(doc)
Ejemplo n.º 2
0
    def getPPCInfo(self, text):
        """Return the database entries matching the tokens of ``text``.

        ``text`` is split with ``Text.tokenize`` and each token is used as a
        key into ``self.db``.  Tokens with no entry in the database are
        skipped; the entries that were found are returned as a list, in
        token order.
        """

        found = []
        for word in Text.tokenize(text):
            try:
                entry = self.db[word]
            except couchdb.client.ResourceNotFound:
                # No document is stored under this token; move on.
                pass
            else:
                found.append(entry)

        return found