Beispiel #1
0
    def tfIdfIndex(self, data_2, field):
        '''Index ``data_2`` into the TF/IDF index shared by ``field``'s predicates.

        The index and canopy of the first predicate registered for ``field``
        are reused when that predicate already has an index; otherwise a new
        ``tfidf.TfIdfIndex`` and an empty canopy dict are created.  Every
        ``(record_id, doc)`` pair from ``data_2`` is added to the index and
        given a canopy entry ``record_id -> (record_id,)``.  Finally the same
        index and canopy objects are assigned to every predicate of ``field``.
        '''
        field_predicates = self.tfidf_fields[field]
        first_predicate = next(iter(field_predicates))

        shared_index, shared_canopy = first_predicate.index, first_predicate.canopy
        if shared_index is None:
            # Nothing indexed yet for this field: start from scratch.
            shared_index = tfidf.TfIdfIndex(field, self.stop_words[field])
            shared_canopy = {}

        for rec_id, document in data_2:
            shared_index.index(rec_id, document)
            shared_canopy[rec_id] = (rec_id,)

        # All predicates of the field point at the very same objects.
        for field_predicate in field_predicates:
            field_predicate.index = shared_index
            field_predicate.canopy = shared_canopy
Beispiel #2
0
    def tfIdfBlock(self, data, field):
        '''Compute and store a TF/IDF canopy for every predicate of ``field``.

        A separate ``tfidf.TfIdfIndex`` is built per predicate, each fed with
        every ``(record_id, doc)`` pair from ``data``.  Each predicate's
        ``canopy`` attribute is then set to the result of its index's
        ``canopy`` call, given all collected documents and the predicate's
        ``threshold``.  Timestamps are logged before and after the canopy
        phase.
        '''
        predicates = self.tfidf_fields[field]

        # One independent index per predicate.
        index_by_predicate = {
            pred: tfidf.TfIdfIndex(field, self.stop_words[field])
            for pred in predicates
        }

        documents = {}
        for rec_id, document in data:
            documents[rec_id] = document
            for tfidf_index in index_by_predicate.values():
                tfidf_index.index(rec_id, document)

        logger.info(time.asctime())

        for pred in predicates:
            logger.info("Canopy: %s", str(pred))
            pred.canopy = index_by_predicate[pred].canopy(documents,
                                                          pred.threshold)

        logger.info(time.asctime())
Beispiel #3
0
 def initIndex(self):
     '''Call ``self.reset()`` and return a newly constructed ``tfidf.TfIdfIndex``.'''
     self.reset()
     fresh_index = tfidf.TfIdfIndex()
     return fresh_index
Beispiel #4
0
 def initIndex(self, stop_words):
     '''Return a new ``tfidf.TfIdfIndex`` constructed with ``stop_words``.'''
     fresh_index = tfidf.TfIdfIndex(stop_words)
     return fresh_index
Beispiel #5
0
 def initIndex(self):
     '''Return a new ``tfidf.TfIdfIndex`` constructed with no arguments.'''
     fresh_index = tfidf.TfIdfIndex()
     return fresh_index