Example #1
    def tfIdfBlock(self, data, field):
        '''Creates TF/IDF canopy of a given set of data'''

        # Stop word remover bound to the stop words configured for this field.
        class CustomStopWordRemover(object):
            stop_words = self.stop_words[field].copy()

            def process(self, lst):
                return [w for w in lst if w not in self.stop_words]

        splitter = Splitter()

        # Text index whose lexicon splits documents and drops stop words;
        # the default index is swapped for a cosine-similarity (TF/IDF) index.
        index = TextIndex(Lexicon(splitter, CustomStopWordRemover()))

        index.index = CosineIndex(index.lexicon)

        index_to_id = {}
        base_tokens = {}

        # Index every document, remembering how the sequential document
        # number maps back to the original record id.
        for i, (record_id, doc) in enumerate(data, 1):
            index_to_id[i] = record_id
            base_tokens[i] = splitter.process([doc])
            index.index_doc(i, doc)

        # One canopy per TF/IDF threshold configured for this field.
        canopies = (tfidf._createCanopies(index,
                                          base_tokens,
                                          threshold,
                                          field)
                    for threshold in self.tfidf_fields[field])

        for canopy in canopies:
            key, index_canopy = canopy
            # Translate document numbers back to record ids (Python 2: iteritems).
            id_canopy = dict((index_to_id[k], index_to_id[v])
                             for k, v in index_canopy.iteritems())
            self.canopies[key] = defaultdict(str, id_canopy)
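The method above is only a snippet, not a complete module. A hedged guess at the imports it relies on (the dedupe-specific tfidf module and the attributes on self are assumptions about the surrounding class):

    from collections import defaultdict
    from zope.index.text.textindex import TextIndex
    from zope.index.text.lexicon import Lexicon, Splitter
    from zope.index.text.cosineindex import CosineIndex
    from dedupe import tfidf  # assumed: the module providing _createCanopies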
Example #2
    def tfIdfBlock(self, data, field):
        '''Creates TF/IDF canopy of a given set of data'''

        class CustomStopWordRemover(object):
            stop_words = self.stop_words[field].copy()

            def process(self, lst):
                return [w for w in lst if w not in self.stop_words]

        index = TextIndex(Lexicon(Splitter(), CustomStopWordRemover()))

        index.index = CosineIndex(index.lexicon)

        index_to_id = {}
        base_tokens = {}

        for i, (record_id, doc) in enumerate(data, 1):
            index_to_id[i] = record_id
            base_tokens[i] = doc
            index.index_doc(i, doc)

        canopies = (tfidf._createCanopies(index,
                                          base_tokens,
                                          threshold,
                                          field)
                    for threshold in self.tfidf_fields[field])

        for canopy in canopies:
            key, index_canopy = canopy
            id_canopy = dict((index_to_id[k], index_to_id[v])
                             for k, v in index_canopy.iteritems())
            self.canopies[key] = defaultdict(str, id_canopy)
Example #3
    def __init__(self, stop_words):
        super(CanopyLexicon, self).__init__()
        self._pipeline = [
            Splitter(),
            CustomStopWordRemover(stop_words),
            OperatorEscaper()
        ]

    def _makeOne(self, family=None):
        from zope.index.text.lexicon import Lexicon
        from zope.index.text.lexicon import Splitter
        if family is None:
            family = self._getBTreesFamily()
        lexicon = Lexicon(Splitter())
        return self._getTargetClass()(lexicon, family=family)
Example #5
def _default_indexes():
    return {
        'title': CatalogFieldIndex(get_title),
        'description': CatalogFieldIndex(get_description),
        'type_name': CatalogFieldIndex(get_type_name),
        'sortable_title': CatalogFieldIndex(get_sortable_title),
        'path': CatalogPathIndex(get_path),
        'searchable_text': CatalogTextIndex(get_searchable_text,
                                            lexicon=Lexicon(Splitter(), CaseNormalizer())),
        'uid': CatalogFieldIndex(get_uid),
        'tags': CatalogKeywordIndex(get_tags),
        'search_visible': CatalogFieldIndex(get_search_visible),
        'date': CatalogFieldIndex(get_date),
        'modified': CatalogFieldIndex(get_modified),
        'created': CatalogFieldIndex(get_created),
        'wf_state': CatalogFieldIndex(get_wf_state),
        'workflow': CatalogFieldIndex(get_workflow),
    }.items()
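Index factories like these pair naturally with a catalog object. A minimal self-contained sketch (my own, assuming repoze.catalog and an illustrative 'text' discriminator, not taken from the original project) of the same lexicon construction used for 'searchable_text':

    from repoze.catalog.catalog import Catalog
    from repoze.catalog.indexes.text import CatalogTextIndex
    from zope.index.text.lexicon import Lexicon, Splitter, CaseNormalizer

    catalog = Catalog()
    catalog['searchable_text'] = CatalogTextIndex(
        'text', lexicon=Lexicon(Splitter(), CaseNormalizer()))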
Example #6
    def _makeIndexAndParser(self):
        from zope.index.text.lexicon import Lexicon
        from zope.index.text.lexicon import Splitter
        from zope.index.text.queryparser import QueryParser
        lexicon = Lexicon(Splitter())
        parser = QueryParser(lexicon)
        index = FauxIndex()
        return index, parser
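A quick illustration (my own, using only zope.index.text; the query string is made up) of what the parser built here can do:

    from zope.index.text.lexicon import Lexicon, Splitter
    from zope.index.text.queryparser import QueryParser

    parser = QueryParser(Lexicon(Splitter()))
    tree = parser.parseQuery('cheese AND (ham OR eggs)')
    print(tree)  # an AndNode combining an AtomNode with an OrNode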
Example #7
    def prescan(self, f, msgs, uniqwords):
        pipeline = [Splitter(), CaseNormalizer(), StopWordRemover()]
        for n in msgs:
            print("prescanning", n)
            m = f.openmessage(n)
            text = self.getmessagetext(m, f.name)
            for p in pipeline:
                text = p.process(text)
            for word in text:
                uniqwords[word] = uniqwords.get(word, 0) + 1
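The same pipeline works on any list of strings. A standalone sketch (the sample text is illustrative):

    from zope.index.text.lexicon import Splitter, CaseNormalizer, StopWordRemover

    pipeline = [Splitter(), CaseNormalizer(), StopWordRemover()]
    text = ['The quick brown Fox jumped over THE lazy dog']
    for p in pipeline:
        text = p.process(text)
    print(text)  # lowercased tokens with common stop words dropped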
Example #8
    def __init__(self, discriminator, lexicon=None, index=None):
        self._init_discriminator(discriminator)

        self._not_indexed = self.family.IF.Set()

        lexicon = lexicon or Lexicon(Splitter(), CaseNormalizer(),
                                     StopWordRemover())
        index = index or OkapiIndex(lexicon, family=self.family)

        ZopeTextIndex.__init__(self, lexicon, index)
        self.clear()
Example #9
    def __init__(self, field, stop_words=[]):
        self.field = field

        splitter = Splitter()
        stop_word_remover = CustomStopWordRemover(stop_words)
        operator_escaper = OperatorEscaper()
        lexicon = Lexicon(splitter, stop_word_remover, operator_escaper)

        self._index = TextIndex(lexicon)
        self._index.index = CosineIndex(self._index.lexicon)

        self._i_to_id = {}
        self._parseTerms = self._index.lexicon.parseTerms
Example #10
    def __init__(self, lexicon=None, index=None):
        """Provisional constructor.

        This creates the lexicon and index if not passed in.
        """
        _explicit_lexicon = True
        if lexicon is None:
            _explicit_lexicon = False
            lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
        if index is None:
            index = OkapiIndex(lexicon)
        self.lexicon = _explicit_lexicon and lexicon or index.lexicon
        self.index = index
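When both arguments are omitted, this constructor produces the default Splitter/CaseNormalizer/StopWordRemover lexicon backed by an OkapiIndex. A small usage sketch (my own; the document ids and text are made up):

    from zope.index.text.textindex import TextIndex

    index = TextIndex()                 # default lexicon and OkapiIndex
    index.index_doc(1, 'The quick brown fox')
    index.index_doc(2, 'a very lazy dog')
    print(dict(index.apply('fox')))     # only doc 1 matches, mapped to its Okapi weight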
Example #11
    def __init__(self, discriminator, lexicon=None, index=None):
        _lexicon = lexicon
        if lexicon is None:
            _lexicon = Lexicon(
                Splitter(),
                CaseNormalizer(),
                StopWordRemover(),
            )
        if index is None:
            index = OkapiIndex(_lexicon, family=self.family)
        super(TextIndex, self).__init__(discriminator, lexicon, index)
        if lexicon is None:
            self.lexicon = index.lexicon
        self.index = index
        self.clear()
Example #12
def stopWords(data):
    index = TextIndex(Lexicon(Splitter()))

    # Index every document so document frequencies can be read back out.
    for i, (_, doc) in enumerate(data, 1):
        index.index_doc(i, doc)

    # (document frequency, word) pairs, most frequent first.
    doc_freq = [(len(index.index._wordinfo[wid]), word)
                for word, wid in index.lexicon.items()]

    doc_freq.sort(reverse=True)

    N = float(index.index.documentCount())
    threshold = int(max(1000, N * 0.05))

    stop_words = set()

    # Anything appearing in more than max(1000, 5% of documents) is a stop word.
    for frequency, word in doc_freq:
        if frequency > threshold:
            stop_words.add(word)
        else:
            break

    return stop_words
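A sketch of calling stopWords (assuming the TextIndex, Lexicon, and Splitter imports used above; data is a sequence of (record_id, text) pairs, and the records here are made up):

    data = [('a', 'red apple pie'), ('b', 'green apple tart'), ('c', 'red pear pie')]
    common = stopWords(data)
    print(common)  # empty here: no word clears the max(1000, 5% of docs) threshold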
Example #13
    def _makeLexicon(self, *pipeline):
        from zope.index.text.lexicon import Lexicon
        from zope.index.text.lexicon import Splitter
        if not pipeline:
            pipeline = (Splitter(),)
        return Lexicon(*pipeline)
Example #14
    def _makeOne(self, *pipeline):
        from zope.index.text.lexicon import Splitter
        pipeline = (Splitter(),) + pipeline
        return self._getTargetClass()(*pipeline)

    def _makePipeline(self):
        from zope.index.text.lexicon import Splitter
        return (Splitter(), FakeStopWordRemover())

    def _makePipeline(self):
        from zope.index.text.lexicon import Splitter
        return (Splitter(),)