def tfIdfBlock(self, data, field):
    '''Creates a TF/IDF canopy of a given set of data'''

    class CustomStopWordRemover(object):
        stop_words = self.stop_words[field].copy()

        def process(self, lst):
            return [w for w in lst if w not in self.stop_words]

    splitter = Splitter()
    index = TextIndex(Lexicon(splitter, CustomStopWordRemover()))
    # Swap the default scoring backend for cosine (TF/IDF) similarity
    index.index = CosineIndex(index.lexicon)

    index_to_id = {}
    base_tokens = {}

    for i, (record_id, doc) in enumerate(data, 1):
        index_to_id[i] = record_id
        base_tokens[i] = splitter.process([doc])
        index.index_doc(i, doc)

    canopies = (tfidf._createCanopies(index, base_tokens, threshold, field)
                for threshold in self.tfidf_fields[field])

    for canopy in canopies:
        key, index_canopy = canopy
        # Map internal document ids back to the caller's record ids
        id_canopy = dict((index_to_id[k], index_to_id[v])
                         for k, v in index_canopy.items())
        self.canopies[key] = defaultdict(str, id_canopy)

def tfIdfBlock(self, data, field):
    '''Creates a TF/IDF canopy of a given set of data'''

    class CustomStopWordRemover(object):
        stop_words = self.stop_words[field].copy()

        def process(self, lst):
            return [w for w in lst if w not in self.stop_words]

    index = TextIndex(Lexicon(Splitter(), CustomStopWordRemover()))
    index.index = CosineIndex(index.lexicon)

    index_to_id = {}
    base_tokens = {}

    for i, (record_id, doc) in enumerate(data, 1):
        index_to_id[i] = record_id
        base_tokens[i] = doc
        index.index_doc(i, doc)

    canopies = (tfidf._createCanopies(index, base_tokens, threshold, field)
                for threshold in self.tfidf_fields[field])

    for canopy in canopies:
        key, index_canopy = canopy
        id_canopy = dict((index_to_id[k], index_to_id[v])
                         for k, v in index_canopy.items())
        self.canopies[key] = defaultdict(str, id_canopy)

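Both tfIdfBlock variants above rely on the same trick: zope.index's TextIndex delegates scoring to a pluggable backend object, so assigning a CosineIndex swaps the default Okapi BM25 scoring for cosine TF/IDF similarity. A minimal, self-contained sketch of that pattern, assuming only that zope.index is installed (the sample documents and query are made up):

from zope.index.text.cosineindex import CosineIndex
from zope.index.text.lexicon import Lexicon, Splitter
from zope.index.text.textindex import TextIndex

index = TextIndex(Lexicon(Splitter()))
index.index = CosineIndex(index.lexicon)  # cosine TF/IDF instead of Okapi BM25

for docid, doc in enumerate(['lytton strachey', 'giles lytton strachey'], 1):
    index.index_doc(docid, doc)

# apply() parses the query text and returns a mapping of docid -> score
print(dict(index.apply('strachey')))
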
def __init__(self, stop_words):
    super(CanopyLexicon, self).__init__()
    self._pipeline = [Splitter(),
                      CustomStopWordRemover(stop_words),
                      OperatorEscaper()]

def _makeOne(self, family=None):
    from zope.index.text.lexicon import Lexicon
    from zope.index.text.lexicon import Splitter
    if family is None:
        family = self._getBTreesFamily()
    lexicon = Lexicon(Splitter())
    return self._getTargetClass()(lexicon, family=family)

def _default_indexes():
    return {
        'title': CatalogFieldIndex(get_title),
        'description': CatalogFieldIndex(get_description),
        'type_name': CatalogFieldIndex(get_type_name),
        'sortable_title': CatalogFieldIndex(get_sortable_title),
        'path': CatalogPathIndex(get_path),
        'searchable_text': CatalogTextIndex(get_searchable_text,
                                            lexicon=Lexicon(Splitter(),
                                                            CaseNormalizer())),
        'uid': CatalogFieldIndex(get_uid),
        'tags': CatalogKeywordIndex(get_tags),
        'search_visible': CatalogFieldIndex(get_search_visible),
        'date': CatalogFieldIndex(get_date),
        'modified': CatalogFieldIndex(get_modified),
        'created': CatalogFieldIndex(get_created),
        'wf_state': CatalogFieldIndex(get_wf_state),
        'workflow': CatalogFieldIndex(get_workflow),
    }.items()

def _makeIndexAndParser(self):
    from zope.index.text.lexicon import Lexicon
    from zope.index.text.lexicon import Splitter
    from zope.index.text.queryparser import QueryParser
    lexicon = Lexicon(Splitter())
    parser = QueryParser(lexicon)
    index = FauxIndex()
    return index, parser

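The parser built by _makeIndexAndParser is worth a standalone look: QueryParser turns a boolean query string into a parse tree whose nodes are later evaluated against an index. A small sketch, again assuming zope.index is installed; the query string is arbitrary:

from zope.index.text.lexicon import Lexicon, Splitter
from zope.index.text.queryparser import QueryParser

parser = QueryParser(Lexicon(Splitter()))
tree = parser.parseQuery('foo AND NOT bar')
print(tree)  # nested AndNode / NotNode / AtomNode objects
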
def prescan(self, f, msgs, uniqwords):
    pipeline = [Splitter(), CaseNormalizer(), StopWordRemover()]
    for n in msgs:
        print("prescanning", n)
        m = f.openmessage(n)
        text = self.getmessagetext(m, f.name)
        # Run each message's text through the full pipeline in order
        for p in pipeline:
            text = p.process(text)
        for word in text:
            uniqwords[word] = uniqwords.get(word, 0) + 1

def __init__(self, discriminator, lexicon=None, index=None):
    self._init_discriminator(discriminator)
    self._not_indexed = self.family.IF.Set()
    lexicon = lexicon or Lexicon(Splitter(), CaseNormalizer(),
                                 StopWordRemover())
    index = index or OkapiIndex(lexicon, family=self.family)
    ZopeTextIndex.__init__(self, lexicon, index)
    self.clear()

def __init__(self, field, stop_words=None):
    self.field = field
    splitter = Splitter()
    stop_word_remover = CustomStopWordRemover(stop_words or [])
    operator_escaper = OperatorEscaper()
    lexicon = Lexicon(splitter, stop_word_remover, operator_escaper)
    self._index = TextIndex(lexicon)
    self._index.index = CosineIndex(self._index.lexicon)
    self._i_to_id = {}
    self._parseTerms = self._index.lexicon.parseTerms

def __init__(self, lexicon=None, index=None):
    """Provisional constructor.

    This creates the lexicon and index if not passed in.
    """
    _explicit_lexicon = True
    if lexicon is None:
        _explicit_lexicon = False
        lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
    if index is None:
        index = OkapiIndex(lexicon)
    self.lexicon = lexicon if _explicit_lexicon else index.lexicon
    self.index = index

def __init__(self, discriminator, lexicon=None, index=None):
    _lexicon = lexicon
    if lexicon is None:
        _lexicon = Lexicon(
            Splitter(),
            CaseNormalizer(),
            StopWordRemover(),
        )
    if index is None:
        index = OkapiIndex(_lexicon, family=self.family)
    super(TextIndex, self).__init__(discriminator, lexicon, index)
    if lexicon is None:
        self.lexicon = index.lexicon
    self.index = index
    self.clear()

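The Splitter / CaseNormalizer / StopWordRemover pipeline that the constructors above fall back to can be exercised on its own: Lexicon.sourceToWordIds runs text through each pipeline stage in turn before assigning word ids. A minimal sketch (the input sentence is made up):

from zope.index.text.lexicon import (Lexicon, Splitter,
                                     CaseNormalizer, StopWordRemover)

lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
# 'The' and 'AND' are lowercased by CaseNormalizer, then dropped as stop words
print(lexicon.sourceToWordIds('The Quick Brown Fox AND the dog'))
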
def stopWords(data):
    index = TextIndex(Lexicon(Splitter()))

    for i, (_, doc) in enumerate(data, 1):
        index.index_doc(i, doc)

    # (document frequency, word) pairs, most frequent first
    doc_freq = [(len(index.index._wordinfo[wid]), word)
                for word, wid in index.lexicon.items()]
    doc_freq.sort(reverse=True)

    N = float(index.index.documentCount())
    threshold = int(max(1000, N * 0.05))

    stop_words = set()
    for frequency, word in doc_freq:
        if frequency > threshold:
            stop_words.add(word)
        else:
            break

    return stop_words

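A hypothetical use of stopWords() above, assuming the zope.index names it references (TextIndex, Lexicon, Splitter) are in scope: it indexes every (record_id, text) pair, sorts words by document frequency, and returns those occurring in more than max(1000, 5% of N) documents. With a toy corpus the threshold stays at 1000, so nothing qualifies:

data = [('a', 'the quick brown fox'),
        ('b', 'the lazy dog'),
        ('c', 'the end')]
print(stopWords(data))  # set() -- 'the' appears in all 3 docs, but 3 < 1000
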
def _makeLexicon(self, *pipeline):
    from zope.index.text.lexicon import Lexicon
    from zope.index.text.lexicon import Splitter
    if not pipeline:
        pipeline = (Splitter(),)
    return Lexicon(*pipeline)

def _makeOne(self, *pipeline):
    from zope.index.text.lexicon import Splitter
    pipeline = (Splitter(),) + pipeline
    return self._getTargetClass()(*pipeline)

def _makePipeline(self):
    from zope.index.text.lexicon import Splitter
    return (Splitter(), FakeStopWordRemover())

def _makePipeline(self):
    from zope.index.text.lexicon import Splitter
    return (Splitter(),)