def group(self, seq):
    """Group the document ids in *seq* by their value in the sort index.

    Generator.  Yields ``(key, docids)`` pairs in sort order (reversed
    when ``self._sortReverse`` is true), where *docids* is a BTrees set
    of documents sharing that sort value.  Documents without a value in
    the sort index are yielded last as a single ``(None, docids)`` pair.

    Two strategies, chosen by comparing result size to index size:
    iterate the index when the result is large, iterate the result when
    it is small.
    """
    sortIndex = self._sortIndex
    sortReverse = self._sortReverse
    ns = len(seq)
    ni = len(sortIndex)
    if ns >= 0.1 * ni:
        # result large compared to index -- sort via index
        handled = IISet()   # ids already yielded
        hn = 0              # count of ids already yielded
        _load = getattr(sortIndex, '_load', None)
        if _load is None:
            # not an optimized index
            # fall back to (key, value) pairs; _load then extracts the
            # docid set from each pair
            items = sortIndex.items()
            _load = lambda (x1, x2): x2
            if sortReverse:
                items.reverse()
        elif sortReverse:
            # optimized index: prefer its native reverse-order support
            gRO = getattr(sortIndex, 'getReverseOrder', None)
            items = gRO and gRO()
            if items is None:
                items = list(sortIndex._index.keys())
                items.reverse()
        else:
            items = sortIndex._index.keys()
        for i in items:
            # restrict each index bucket to the documents actually in seq
            ids = intersection(seq, _load(i))
            if ids:
                handled.update(ids)
                hn += len(ids)
                yield i, ids
        if hn != len(seq):
            # some documents had no value in the sort index
            yield None, difference(seq, handled)
    else:
        # result relatively small -- sort via result
        m = OOBTree()
        keyFor = getattr(sortIndex, 'keyForDocument', None)
        # work around "nogopip" bug: it defines "keyForDocument" as an integer
        if not callable(keyFor):
            # this will fail, when the index neither defines a reasonable
            # "keyForDocument" nor "documentToKeyMap". In this case,
            # the index cannot be used for sorting.
            keyFor = lambda doc, map=sortIndex.documentToKeyMap(): map[doc]
        noValue = IITreeSet()
        for doc in seq.keys():
            try:
                k = keyFor(doc)
            except KeyError:
                # document has no sort value; report it at the end
                noValue.insert(doc)
                continue
            l = m.get(k)
            if l is None:
                l = m[k] = IITreeSet()
            l.insert(doc)
        items = m.items()
        if sortReverse:
            items = list(items)
            items.reverse()
        for i in items:
            yield i
        if noValue:
            yield None, noValue
def group(self, seq):
    """Group the document ids in *seq* by their sort-index value.

    Generator yielding ``(key, docids)`` pairs in sort order (reversed
    when ``self._sortReverse`` is true); documents missing from the sort
    index are emitted last as one ``(None, docids)`` pair.  Picks its
    strategy by comparing the result size against the index size.
    """
    sort_index = self._sortIndex
    reverse = self._sortReverse
    result_size = len(seq)
    index_size = len(sort_index)

    if result_size >= 0.1 * index_size:
        # Large result: walk the index and intersect with the result.
        seen = IISet()
        seen_count = 0
        _load = getattr(sort_index, '_load', None)
        if _load is None:
            # Plain index: iterate (key, docids) pairs directly.
            items = sort_index.items()
            _load = lambda pair: pair[1]
            if reverse:
                items.reverse()
        elif reverse:
            # Optimized index: use its native reverse order if available.
            get_reverse = getattr(sort_index, 'getReverseOrder', None)
            items = get_reverse and get_reverse()
            if items is None:
                items = list(sort_index._index.keys())
                items.reverse()
        else:
            items = sort_index._index.keys()

        for entry in items:
            matching = intersection(seq, _load(entry))
            if matching:
                seen.update(matching)
                seen_count += len(matching)
                yield entry, matching

        if seen_count != len(seq):
            # Whatever was not covered by the index has no sort value.
            yield None, difference(seq, seen)
    else:
        # Small result: bucket each result document by its sort key.
        buckets = OOBTree()
        keyFor = getattr(sort_index, 'keyForDocument', None)
        # work around "nogopip" bug: it defines "keyForDocument" as an integer
        if not callable(keyFor):
            # this will fail, when the index neither defines a reasonable
            # "keyForDocument" nor "documentToKeyMap". In this case,
            # the index cannot be used for sorting.
            keyFor = lambda doc, map=sort_index.documentToKeyMap(): map[doc]

        missing = IITreeSet()
        for doc in seq.keys():
            try:
                key = keyFor(doc)
            except KeyError:
                missing.insert(doc)
                continue
            bucket = buckets.get(key)
            if bucket is None:
                bucket = buckets[key] = IITreeSet()
            bucket.insert(doc)

        items = buckets.items()
        if reverse:
            items = list(items)
            items.reverse()
        for entry in items:
            yield entry
        if missing:
            yield None, missing
def lookupWordsBySimilarity(self, word):
    """Perform a similarity lookup for *word*.

    Asks the lexicon for similar words (each paired with a similarity
    threshold), collects the document ids stored for every such word,
    and returns a ResultSet of the matching documents together with the
    used-words/threshold mapping.
    """
    lst = self._lexicon.getSimiliarWords(word)
    docids = IISet()
    used_words = {}
    getwid = self._lexicon.getWordId
    # use a distinct loop name so the original *word* argument is not shadowed
    for similar_word, threshold in lst:
        used_words[similar_word] = threshold
        wid = getwid(similar_word)
        # Skip words without a word id, mirroring the guard in _lookup();
        # without this, _storage.get(None) would be queried for unknown words.
        if wid is not None:
            docids.update(self._storage.get(wid))
    return ResultSet(docids, used_words)
def _lookup(self, words, do_autoexpand=1):
    """ search a word or a list of words in the lexicon and return a
        ResultSet of found documents.

        *words* is mutated in place when thesaurus expansion is active.
        Each looked-up word is recorded in ``used_words`` with weight 1.0;
        truncation-expanded words get a length-dependent weight from
        TRUNC_WEIGHT (presumably < 1.0 -- confirm against its definition).
    """
    docids = IISet()
    used_words = {}

    # remove stopwords from data
    if self.use_stopwords:
        words = self.use_stopwords.process( words )

    # expand every query term through the configured thesaurus;
    # iterate over a copy because the expansion appends to *words*
    if self.use_thesaurus and self.thesaurus_mode == 'expand_always':
        TH = ThesaurusRegistry.get(self.use_thesaurus)
        for word in words[:]:
            r = TH.getTermsFor(word)
            words.extend(r)

    for word in words:

        # perform casefolding if necessary
        if self.splitter_casefolding:
            word = word.lower()

        # normalize the (possibly casefolded) word before lexicon lookup
        if self.use_normalizer:
            word = NormalizerRegistry.get(self.use_normalizer).process(word)

        used_words[word] = 1.0

        wid = self._lexicon.getWordId(word)

        # Retrieve list of docIds for this wordid
        # (getWordId returns None for words not in the lexicon)
        if wid is not None:
            docids.update( self._storage.get(wid) )

        # perform autoexpansion of terms by performing
        # a search using right-truncation
        if do_autoexpand and self.autoexpand and len(word) >= self.autoexpand_limit:
            rs = self.lookupWordsByTruncation(word, right=1)
            docids.update(rs.docIds())
            wlen = len(word)
            # weight expanded words by how much longer they are than the
            # original term
            for w in rs.words().keys():
                used_words[w] = TRUNC_WEIGHT[len(w)-wlen]

    return ResultSet(docids, used_words)
def group(self, seq): sortIndex = self._sortIndex; sortReverse = self._sortReverse ns = len(seq); ni = len(sortIndex) if ns >= 0.1 * ni: # result large compared to index -- sort via index handled = IISet(); hn = 0 _load = getattr(sortIndex, '_load', None) if _load is None: # not an optimized index items = sortIndex.items() _load = lambda (x1, x2): x2 if sortReverse: items.reverse() elif sortReverse: gRO = getattr(sortIndex, 'getReverseOrder', None) items = gRO and gRO() if items is None: items = list(sortIndex._index.keys()); items.reverse() else: items = sortIndex._index.keys() for i in items: ids = intersection(seq, _load(i)) if ids: handled.update(ids); hn += len(ids) yield i, ids if hn != len(seq): yield None, difference(seq, handled) else: # result relatively small -- sort via result keyFor = sortIndex.keyForDocument; m = OOBTree() noValue = IITreeSet() for doc in seq.keys(): try: k = keyFor(doc) except KeyError: noValue.insert(doc); continue l = m.get(k) if l is None: l = m[k] = IITreeSet() l.insert(doc) items = m.items() if sortReverse: items = list(items); items.reverse() for i in items: yield i if noValue: yield None, noValue