def __getitem__(self, key):
    previous_stop = self.stop
    result = super(ListPrefetch, self).__getitem__(key)
    if self.stop != previous_stop:
        # Cache-ahead
        try:
            if isinstance(key, six.integer_types):
                tail = super(ListPrefetch, self).__getitem__(
                        slice(key + 1, key + self.prefetch_size + 1))
            elif isinstance(key, slice):
                if key.stop:
                    tail = super(ListPrefetch, self).__getitem__(
                            slice(key.stop + 1,
                                  key.stop + self.prefetch_size + 1))
                else:
                    tail = []
        except StopIteration:
            # If we've finished right at this element, that's not an error
            tail = []
        # Fetch the objects we are about to need
        if isinstance(result, list):
            prefetch(result + tail)
        elif isinstance(result, Persistent):
            prefetch([result] + tail)
    return result
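# A minimal, runnable sketch of the same cache-ahead idea, outside the project
# (hypothetical ReadAheadList class; prefetch() is replaced by a print so the
# example stands alone):
class ReadAheadList(list):
    prefetch_size = 4

    def __getitem__(self, key):
        result = super(ReadAheadList, self).__getitem__(key)
        if isinstance(key, int):
            # Peek a window past the requested index, like __getitem__ above
            tail = super(ReadAheadList, self).__getitem__(
                    slice(key + 1, key + self.prefetch_size + 1))
            print("would prefetch:", [result] + tail)  # stand-in for prefetch()
        return result

# >>> ReadAheadList(range(10))[2]
# would prefetch: [2, 3, 4, 5, 6]
# 2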
def index_doc(self, docid, text):
    if docid in self._docwords:
        return self._reindex_doc(docid, text)
    wids = self._lexicon.sourceToWordIds(text)
    # XXX Counter is slow. If that ever becomes an issue, we'd need a C module:
    # http://stackoverflow.com/questions/2522152/python-is-a-dictionary-slow-to-find-frequency-of-each-character
    widcnt = Counter(wids)
    widset = widcnt.keys()
    widcode = PersistentWid.encode_wid(wids if self.keep_phrases else widset)
    self._docwords[docid] = widcode
    if widset:
        weights, lengths = self._get_doctrees(widset)
        docscores = self._get_widscores(widcnt, docid)
        # Warm up the per-word weight trees and score objects before updating
        parallel_traversal(*zip(*[(weights[w], docscores[w]) for w in widset]))
        prefetch(list(lengths.values()) + [self.documentCount])
        for w in widset:
            weights[w].add(docscores[w])
            lengths[w].change(1)
    self.documentCount.change(1)
    return len(wids)
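# For reference, the Counter / widset step in isolation (hypothetical word ids;
# a plain dict loop would compute the same frequencies if Counter's speed ever
# becomes the bottleneck mentioned above):
from collections import Counter

wids = [3, 7, 3, 9, 3]      # word ids as the lexicon might return them
widcnt = Counter(wids)      # Counter({3: 3, 7: 1, 9: 1})
widset = widcnt.keys()      # unique wids
assert widcnt[3] == 3 and sorted(widset) == [3, 7, 9]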
def applyInRange(self, start, end, excludemin=False, excludemax=False):
    # prefetch is a catalog or None
    values = self._fwd_index.values(
            start, end, excludemin=excludemin, excludemax=excludemax)
    prefetch(values)
    return self.family.IF.multiunion(values)
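# The union step above, standalone with BTrees only (hypothetical docid sets;
# in applyInRange they come from self._fwd_index.values()):
from BTrees import family64

sets = [family64.IF.Set([1, 2, 5]), family64.IF.Set([2, 3])]
merged = family64.IF.multiunion(sets)
assert list(merged) == [1, 2, 3, 5]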
from collections import defaultdict
from itertools import islice

import six
from six.moves import xrange, zip as izip
from sortedcontainers import SortedSet  # assumption: the SortedSet used here

# prefetch() and the score trees come from the surrounding project


def mass_weightedUnion(L):
    """
    Incremental version of mass_weightedUnion

    :param list L: (TreeSet((-score, docid)), weight) elements
    :returns: iterator of (docid, score) pairs, ordered from large to small
        sum(score * weight)
    """
    cache_size = 40
    order_size = 15
    order_violation = 3
    cache_updated = None

    if len(L) == 0:
        return
    elif len(L) == 1:
        # Trivial case: one tree, just rescale its scores
        tree, weight = L[0]
        # XXX need to make it possible to advance the tree N elements ahead!
        for (score, docid) in tree:
            yield (docid, -score * weight)
    else:
        # XXX make into an iterator class
        trees, weights = zip(*L)
        prefetch(trees)
        prefetch([x._firstbucket for x in trees if x._firstbucket is not None])
        unread_max = [-t.minKey()[0] * w for t, w in L]
        lengths = list(map(len, trees))  # list() so it survives repeated use
        iters = dict(enumerate(map(iter, trees)))
        caches = [{} for i in range(len(L))]
        docid2cacheid = defaultdict(list)
        cache_len = None
        maxscores = [-1] * len(L)
        used = set()
        sorted_mins = SortedSet()  # contains tuples (-min_weight, docid)
        mins_dict = {}  # {docid -> min_weight}
        docids = []

        def precache(i, size):
            try:
                for j in xrange(size):
                    score, docid = next(iters[i])
                    score = -score * weights[i]
                    if unread_max[i] > score:
                        unread_max[i] = score
                    if docid not in used:
                        caches[i][docid] = score
                        docid2cacheid[docid].append(i)
                        if maxscores[i] < 0:
                            maxscores[i] = score
                        # Documents are never repeated within one iterator,
                        # so this is the first time iterator i meets docid:
                        # just add the score to the current minscore
                        total_score = mins_dict.get(docid, 0.0) + score
                        if docid in mins_dict:
                            sorted_mins.remove((-mins_dict[docid], docid))
                        mins_dict[docid] = total_score
                        sorted_mins.add((-total_score, docid))
            except StopIteration:
                del iters[i]
                unread_max[i] = 0

        while True:
            # Main loop which yields values to make an iterator.
            # Advance iterators when needed / fill caches to keep them long
            # enough. A smarter pre-read policy may exist; this is the simplest.
            cache_updated = False
            for i in list(iters.keys()):
                cache = caches[i]
                if len(cache) < cache_size // 2:
                    precache(i, cache_size - len(cache))
                    cache_len = sum(map(len, caches))
                    cache_updated = True

            if cache_updated or (cache_len is not None and
                                 (cache_len > order_violation) and
                                 (len(docids) < order_violation)):
                while True:
                    max_sum = sum(unread_max)
                    mins = []
                    docids = []
                    maxs = []
                    for w, docid in islice(iter(sorted_mins), order_size):
                        mins.append(-w)
                        docids.append(docid)
                        cacheids = docid2cacheid[docid]
                        # Slower equivalent:
                        #   maxs = sum(c.get(docid, m)
                        #              for c, m in izip(caches, unread_max))
                        # because
                        #   -w = sum(c.get(docid, 0)
                        #            for c, m in izip(caches, unread_max))
                        maxs.append(max_sum -
                                    sum(unread_max[i] for i in cacheids) - w)
                    violated = False
                    for i in xrange(len(mins) - order_violation - 1):
                        # Naive implementation. Could scan from the tail,
                        # in batches of size order_violation, for speed
                        for m in maxs[i + order_violation:]:
                            if m > mins[i]:
                                violated = True
                                break
                        if violated:
                            break
                    if len(mins) == order_size:
                        # Last check: can the order be violated by
                        # out-of-cache elements?
                        if mins[len(mins) - order_violation] < max_sum:
                            violated = True
                    if not violated:
                        break
                    else:
                        # XXX instead we could read just enough elements to
                        # lower sum(unread_max) sufficiently; that needs
                        # something like an "advance_to" method
                        precache(
                            max(enumerate(m / max(l - len(c), 1)
                                          for m, l, c in izip(unread_max,
                                                              lengths, caches)),
                                key=lambda x: x[1])[0],
                            cache_size // 2)
                        cache_len = sum(map(len, caches))

            if not docids:
                break

            minw = mins.pop(0)
            maxw = maxs.pop(0)
            docid = docids.pop(0)
            for i, c in enumerate(caches):
                if docid in c:
                    if c[docid] == unread_max[i]:
                        del c[docid]
                        unread_max[i] = max(six.itervalues(c))
                    else:
                        del c[docid]
                    cache_len -= 1
            used.add(docid)
            sorted_mins.remove((-minw, docid))
            del mins_dict[docid]
            # Score bounds have converged enough; yield their midpoint
            yield docid, (minw + maxw) / 2.0
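# A brute-force reference for what mass_weightedUnion computes incrementally
# (hypothetical helper for tests/illustration): materialize every tree, sum
# score * weight per docid, sort descending. Note the incremental version
# yields the midpoint of each docid's score bounds rather than the exact sum.
from collections import defaultdict

def mass_weighted_union_bruteforce(L):
    totals = defaultdict(float)
    for tree, weight in L:
        for neg_score, docid in tree:   # trees hold (-score, docid) pairs
            totals[docid] += -neg_score * weight
    return sorted(totals.items(), key=lambda kv: -kv[1])

# >>> mass_weighted_union_bruteforce([([(-2.0, 'a'), (-1.0, 'b')], 1.5)])
# [('a', 3.0), ('b', 1.5)]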