def mergeResults(results, has_sort_keys, reverse):
    """Sort/merge sub-results, generating a flat sequence.

    results is a list of result set sequences, all with or without sort keys.

    Without sort keys the sub-results are handed to ``LazyCat`` unchanged.

    With sort keys every record is expected to be a three-tuple
    ``(sortkey, docid, catalog__getitem__)``.  The records of all
    sub-results are combined into one new list, sorted ascending (and then
    reversed when ``reverse`` is true) and wrapped in a ``LazyMap`` that
    calls ``rec[2](rec[1])`` for each record.  An empty ``results`` list
    yields a plain empty list.

    The sequences passed in are never modified.
    """
    if not has_sort_keys:
        return LazyCat(results)
    if not results:
        return []

    # Always build a fresh list.  Sorting ``results[0]`` directly (as was
    # done for the single-sub-result case) reorders the caller's data as a
    # side effect.
    merged = [rec for sub_result in results for rec in sub_result]
    merged.sort()
    if reverse:
        merged.reverse()
    return LazyMap(lambda rec: rec[2](rec[1]), merged, len(merged))
def sortResults(self, rs, sort_index, reverse=0, limit=None, merge=1,
                actual_result_count=None, b_start=0, b_size=None):
    """Sort a result set using a sort index.

    rs -- the result set; if it has a ``keys`` method, its keys are used.
    sort_index -- index providing ``documentToKeyMap()``, ``items()`` and
        ``len()``.
    reverse -- true for descending order.
    limit -- optional maximum number of sorted records that are needed.
    merge -- if true, return a lazy result set in sorted order; otherwise
        return a list of ``(sortkey, docid, getter_function)`` tuples.
    actual_result_count -- total number of hits; defaults to ``len(rs)``.
    b_start, b_size -- optional batch window inside the sorted result.

    If the requested batch starts at or beyond the end of the result set,
    an empty ``LazyCat`` is returned.

    Note that in the plain-iteration strategy an unmerged result
    (``merge`` false) is returned without being sorted here; presumably
    the caller sorts it later (e.g. via ``mergeResults``) -- confirm
    against the callers.
    """
    # The 'for' loops in here contribute a significant proportion of the
    # time to perform an indexed search.  Avoid all non-local attribute
    # lookup inside those loops by binding what they need to locals.
    _self__getitem__ = self.__getitem__
    index_key_map = sort_index.documentToKeyMap()
    _keyerror = KeyError
    result = []
    append = result.append

    if hasattr(rs, 'keys'):
        rs = rs.keys()
    if actual_result_count is None:
        rlen = len(rs)
        actual_result_count = rlen
    else:
        rlen = actual_result_count

    # don't limit to more than what we have
    if limit is not None and limit >= rlen:
        limit = rlen

    # If we want a batch from the end of the result set, reverse the
    # sorting order and limit it, then reverse the batch again.
    switched_reverse = False
    if b_size and b_start and b_start > rlen / 2:
        reverse = not reverse
        switched_reverse = True
        b_end = b_start + b_size
        if b_end >= rlen:
            if b_start >= rlen:
                # bail out, we are outside the possible range
                return LazyCat([], 0, actual_result_count)
            # The batch runs past the end: shrink it to what exists.  It
            # then sits at the very start of the reversed ordering.
            b_size += rlen - b_end
            b_start = 0
        else:
            # Items [b_start, b_end) of the requested ordering are items
            # [rlen - b_end, rlen - b_start) of the reversed ordering.
            # (Using ``b_end - b_start`` here would merely yield b_size
            # and select the wrong window.)
            b_start = rlen - b_end
        limit = b_start + b_size

    if merge and limit is None and (
            rlen > (len(sort_index) * (rlen / 100 + 1))):
        # The result set is much larger than the sorted index,
        # so iterate over the sorted index for speed.
        # This is rarely exercised in practice...
        _intersection = intersection
        length = 0

        try:
            _intersection(rs, IISet(()))
        except TypeError:
            # rs is not an object in the IIBTree family.
            # Try to turn rs into an IISet.
            rs = IISet(rs)

        for k, intset in sort_index.items():
            # We have an index that has a set of values for
            # each sort key, so we intersect with each set and
            # get a sorted sequence of the intersections.
            intset = _intersection(rs, intset)
            if intset:
                keys = getattr(intset, 'keys', None)
                if keys is not None:
                    # Is this ever true?
                    intset = keys()
                length += len(intset)
                append((k, intset, _self__getitem__))
                # Note that sort keys are unique.

        result.sort(reverse=bool(reverse))
        sequence, slen = self._limit_sequence(result, length, b_start,
                                              b_size, switched_reverse)
        result = LazyCat(LazyValues(sequence), slen, actual_result_count)
    else:
        if limit is None or (limit * 4 > rlen):
            # Iterate over the result set getting sort keys from the index
            for did in rs:
                try:
                    key = index_key_map[did]
                except _keyerror:
                    # This document is not in the sort key index, skip it.
                    pass
                else:
                    # The reference back to __getitem__ is used in case
                    # we do not merge now and need to intermingle the
                    # results with those of other catalogs while avoiding
                    # the cost of instantiating a LazyMap per result
                    append((key, did, _self__getitem__))
            if merge:
                result.sort(reverse=bool(reverse))
                if limit is not None:
                    result = result[:limit]
        elif reverse:
            # Limit/sort results using N-Best algorithm
            # This is faster for large sets than a full sort
            # and uses far less memory
            keys = []
            n = 0
            worst = None
            for did in rs:
                try:
                    key = index_key_map[did]
                except _keyerror:
                    # This document is not in the sort key index, skip it.
                    pass
                else:
                    if n >= limit and key <= worst:
                        continue
                    i = bisect(keys, key)
                    keys.insert(i, key)
                    result.insert(i, (key, did, _self__getitem__))
                    if n == limit:
                        del keys[0], result[0]
                    else:
                        n += 1
                    worst = keys[0]
            result.reverse()
        else:
            # Limit/sort results using N-Best algorithm in reverse
            # (N-Worst?)
            keys = []
            n = 0
            best = None
            for did in rs:
                try:
                    key = index_key_map[did]
                except _keyerror:
                    # This document is not in the sort key index, skip it.
                    pass
                else:
                    if n >= limit and key >= best:
                        continue
                    i = bisect(keys, key)
                    keys.insert(i, key)
                    result.insert(i, (key, did, _self__getitem__))
                    if n == limit:
                        del keys[-1], result[-1]
                    else:
                        n += 1
                    best = keys[-1]

        # Common tail of the three strategies above: cut out the batch and
        # either hand back the raw tuples or wrap them lazily.
        sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                           switched_reverse)
        if not merge:
            return sequence
        result = LazyValues(sequence)
        result.actual_result_count = actual_result_count

    return LazyMap(self.__getitem__, result, len(result),
                   actual_result_count=actual_result_count)
def search(self, query, sort_index=None, reverse=0, limit=None, merge=1):
    """Iterate through the indexes, applying the query to each one.

    If merge is true then return a lazy result set (sorted if
    appropriate), otherwise return the raw (possibly scored) results for
    later merging.

    Limit is used in conjunction with sorting or scored results to inform
    the catalog how many results you are really interested in.  The
    catalog can then use optimizations to save time and memory.  The
    number of results is not guaranteed to fall within the limit however;
    you should still slice or batch the results as usual.

    The query keys ``b_start`` and ``b_size`` are read as batching hints:
    when ``b_size`` is present the limit becomes ``b_start + b_size``,
    otherwise a truthy ``limit`` is used as the batch size.
    """
    rs = None  # resultset: running intersection of all index results

    # Indexes fulfill a fairly large contract here.  We hand each index
    # the query mapping we are given (which may be composed of some
    # combination of web request, kw mappings or plain old dicts) and the
    # index decides what to do with it.  If the index finds work for
    # itself in the query, it returns the results and a tuple of the
    # attributes that were used.  If the index finds nothing for it to do
    # then it returns None.

    # Canonicalize the request into a sensible query before passing it on
    query = self.make_query(query)

    cr = self.getCatalogPlan(query)
    cr.start()

    plan = cr.plan() or self._sorted_search_indexes(query)

    known_indexes = self.indexes.keys()
    for name in plan:
        if name not in known_indexes:
            # We can have bogus keys or the plan can contain index names
            # that have been removed in the meantime
            continue

        index = self.getIndex(name)
        apply_index = getattr(index, "_apply_index", None)
        if apply_index is None:
            continue

        cr.start_split(name)
        limit_result = ILimitedResultIndex.providedBy(index)
        if limit_result:
            applied = apply_index(query, rs)
        else:
            applied = apply_index(query)

        if applied is None:
            # The index had nothing to do for this query.
            cr.stop_split(name, result=None, limit=limit_result)
            continue

        hits, _used = applied

        # Short circuit if empty result
        # BBB: We can remove the "is not None" check in Zope 2.14
        # once we don't need to support the "return everything" case
        # anymore
        if hits is not None and not hits:
            cr.stop_split(name, result=None, limit=limit_result)
            return LazyCat([])

        # provide detailed info about the pure intersection time
        intersect_id = name + '#intersection'
        cr.start_split(intersect_id)
        _weight, rs = weightedIntersection(rs, hits)
        cr.stop_split(intersect_id)

        # consider the time it takes to intersect the index result with
        # the total resultset to be part of the index time
        cr.stop_split(name, result=hits, limit=limit_result)
        if not rs:
            break

    # Try to deduce the sort limit from batching arguments
    b_start = int(query.get('b_start', 0))
    b_size = query.get('b_size', None)
    if b_size is not None:
        b_size = int(b_size)
        limit = b_start + b_size
    elif limit:
        b_size = limit

    if rs is None:
        # None of the indexes found anything to do with the query.
        # We take this to mean that the query was empty (an empty filter)
        # and so we return everything in the catalog.
        warnings.warn('Your query %s produced no query restriction. '
                      'Currently the entire catalog content is returned. '
                      'In Zope 2.14 this will result in an empty LazyCat '
                      'to be returned.' % repr(make_key(self, query)),
                      DeprecationWarning, stacklevel=3)

        rlen = len(self)
        if sort_index is None:
            sequence, slen = self._limit_sequence(
                self.data.items(), rlen, b_start, b_size)
            result = LazyMap(self.instantiate, sequence, slen,
                             actual_result_count=rlen)
        else:
            cr.start_split('sort_on')
            result = self.sortResults(
                self.data, sort_index, reverse, limit, merge,
                actual_result_count=rlen, b_start=b_start, b_size=b_size)
            cr.stop_split('sort_on', None)
    elif not rs:
        # Empty result set
        result = LazyCat([])
    else:
        # We got some results from the indexes.
        # Sort and convert to sequences.
        rlen = len(rs)
        if sort_index is not None:
            # Explicit sort.  A scored result set never gets sorted by
            # score in this case, so 'sort-on' does not combine with the
            # relevance of a text index query.  This should probably sort
            # by relevance first, then by the 'sort-on' attribute.
            cr.start_split('sort_on')
            result = self.sortResults(
                rs, sort_index, reverse, limit, merge,
                actual_result_count=rlen, b_start=b_start, b_size=b_size)
            cr.stop_split('sort_on', None)
        elif hasattr(rs, 'values'):
            # XXX: The check for 'values' is really stupid since we call
            # items() and *not* values()
            #
            # Having a 'values' means we have a data structure with
            # scores.
            if not merge:
                # Don't bother to sort here, return a list of
                # three-tuples to be passed later to mergeResults.
                # Note that data_record_normalized_score_ cannot be
                # calculated and will always be 1 in this case.
                getitem = self.__getitem__
                result = [(score, (1, score, rid), getitem)
                          for rid, score in rs.items()]
            else:
                # Build a new result set sorted by score, compute the
                # normalized score lazily, and Lazify it.
                cr.start_split('sort_on')

                rs = rs.byValue(0)  # sort it by score
                top_score = float(rs[0][0])

                # The getter is defined inline so that the top score can
                # be stored as a default argument, keeping the normalized
                # score computation lazy.
                def getScoredResult(item, max=top_score, self=self):
                    """
                    Returns instances of self._v_brains, or whatever is
                    passed into self.useBrains.
                    """
                    score, key = item
                    brain = self._v_result_class(self.data[key])
                    r = brain.__of__(aq_parent(self))
                    r.data_record_id_ = key
                    r.data_record_score_ = score
                    r.data_record_normalized_score_ = int(
                        100. * score / max)
                    return r

                sequence, slen = self._limit_sequence(
                    rs, rlen, b_start, b_size)
                result = LazyMap(getScoredResult, sequence, slen,
                                 actual_result_count=rlen)
                cr.stop_split('sort_on', None)
        else:
            # no scores
            if hasattr(rs, 'keys'):
                rs = rs.keys()
            sequence, slen = self._limit_sequence(
                rs, rlen, b_start, b_size)
            result = LazyMap(self.__getitem__, sequence, slen,
                             actual_result_count=rlen)

    cr.stop()
    return result
def search(self, request, sort_index=None, reverse=0, limit=None, merge=1):
    """Iterate through the indexes, applying the query to each one.

    If merge is true then return a lazy result set (sorted if
    appropriate), otherwise return the raw (possibly scored) results for
    later merging.

    Limit is used in conjunction with sorting or scored results to inform
    the catalog how many results you are really interested in.  The
    catalog can then use optimizations to save time and memory.  The
    number of results is not guaranteed to fall within the limit however;
    you should still slice or batch the results as usual.
    """
    rs = None  # resultset: running intersection of all index results

    # Indexes fulfill a fairly large contract here.  We hand each index
    # the request mapping we are given (which may be composed of some
    # combination of web request, kw mappings or plain old dicts) and the
    # index decides what to do with it.  If the index finds work for
    # itself in the request, it returns the results and a tuple of the
    # attributes that were used.  If the index finds nothing for it to do
    # then it returns None.

    # For hysterical reasons, if all indexes return None for a given
    # request (and no attributes were used) then we append all results
    # in the Catalog.  This generally happens when the search values in
    # request are all empty strings or do not correspond to any of the
    # indexes.

    # Note that if the indexes find query arguments, but the end result
    # is an empty sequence, we do nothing
    for name in self.indexes.keys():
        index = self.getIndex(name)
        apply_index = getattr(index, "_apply_index", None)
        if apply_index is None:
            continue
        applied = apply_index(request)
        if applied is None:
            continue
        hits, _used = applied
        _weight, rs = weightedIntersection(rs, hits)

    if rs is None:
        # None of the indexes found anything to do with the request.
        # We take this to mean that the query was empty (an empty filter)
        # and so we return everything in the catalog.
        if sort_index is None:
            return LazyMap(self.instantiate, self.data.items(), len(self))
        return self.sortResults(
            self.data, sort_index, reverse, limit, merge)

    if not rs:
        # Empty result set
        return LazyCat([])

    # We got some results from the indexes.
    # Sort and convert to sequences.
    if sort_index is not None:
        # Explicit sort.  A scored result set never gets sorted by score
        # in this case, so 'sort-on' does not combine with the relevance
        # of a text index query.  This should probably sort by relevance
        # first, then by the 'sort-on' attribute.
        return self.sortResults(rs, sort_index, reverse, limit, merge)

    if hasattr(rs, 'values'):
        # XXX: The check for 'values' is really stupid since we call
        # items() and *not* values()
        #
        # Having a 'values' means we have a data structure with scores.
        if not merge:
            # Don't bother to sort here, return a list of three-tuples
            # to be passed later to mergeResults.  Note that
            # data_record_normalized_score_ cannot be calculated and
            # will always be 1 in this case.
            getitem = self.__getitem__
            return [(score, (1, score, rid), getitem)
                    for rid, score in rs.items()]

        # Build a new result set sorted by score, compute the normalized
        # score lazily, and Lazify it.
        rs = rs.byValue(0)  # sort it by score
        top_score = float(rs[0][0])

        # The getter is defined inline so that the top score can be
        # stored as a default argument, keeping the normalized score
        # computation lazy.
        def getScoredResult(item, max=top_score, self=self):
            """
            Returns instances of self._v_brains, or whatever is passed
            into self.useBrains.
            """
            score, key = item
            brain = self._v_result_class(self.data[key])
            r = brain.__of__(self.aq_parent)
            r.data_record_id_ = key
            r.data_record_score_ = score
            r.data_record_normalized_score_ = int(100. * score / max)
            return r

        return LazyMap(getScoredResult, rs, len(rs))

    # no scores
    if hasattr(rs, 'keys'):
        rs = rs.keys()
    return LazyMap(self.__getitem__, rs, len(rs))