Example #1
0
def mass_weightedIntersection(L):
    "A list of (mapping, weight) pairs -> their weightedIntersection IIBucket."
    L = [(x, wx) for (x, wx) in L if x is not None]
    if len(L) < 2:
        return _trivial(L)
    # Intersect with smallest first.  We expect the input maps to be
    # IIBuckets, so it doesn't hurt to get their lengths repeatedly
    # (len(Bucket) is fast; len(BTree) is slow).
    L.sort(lambda x, y: cmp(len(x[0]), len(y[0])))
    (x, wx), (y, wy) = L[:2]
    dummy, result = weightedIntersection(x, y, wx, wy)
    for x, wx in L[2:]:
        dummy, result = weightedIntersection(result, x, 1, wx)
    return result
    def defaultSearch(self, req, expectedValues=None, verbose=False):

        rs = None
        for index in self._indexes:
            st = time()
            duration = (time() - st) * 1000

            limit_result = ILimitedResultIndex.providedBy(index)
            if limit_result:
                r = index._apply_index(req, rs)
            else:
                r = index._apply_index(req)
            duration = (time() - st) * 1000

            if r is not None:
                r, u = r
                w, rs = weightedIntersection(rs, r)
                if not rs:
                    break

            if verbose and (index.id in req):
                logger.info('index %s: %s hits in %3.2fms',
                            index.id, r and len(r) or 0, duration)

        if not rs:
            return set()

        try:
            rs = rs.keys()
        except AttributeError:
            pass

        return set(rs)
Example #3
0
    def count(self, brains, sequence=None):
        """ Intersect results
        """
        res = {}
        if not sequence:
            sequence = self.years

        if not sequence:
            return res

        index_id = 'effective'
        ctool = getToolByName(self.context, 'portal_catalog')
        index = ctool._catalog.getIndex(index_id)
        ctool = queryUtility(IFacetedCatalog)
        if not ctool:
            return res
        brains = IISet(brain.getRID() for brain in brains)
        res[""] = res['all'] = len(brains)
        for value in sequence:
            if not value:
                res[value] = len(brains)
                continue
            year = int(value)
            start = DateTime(year, 1, 1)
            end = DateTime(year, 12, 31).latestTime()
            query = {
                'query': (start, end),
                'range': 'min:max'
            }
            rset = ctool.apply_index(self.context, index, query)[0]
            rset = IISet(rset)
            rset = weightedIntersection(brains, rset)[1]
            res[value] = len(rset)
        return res
Example #4
0
    def count(self, brains, sequence=None):
        """ Intersect results
        """
        res = {}
        # by checking for facet_counts we assume this is a SolrResponse
        # from collective.solr
        if hasattr(brains, 'facet_counts'):
            facet_fields = brains.facet_counts.get('facet_fields')
            if facet_fields:
                index_id = self.data.get('index')
                facet_field = facet_fields.get(index_id, {})
                for value, num in facet_field.items():
                    if isinstance(value, unicode):
                        res[value] = num
                    else:
                        unicode_value = value.decode('utf-8')
                    res[unicode_value] = num
            else:
                # no facet counts were returned. we exit anyway because
                # zcatalog methods throw an error on solr responses
                return res
            res[""] = res['all'] = len(brains)
            return res
        else:
            # this is handled by the zcatalog. see below
            pass

        if not sequence:
            sequence = [key for key, value in self.vocabulary()]

        if not sequence:
            return res

        index_id = self.data.get('index')
        if not index_id:
            return res

        ctool = getToolByName(self.context, 'portal_catalog')
        index = ctool._catalog.getIndex(index_id)
        ctool = queryUtility(IFacetedCatalog)
        if not ctool:
            return res

        brains = IISet(brain.getRID() for brain in brains)
        res[""] = res['all'] = len(brains)
        for value in sequence:
            item = uuidToCatalogBrain(value)
            if not item:
                res[value] = len(brains)
                continue
            rset = ctool.apply_index(self.context, index, item.getPath())[0]
            rset = IISet(rset)
            rset = weightedIntersection(brains, rset)[1]
            if isinstance(value, unicode):
                res[value] = len(rset)
            else:
                unicode_value = value.decode('utf-8')
                res[unicode_value] = len(rset)
        return res
Example #5
0
def mass_weightedIntersection(l):
    "A list of (mapping, weight) pairs -> their weightedIntersection IIBucket."
    l = [(x, wx) for (x, wx) in l if x is not None]
    if len(l) < 2:
        return _trivial(l)
    # Intersect with smallest first. We expect the input maps to be
    # IIBuckets, so it doesn't hurt to get their lengths repeatedly
    # (len(Bucket) is fast; len(BTree) is slow).

    def _key(value):
        return len(value)

    l.sort(key=_key)
    (x, wx), (y, wy) = l[:2]
    dummy, result = weightedIntersection(x, y, wx, wy)
    for x, wx in l[2:]:
        dummy, result = weightedIntersection(result, x, 1, wx)
    return result
    def _apply_index(self, request, cid=''):
        """ Apply the index to query parameters given in the argument,
        request

        The argument should be a mapping object.

        If the request does not contain the needed parameters, then
        None is returned.

        Otherwise two objects are returned.  The first object is a
        ResultSet containing the record numbers of the matching
        records.  The second object is a tuple containing the names of
        all data fields used.
        """

        record = parseIndexRequest(request,self.id,self.query_options)
        if record.keys==None: return None

        # Changed for 2.4
        # We use the default operator that can me managed via the ZMI

        qop = record.get('operator', self.useOperator)

        # We keep this for pre-2.4 compatibility
        # This stinking code should go away somewhere. A global
        # textindex_operator makes no sense when using multiple
        # text indexes inside a catalog. An index operator should
        # should be specified on a per-index base

        if request.has_key('textindex_operator'):
            qop = request['textindex_operator']
            warnings.warn("The usage of the 'textindex_operator' "
                          "is no longer recommended.\n"
                          "Please use a mapping object and the "
                          "'operator' key to specify the operator.")

        query_operator = operator_dict.get(qop)
        if query_operator is None:
            raise exceptions.RuntimeError, ("Invalid operator '%s' "
                                            "for a TextIndex" % escape(qop))
        r = None

        for key in record.keys:
            key = key.strip()
            if not key:
                continue

            b = self.query(key, query_operator).bucket()
            w, r = weightedIntersection(r, b)

        if r is not None:
            return r, (self.id,)

        return (IIBucket(), (self.id,))
def filter_rids(catalog_tool, filters):
    ret = None

    for index_id in catalog_tool._catalog.indexes.keys():
        index = catalog_tool._catalog.getIndex(index_id)
        r = index._apply_index(filters)
        if r is not None:
            r, _ = r
            _, ret = weightedIntersection(ret, r)

    return ret
Example #8
0
    def _apply_index(self, request, cid=''):
        """ Apply the index to query parameters given in the argument,
        request

        The argument should be a mapping object.

        If the request does not contain the needed parameters, then
        None is returned.

        Otherwise two objects are returned.  The first object is a
        ResultSet containing the record numbers of the matching
        records.  The second object is a tuple containing the names of
        all data fields used.
        """
        if request.has_key(self.id):
            keys = request[self.id]
        else:
            return None

        operators = {
            'andnot':AndNot,
            'and':And,
            'near':Near,
            'or':Or
            }

        query_operator = Or
        # We default to 'or' if we aren't passed an operator in the request
        # or if we can't make sense of the passed-in operator

        if request.has_key('textindex_operator'):
            op=string.lower(str(request['textindex_operator']))
            query_operator = operators.get(op, query_operator)

        if type(keys) is StringType:
            if not keys or not string.strip(keys):
                return None
            keys = [keys]

        r = None

        for key in keys:
            key = string.strip(key)
            if not key:
                continue

            b = self.query(key, query_operator).bucket()
            w, r = weightedIntersection(r, b)

        if r is not None:
            return r, (self.id,)

        return (IIBucket(), (self.id,))
    def _search_index(self, cr, index_id, query, rs):
        cr.start_split(index_id)

        index_rs = None
        index = self.getIndex(index_id)
        limit_result = ILimitedResultIndex.providedBy(index)

        if IQueryIndex.providedBy(index):
            index_query = IndexQuery(query, index.id, index.query_options,
                                     index.operators, index.useOperator)
            if index_query.keys is not None:
                index_rs = index.query_index(index_query, rs)
        else:
            if limit_result:
                index_result = index._apply_index(query, rs)
            else:
                index_result = index._apply_index(query)

            # Parse (resultset, used_attributes) index return value.
            if index_result:
                index_rs, _ = index_result

        if not index_rs:
            # Short circuit if empty index result.
            rs = None
        else:
            # Provide detailed info about the pure intersection time.
            intersect_id = index_id + '#intersection'
            cr.start_split(intersect_id)
            # weightedIntersection preserves the values from any mappings
            # we get, as some indexes don't return simple sets.
            if hasattr(rs, 'items') or hasattr(index_rs, 'items'):
                _, rs = weightedIntersection(rs, index_rs)
            else:
                rs = intersection(rs, index_rs)

            cr.stop_split(intersect_id)

        # Consider the time it takes to intersect the index result
        # with the total result set to be part of the index time.
        cr.stop_split(index_id, result=index_rs, limit=limit_result)

        return rs
    def count(self, brains, sequence=None):
        """ Intersect results
        """
        res = {}
        if not sequence:
            sequence = [key for key, value in self.vocabulary()]

        if not sequence:
            return res

        index_id = self.data.get('index')
        if not index_id:
            return res

        ctool = getToolByName(self.context, 'portal_catalog')
        index = ctool._catalog.getIndex(index_id)
        ctool = queryUtility(IFacetedCatalog)
        if not ctool:
            return res

        brains = IISet(brain.getRID() for brain in brains)
        res[""] = res['all'] = len(brains)
        for value in sequence:
            if not value:
                res[value] = len(brains)
                continue
            if isinstance(value, unicode):
                try:
                    value = value.encode('utf-8')
                except Exception:
                    continue
            rset = ctool.apply_index(self.context, index, value)[0]
            rset = IISet(rset)
            rset = weightedIntersection(brains, rset)[1]
            if isinstance(value, str):
                try:
                    value = value.decode('utf-8')
                except Exception:
                    continue
            res[value] = len(rset)
        return res
Example #11
0
    def apply_index(self, index, value):
        """ Apply index according with portal type mapping
        """
        index_id = index.getId()
        if index_id != 'portal_type':
            return self._apply_index(index, value)

        if value not in self.context.objectIds():
            return self._apply_index(index, value)

        facet = self.context._getOb(value)

        rset = IIBucket()
        ptype = getattr(facet, 'search_type', None)
        if ptype:
            rset = self._apply_index(index, ptype)
            if rset:
                rset = IISet(rset[0])

        index = self.catalog._catalog.getIndex('object_provides')
        if not index:
            return rset, (index_id,)

        interface = getattr(facet, 'search_interface', None)
        if not interface:
            return rset, (index_id,)

        oset = self._apply_index(index, interface)
        if not oset:
            return rset, (index_id,)

        oset = IISet(oset[0])

        if not rset:
            return oset, (index_id,)

        rset = weightedIntersection(rset, oset)[1]

        return rset, (index_id,)
Example #12
0
    def search(self,
            query, sort_index=None, reverse=False, limit=None, merge=True):
        """Iterate through the indexes, applying the query to each one. If
        merge is true then return a lazy result set (sorted if appropriate)
        otherwise return the raw (possibly scored) results for later merging.
        Limit is used in conjuntion with sorting or scored results to inform
        the catalog how many results you are really interested in. The catalog
        can then use optimizations to save time and memory. The number of
        results is not guaranteed to fall within the limit however, you should
        still slice or batch the results as usual."""

        # Indexes fulfill a fairly large contract here. We hand each
        # index the query mapping we are given (which may be composed
        # of some combination of web request, kw mappings or plain old dicts)
        # and the index decides what to do with it. If the index finds work
        # for itself in the query, it returns the results and a tuple of
        # the attributes that were used. If the index finds nothing for it
        # to do then it returns None.

        # Canonicalize the request into a sensible query before passing it on
        query = self.make_query(query)

        cr = self.getCatalogPlan(query)
        cr.start()

        plan = cr.plan()
        if not plan:
            plan = self._sorted_search_indexes(query)

        rs = None  # result set
        indexes = self.indexes.keys()
        for i in plan:
            if i not in indexes:
                # We can have bogus keys or the plan can contain index names
                # that have been removed in the meantime
                continue

            index = self.getIndex(i)
            _apply_index = getattr(index, "_apply_index", None)
            if _apply_index is None:
                continue

            cr.start_split(i)
            limit_result = ILimitedResultIndex.providedBy(index)
            if limit_result:
                r = _apply_index(query, rs)
            else:
                r = _apply_index(query)

            if r is not None:
                r, u = r
                # Short circuit if empty result
                # BBB: We can remove the "r is not None" check in Zope 4
                # once we don't need to support the "return everything" case
                # anymore
                if r is not None and not r:
                    cr.stop_split(i, result=None, limit=limit_result)
                    return LazyCat([])

                # provide detailed info about the pure intersection time
                intersect_id = i + '#intersection'
                cr.start_split(intersect_id)
                # weightedIntersection preserves the values from any mappings
                # we get, as some indexes don't return simple sets
                if hasattr(rs, 'items') or hasattr(r, 'items'):
                    _, rs = weightedIntersection(rs, r)
                else:
                    rs = intersection(rs, r)

                cr.stop_split(intersect_id)

                # consider the time it takes to intersect the index result
                # with the total result set to be part of the index time
                cr.stop_split(i, result=r, limit=limit_result)
                if not rs:
                    break
            else:
                cr.stop_split(i, result=None, limit=limit_result)

        # Try to deduce the sort limit from batching arguments
        b_start = int(query.get('b_start', 0))
        b_size = query.get('b_size', None)
        if b_size is not None:
            b_size = int(b_size)

        if b_size is not None:
            limit = b_start + b_size
        elif limit and b_size is None:
            b_size = limit

        if sort_index is None:
            sort_report_name = None
        else:
            if isinstance(sort_index, list):
                sort_name = '-'.join(i.getId() for i in sort_index)
            else:
                sort_name = sort_index.getId()
            if isinstance(reverse, list):
                reverse_name = '-'.join(
                    'desc' if r else 'asc' for r in reverse)
            else:
                reverse_name = 'desc' if reverse else 'asc'
            sort_report_name = 'sort_on#' + sort_name + '#' + reverse_name
            if limit is not None:
                sort_report_name += '#limit-%s' % limit

        if rs is None:
            # None of the indexes found anything to do with the query
            # We take this to mean that the query was empty (an empty filter)
            # and so we return everything in the catalog
            warnings.warn('Your query %s produced no query restriction. '
                          'Currently the entire catalog content is returned. '
                          'In Zope 4 this will result in an empty LazyCat '
                          'to be returned.' % repr(cr.make_key(query)),
                          DeprecationWarning, stacklevel=3)

            rlen = len(self)
            if sort_index is None:
                sequence, slen = self._limit_sequence(self.data.items(), rlen,
                    b_start, b_size)
                result = LazyMap(self.instantiate, sequence, slen,
                    actual_result_count=rlen)
            else:
                cr.start_split(sort_report_name)
                result = self.sortResults(
                    self.data, sort_index, reverse, limit, merge,
                        actual_result_count=rlen, b_start=b_start,
                        b_size=b_size)
                cr.stop_split(sort_report_name, None)
        elif rs:
            # We got some results from the indexes.
            # Sort and convert to sequences.
            # XXX: The check for 'values' is really stupid since we call
            # items() and *not* values()
            rlen = len(rs)
            if sort_index is None and hasattr(rs, 'items'):
                # having a 'items' means we have a data structure with
                # scores.  Build a new result set, sort it by score, reverse
                # it, compute the normalized score, and Lazify it.

                if not merge:
                    # Don't bother to sort here, return a list of
                    # three tuples to be passed later to mergeResults
                    # note that data_record_normalized_score_ cannot be
                    # calculated and will always be 1 in this case
                    getitem = self.__getitem__
                    result = [(score, (1, score, rid), getitem)
                            for rid, score in rs.items()]
                else:
                    cr.start_split('sort_on#score')

                    # sort it by score
                    rs = rs.byValue(0)
                    max = float(rs[0][0])

                    # Here we define our getter function inline so that
                    # we can conveniently store the max value as a default arg
                    # and make the normalized score computation lazy
                    def getScoredResult(item, max=max, self=self):
                        """
                        Returns instances of self._v_brains, or whatever is
                        passed into self.useBrains.
                        """
                        score, key = item
                        data = self.data[key]
                        klass = self._v_result_class
                        schema_len = len(klass.__record_schema__)
                        norm_score = int(100.0 * score / max)
                        if schema_len == len(data) + 3:
                            r = klass(tuple(data) + (key, score, norm_score))
                        else:
                            r = klass(data)
                            r.data_record_id_ = key
                            r.data_record_score_ = score
                            r.data_record_normalized_score_ = norm_score
                        r = r.__of__(aq_parent(self))
                        return r

                    sequence, slen = self._limit_sequence(rs, rlen, b_start,
                        b_size)
                    result = LazyMap(getScoredResult, sequence, slen,
                        actual_result_count=rlen)
                    cr.stop_split('sort_on#score', None)

            elif sort_index is None and not hasattr(rs, 'values'):
                # no scores
                if hasattr(rs, 'keys'):
                    rs = rs.keys()
                sequence, slen = self._limit_sequence(rs, rlen, b_start,
                    b_size)
                result = LazyMap(self.__getitem__, sequence, slen,
                    actual_result_count=rlen)
            else:
                # sort.  If there are scores, then this block is not
                # reached, therefore 'sort-on' does not happen in the
                # context of a text index query.  This should probably
                # sort by relevance first, then the 'sort-on' attribute.
                cr.start_split(sort_report_name)
                result = self.sortResults(rs, sort_index, reverse, limit,
                    merge, actual_result_count=rlen, b_start=b_start,
                    b_size=b_size)
                cr.stop_split(sort_report_name, None)
        else:
            # Empty result set
            result = LazyCat([])
        cr.stop()
        return result
Example #13
0
 def __and__(self, x):
     return self.__class__(
         weightedIntersection(self._dict, x._dict)[1],
         union(self._words, x._words),
         self._index,
         )
Example #14
0
def getClusters(catalog_tool, filters):
    # the objects are searched for in the tile limits (to get the same clusters every time)
    grid_size = 12 # geopoints' and clusters' density on map / also depends on map frame size

    # unpack map limits
    if filters:
        lat_min = float(filters[0]['geo_latitude']['query'][0])
        lat_max = float(filters[0]['geo_latitude']['query'][1])

        lon_min = float(filters[0]['geo_longitude']['query'][0])
        lon_max = float(filters[0]['geo_longitude']['query'][1])
    else: # this should not happen
        return [], []

    tlat_min, tlat_max, tlon_min, tlon_max = clusters.get_discretized_limits(lat_min, lat_max, lon_min, lon_max, grid_size)

    catalog = catalog_tool._catalog

    # getting the inner indexes for lat and lon
    lat_index = catalog.getIndex('geo_latitude')._index
    lon_index = catalog.getIndex('geo_longitude')._index

    # adjust to cover results outside frame, but very close to margins
    # trying to fix cluster flickering near margins

    # applying the lat and lon indexes to get the rids
    rs = None
    lat_set, lat_dict = _apply_index_with_range_dict_results(lat_index, Decimal(str(tlat_min)), Decimal(str(tlat_max)))
    w, rs = weightedIntersection(rs, lat_set)

    lon_set, lon_dict = _apply_index_with_range_dict_results(lon_index, Decimal(str(tlon_min)), Decimal(str(tlon_max)))
    w, rs = weightedIntersection(rs, lon_set)

    rs_final = None
    # OR the filters and apply the index for each one
    for f in filters:
        rs_f = rs

        #adjust geo limits in filters to be consistent with discretized tile limits
        f['geo_longitude']['query'] = (Decimal(str(tlon_min)), Decimal(str(tlon_max)))
        f['geo_latitude']['query'] = (Decimal(str(tlat_min)), Decimal(str(tlat_max)))

        #this code is from the search function in the catalog implementation in Zope
        for i in catalog.indexes.keys():
            index = catalog.getIndex(i)
            _apply_index = getattr(index, "_apply_index", None)
            if _apply_index is None:
                continue
            r = _apply_index(f)

            if r is not None:
                r, u = r
                w, rs_f = weightedIntersection(rs_f, r)

        w, rs_final = weightedUnion(rs_f, rs_final)

    r_list = list(rs_final)

    # transform objects to points
    points = []
    for i in range(len(r_list)):
        points.append(clusters.Point(i, float(lat_dict[r_list[i]]), float(lon_dict[r_list[i]])))

    centers, groups = clusters.kmeans(tlat_min, tlat_max, tlon_min, tlon_max, points, grid_size)

    # transform group points to rids
    for i in range(len(groups)):
        groups[i] = map(lambda p: r_list[p.id], groups[i])

    return centers, groups
def search(self, request, sort_index=None, reverse=0, limit=None, merge=1):
    advancedtypes = tuple(ADVANCEDTYPES)
    rs = None # resultset

    # Note that if the indexes find query arguments, but the end result
    # is an empty sequence, we do nothing

    prioritymap = getattr(self, '_v_prioritymap', None)
    if prioritymap is None:
        identifier = '/'.join(self.getPhysicalPath())
        if DEFAULT_PRIORITYMAP is not None:
            default = DEFAULT_PRIORITYMAP.get(identifier, None)
            logger.info('initializing priority map for %r from default (thread %s)',
                identifier, currentThread().getName())
            if default is not None:
                prioritymap = self._v_prioritymap = default.copy()
            else:
                prioritymap = self._v_prioritymap = {}
            valueidentifier = identifier + ':valueindexes'
            valuedefault = DEFAULT_PRIORITYMAP.get(valueidentifier, None)
            if valuedefault is not None:
                self._v_valueindexes = valuedefault.copy()
        else:
            logger.info('initializing empty priority map for %r (thread %s)',
                identifier, currentThread().getName())
            prioritymap = self._v_prioritymap = {}

    valueindexes = getattr(self, '_v_valueindexes', None)
    if valueindexes is None:
        valueindexes = self._v_valueindexes = determine_value_indexes(self)

    existing_indexes = self.indexes.keys()

    # What follows is a bit of a mess, but the ZCatalog API supports passing
    # in query restrictions in almost arbitary ways
    if isinstance(request, dict):
        keydict = request.copy()
    else:
        keydict = {}
        keydict.update(request.keywords)
        real_req = request.request
        if isinstance(real_req, dict):
            keydict.update(real_req)
        known_keys = keydict.keys()
        # The request has too many places where an index restriction might be
        # specified. Putting all of request.form, request.other, ... into the
        # key isn't what we want either, so we iterate over all known indexes
        # instead and see if they are in the request.
        for iid in existing_indexes:
            if iid in known_keys:
                continue
            value = real_req.get(iid)
            if value:
                keydict[iid] = value

    key = keys = keydict.keys()
    values = [name for name in keys if name in valueindexes]
    if values:
        # If we have indexes whose values should be considered, we first
        # preserve all normal indexes and then add the keys whose values
        # matter including their value into the key
        key = [name for name in keys if name not in values]
        for name in values:
            # We need to make sure the key is immutable, repr() is an easy way
            # to do this without imposing restrictions on the types of values
            key.append((name, repr(keydict.get(name, ''))))

    key = tuple(sorted(key))
    indexes = prioritymap.get(key, [])
    start = time()
    index_times = {}

    if not indexes:
        pri = []
        for i in existing_indexes:
            if i not in keys:
                # Do not ask indexes to restrict the result, which aren't part
                # of the query
                continue
            index = self.getIndex(i)
            _apply_index = getattr(index, "_apply_index", None)
            if _apply_index is None:
                continue
            r = _apply_index(request)

            result_len = 0
            if r is not None:
                r, u = r
                result_len = len(r)
                w, rs = weightedIntersection(rs, r)
            pri.append((isinstance(index, advancedtypes), result_len, i))

        pri.sort()
        prioritymap[key] = [p[-1] for p in pri]

    else:
        for i in indexes:
            index = self.getIndex(i)
            _apply_index = getattr(index, "_apply_index", None)
            if _apply_index is None:
                continue
            index_times[i] = time()
            if isinstance(index, advancedtypes):
                r = _apply_index(request, res=rs)
            else:
                r = _apply_index(request)
            index_times[i] = time() - index_times[i]

            if r is not None:
                # Short circuit if empty result
                r, u = r
                if not r:
                    return LazyCat([])
                if rs is None:
                    rs = r
                # Because weightedIntersection isn't optimized we only use it if necessary
                elif isinstance(rs, (IIBucket, IIBTree)) or isinstance(r, (IIBucket, IIBTree)):
                    _i = '%s_weightedIntersection'%i
                    index_times[_i] = time()
                    w, rs = weightedIntersection(rs, r)
                    index_times[_i] = time() - index_times[_i]
                else:
                    _i = '%s_intersection'%i
                    index_times[_i] = time()
                    rs = intersection(rs, r)
                    index_times[_i] = time() - index_times[_i]
    duration =  time() - start
    if LOG_SLOW_QUERIES and duration >= LONG_QUERY_TIME:
        detailed_times = []
        for i, t in index_times.items():
            detailed_times.append("%s : %3.2fms" % (i, t*1000))
        info = 'query: %3.2fms, priority: %s, key: %s' % (duration*1000, indexes, key)
        if detailed_times:
            info += ', detailed: %s' % (', '.join(detailed_times))
        logger.info(info)

    # Try to deduce the sort limit from batching arguments
    b_start = int(keydict.get('b_start', 0))
    b_size = keydict.get('b_size', None)
    if b_size is not None:
        b_size = int(b_size)

    if b_size is not None:
        limit = b_start + b_size
    elif limit and b_size is None:
        b_size = limit

    if rs is None:
        # None of the indexes found anything to do with the request
        # We take this to mean that the query was empty (an empty filter)
        # and so we return everything in the catalog
        rlen = len(self)
        if sort_index is None:
            sequence, slen = self._limit_sequence(self.data.items(), rlen,
                b_start, b_size)
            result = LazyMap(self.instantiate, sequence, slen,
                actual_result_count=rlen)
        else:
            result = self.sortResults(
                self.data, sort_index, reverse, limit, merge,
                    actual_result_count=rlen, b_start=b_start,
                    b_size=b_size)
            return result
    elif rs:
        # We got some results from the indexes.
        # Sort and convert to sequences.
        # XXX: The check for 'values' is really stupid since we call
        # items() and *not* values()
        rlen = len(rs)
        if sort_index is None and hasattr(rs, 'values'):
            # having a 'values' means we have a data structure with
            # scores.  Build a new result set, sort it by score, reverse
            # it, compute the normalized score, and Lazify it.

            if not merge:
                # Don't bother to sort here, return a list of
                # three tuples to be passed later to mergeResults
                # note that data_record_normalized_score_ cannot be
                # calculated and will always be 1 in this case
                getitem = self.__getitem__
                return [(score, (1, score, rid), getitem)
                        for rid, score in rs.items()]

            rs = rs.byValue(0) # sort it by score
            max = float(rs[0][0])

            # Here we define our getter function inline so that
            # we can conveniently store the max value as a default arg
            # and make the normalized score computation lazy
            def getScoredResult(item, max=max, self=self):
                """
                Returns instances of self._v_brains, or whatever is passed
                into self.useBrains.
                """
                score, key = item
                r=self._v_result_class(self.data[key])\
                      .__of__(self.aq_parent)
                r.data_record_id_ = key
                r.data_record_score_ = score
                r.data_record_normalized_score_ = int(100. * score / max)
                return r

            sequence, slen = self._limit_sequence(rs, rlen, b_start,
                b_size)
            result = LazyMap(getScoredResult, sequence, slen,
                actual_result_count=rlen)

        elif sort_index is None and not hasattr(rs, 'values'):
            # no scores
            if hasattr(rs, 'keys'):
                rs = rs.keys()
            sequence, slen = self._limit_sequence(rs, rlen, b_start,
                b_size)
            result = LazyMap(self.__getitem__, sequence, slen,
                actual_result_count=rlen)
        else:
            # sort.  If there are scores, then this block is not
            # reached, therefore 'sort-on' does not happen in the
            # context of a text index query.  This should probably
            # sort by relevance first, then the 'sort-on' attribute.
            result = self.sortResults(rs, sort_index, reverse, limit,
                merge, actual_result_count=rlen, b_start=b_start,
                b_size=b_size)
    else:
        # Empty result set
        return LazyCat([])
    return result
Example #16
0
    def search(self, request, sort_index=None, reverse=0, limit=None, merge=1):
        """Iterate through the indexes, applying the query to each one. If
        merge is true then return a lazy result set (sorted if appropriate)
        otherwise return the raw (possibly scored) results for later merging.
        Limit is used in conjuntion with sorting or scored results to inform
        the catalog how many results you are really interested in. The catalog
        can then use optimizations to save time and memory. The number of
        results is not guaranteed to fall within the limit however, you should
        still slice or batch the results as usual."""

        rs = None # resultset

        # Indexes fulfill a fairly large contract here. We hand each
        # index the request mapping we are given (which may be composed
        # of some combination of web request, kw mappings or plain old dicts)
        # and the index decides what to do with it. If the index finds work
        # for itself in the request, it returns the results and a tuple of
        # the attributes that were used. If the index finds nothing for it
        # to do then it returns None.

        # For hysterical reasons, if all indexes return None for a given
        # request (and no attributes were used) then we append all results
        # in the Catalog. This generally happens when the search values
        # in request are all empty strings or do not coorespond to any of
        # the indexes.

        # Note that if the indexes find query arguments, but the end result
        # is an empty sequence, we do nothing

        for i in self.indexes.keys():
            index = self.getIndex(i)
            _apply_index = getattr(index, "_apply_index", None)
            if _apply_index is None:
                continue
            r = _apply_index(request)

            if r is not None:
                r, u = r
                w, rs = weightedIntersection(rs, r)
        
        if rs is None:
            # None of the indexes found anything to do with the request
            # We take this to mean that the query was empty (an empty filter)
            # and so we return everything in the catalog
            if sort_index is None:
                return LazyMap(self.instantiate, self.data.items(), len(self))
            else:
                return self.sortResults(
                    self.data, sort_index, reverse,  limit, merge)
        elif rs:
            # We got some results from the indexes.
            # Sort and convert to sequences.
            # XXX: The check for 'values' is really stupid since we call
            # items() and *not* values()
            if sort_index is None and hasattr(rs, 'values'):
                # having a 'values' means we have a data structure with
                # scores.  Build a new result set, sort it by score, reverse
                # it, compute the normalized score, and Lazify it.
                                
                if not merge:
                    # Don't bother to sort here, return a list of 
                    # three tuples to be passed later to mergeResults
                    # note that data_record_normalized_score_ cannot be
                    # calculated and will always be 1 in this case
                    getitem = self.__getitem__
                    return [(score, (1, score, rid), getitem) 
                            for rid, score in rs.items()]
                
                rs = rs.byValue(0) # sort it by score
                max = float(rs[0][0])

                # Here we define our getter function inline so that
                # we can conveniently store the max value as a default arg
                # and make the normalized score computation lazy
                def getScoredResult(item, max=max, self=self):
                    """
                    Returns instances of self._v_brains, or whatever is passed
                    into self.useBrains.
                    """
                    score, key = item
                    r=self._v_result_class(self.data[key])\
                          .__of__(self.aq_parent)
                    r.data_record_id_ = key
                    r.data_record_score_ = score
                    r.data_record_normalized_score_ = int(100. * score / max)
                    return r
                
                return LazyMap(getScoredResult, rs, len(rs))

            elif sort_index is None and not hasattr(rs, 'values'):
                # no scores
                if hasattr(rs, 'keys'):
                    rs = rs.keys()
                return LazyMap(self.__getitem__, rs, len(rs))
            else:
                # sort.  If there are scores, then this block is not
                # reached, therefore 'sort-on' does not happen in the
                # context of a text index query.  This should probably
                # sort by relevance first, then the 'sort-on' attribute.
                return self.sortResults(rs, sort_index, reverse, limit, merge)
        else:
            # Empty result set
            return LazyCat([])
Example #17
0
    def count(self, brains, sequence=None):
        """ Intersect results
        """
        res = {}
        # by checking for facet_counts we assume this is a SolrResponse
        # from collective.solr
        if hasattr(brains, 'facet_counts'):
            facet_fields = brains.facet_counts.get('facet_fields')
            if facet_fields:
                index_id = self.data.get('index')
                facet_field = facet_fields.get(index_id, {})
                for value, num in facet_field.items():
                    normalized_value = atdx_normalize(value)
                    if isinstance(value, unicode):
                        res[value] = num
                    elif isinstance(normalized_value, unicode):
                        res[normalized_value] = num
                    else:
                        unicode_value = value.decode('utf-8')
                        res[unicode_value] = num
            else:
                # no facet counts were returned. we exit anyway because
                # zcatalog methods throw an error on solr responses
                return res
            res[""] = res['all'] = len(brains)
            return res
        else:
            # this is handled by the zcatalog. see below
            pass

        if not sequence:
            sequence = [key for key, value in self.vocabulary()]

        if not sequence:
            return res

        index_id = self.data.get('index')
        if not index_id:
            return res

        ctool = getToolByName(self.context, 'portal_catalog')
        index = ctool._catalog.getIndex(index_id)
        ctool = queryUtility(IFacetedCatalog)
        if not ctool:
            return res

        if isinstance(brains, LazyMap):
            values = brains._seq
            # 75384 seq might be a pair of tuples instead of ints
            # if you upgrade to ZCatalog 3
            if isinstance(values[0], tuple):
                values = [v[1] for v in values]
            brains = IISet(values)
        else:
            brains = IISet(brain.getRID() for brain in brains)

        res[""] = res['all'] = len(brains)
        for value in sequence:
            if not value:
                res[value] = len(brains)
                continue
            normalized_value = atdx_normalize(value)
            rset = ctool.apply_index(self.context, index, normalized_value)[0]
            rset = IISet(rset)
            rset = weightedIntersection(brains, rset)[1]
            if isinstance(value, unicode):
                res[value] = len(rset)
            elif isinstance(normalized_value, unicode):
                res[normalized_value] = len(rset)
            else:
                unicode_value = value.decode('utf-8')
                res[unicode_value] = len(rset)
        return res