Esempio n. 1
0
    def test_heavy_end(self):
        """Run ``self.timing`` on a small, high-valued IISet and other sets.

        The small set always holds the ``SMALLSETSIZE`` integers just
        below ``BIGSETSIZE``.  It is paired in turn with a small
        IITreeSet, a large IITreeSet and a large IISet, and every pair
        is passed to ``self.timing`` in both argument orders.
        """
        upper = BIGSETSIZE
        width = SMALLSETSIZE

        # Round 1: counterpart is a tree set of the first `width` integers.
        high_values = IISet(xrange(upper - width, upper))
        counterpart = IITreeSet(xrange(width))
        self.timing(
            high_values, counterpart,
            'Intersection small set high values + small treeset')
        self.timing(
            counterpart, high_values,
            'Intersection small treeset + small set high values')

        # Round 2: counterpart is a tree set of the first `upper` integers.
        high_values = IISet(xrange(upper - width, upper))
        counterpart = IITreeSet(xrange(upper))
        self.timing(
            high_values, counterpart,
            'Intersection small set high values + large treeset')
        self.timing(
            counterpart, high_values,
            'Intersection large treeset + small set high values')

        # Round 3: counterpart is a plain set of the first `upper` integers.
        high_values = IISet(xrange(upper - width, upper))
        counterpart = IISet(xrange(upper))
        self.timing(
            high_values, counterpart,
            'Intersection small set high values + large set')
        self.timing(
            counterpart, high_values,
            '\nIntersection large set + small set high values')
Esempio n. 2
0
    def testLargerInputs(self):
        """Check set operations against their simulators on random inputs.

        Runs three rounds.  Each round draws two random key lists (up to
        200 keys, values 1..400), builds one set per factory in
        ``self.builders`` for each list, and verifies that union,
        intersection and difference of every A/B pairing agree with the
        corresponding ``self._*`` simulator applied to IISets of the keys.
        """
        from random import randint
        max_size = 200
        max_value = 400
        operations = (
            (self.union, self._union),
            (self.intersection, self._intersection),
            (self.difference, self._difference),
        )
        for _ in range(3):
            count = randint(0, max_size)
            a_values = [randint(1, max_value) for _ in range(count)]
            a_sets = [build(a_values) for build in self.builders]
            a_keys = IISet(a_values)

            count = randint(0, max_size)
            b_values = [randint(1, max_value) for _ in range(count)]
            b_sets = [build(b_values) for build in self.builders]
            b_keys = IISet(b_values)

            for operation, simulate in operations:
                for a_set in a_sets:
                    for b_set in b_sets:
                        actual = operation(a_set, b_set)
                        expected = simulate(a_keys, b_keys)
                        # The message tuple carries everything needed to
                        # reproduce a failure from the random inputs.
                        self.assertEqual(
                            list(actual), expected,
                            (a_set, b_set, a_keys, b_keys,
                             list(actual), expected))
Esempio n. 3
0
    def register(self, hrefs, referer, timestamp):
        """Add or update link presence information.

        If a link has not been checked since the provided timestamp
        (or if it is not in the database at all), it is added to the
        queue.

        ``hrefs`` is an iterable of link targets, ``referer`` is the
        location they were found on and ``timestamp`` is compared with
        the first item of each link's entry in ``self.checked``.

        Returns ``None``.  If the ``ISettings`` records cannot be read
        from the registry, the error is logged and nothing is
        registered.
        """

        # Work with the referer's integer index, storing it on first use.
        referer = self.index.get(referer) or self.store(referer)

        registry = getUtility(IRegistry, context=self.aq_parent)
        try:
            settings = registry.forInterface(ISettings)
        except KeyError as exc:
            # ``Logger.warn`` is a deprecated alias; use ``warning``.
            logger.warning(exc)
            return

        limit = settings.referers

        for href in hrefs:
            if self.should_ignore(href, settings.ignore_list):
                continue

            # If the hyperlink is not already in the work queue,
            # compare the provided timestamp to our database to see if
            # we need to check its validity. Note that internal links
            # are exempt if we're not using the publisher.
            index = self.index.get(href)
            entry = self.checked.get(-1 if index is None else index)

            if index not in self.queue:
                if entry is None or entry[0] < timestamp:
                    if settings.use_publisher or not href.startswith('/'):
                        index = self.enqueue(href)
                    elif href not in self.index:
                        index = self.store(href)

            assert index is not None

            if entry is None:
                # First sighting: no check time or status yet, just the
                # referer.
                self.checked[index] = None, None, IISet((referer, ))
            else:
                # Record the referer unless it is already known or the
                # entry already holds more than ``limit`` referers.
                referers = entry[2]
                if referer not in referers and len(referers) <= limit:
                    referers.add(referer)
Esempio n. 4
0
    def test_depth_limit_resultset(self):
        """Check depth-limited '/' queries when a resultset is passed in."""
        self._populateIndex()
        candidates = IISet([1, 2, 3, 4, 8, 16])

        # (query depth, expected document ids)
        expectations = (
            (1, [1, 8, 16]),
            (2, [1, 2, 8, 16]),
            (3, [1, 2, 3, 8, 16]),
        )

        for depth, expected in expectations:
            query = dict(path=dict(query='/', depth=depth))
            found = self._index._apply_index(query, resultset=candidates)
            # Only the part of the answer inside the resultset is compared.
            narrowed = intersection(found[0], candidates)
            self.assertEqual(list(narrowed), expected)
Esempio n. 5
0
    def _apply_index(self, request, cid=''):
        """Apply the index to the search parameters given in request.

        Returns ``None`` when the request carries no keys for this
        index; otherwise a ``(docids, ('Language', ))`` tuple in which
        ``docids`` is the union of the per-language search results (an
        empty IISet when that union is empty or missing).
        """
        record = parseIndexRequest(request, self.id, self.query_options)
        if record.keys is None:
            return None

        # A 'fallback' attribute on the parsed record overrides the
        # index-wide default.
        use_fallback = self.fallback
        if hasattr(record, 'fallback'):
            use_fallback = bool(record.fallback)

        docids = None
        for language in record.keys:
            docids = ii_union(docids, self._search(language, use_fallback))

        if not docids:
            docids = IISet()
        return docids, ('Language', )
Esempio n. 6
0
    def _search(self, language, fallback=True):
        """Return the document ids indexed for ``language``.

        ``language`` is split into a main tag and a sub tag.  Returns
        ``None`` when the main tag is not indexed at all.  With
        ``fallback`` every sub tag stored under the main tag is
        searched (the requested one first); without it only the
        requested sub tag is looked up.  The result is an IISet built
        from the ``docid`` of every matching entry.
        """
        main, sub = splitLanguage(language)

        if main not in self._index:
            return None

        by_sub = self._index[main]

        if not fallback:
            candidates = [sub]
        else:
            # Sorted order, but with the requested sub tag moved to the
            # front when it is present.
            candidates = sorted(by_sub.keys())
            if sub in candidates:
                candidates.remove(sub)
                candidates.insert(0, sub)

        entries = OOSet()
        for tag in candidates:
            entries = oo_union(entries, by_sub[tag])

        return IISet(entry.docid for entry in entries)
def languageindex_apply_index(self, request, cid='', res=None):
    """Apply the index to the search parameters given in request.

    Same contract as the index's ``_apply_index`` but forwards ``res``
    to every ``self._search`` call.  Returns ``None`` when the request
    carries no keys for this index, otherwise a
    ``(docids, ('Language', ))`` tuple; ``docids`` is an empty IISet
    when the combined result is empty or missing.
    """
    record = parseIndexRequest(request, self.id, self.query_options)
    if record.keys is None:
        return None

    # A 'fallback' attribute on the parsed record overrides the
    # index-wide default.
    use_fallback = self.fallback
    if hasattr(record, 'fallback'):
        use_fallback = bool(record.fallback)

    # TODO: This could be optimized to avoid a loop per language.
    # Most often we get a language code and '' for language neutral, so
    # this could be beneficial. If the site has no language neutral
    # content, the "main not in self._index" check in _search returns
    # None, and the union of None with the result is a cheap call, so we
    # don't care right now.
    docids = None
    for language in record.keys:
        found = self._search(language, use_fallback, res=res)
        docids = ii_union(docids, found)

    if not docids:
        docids = IISet()
    return docids, ('Language', )
Esempio n. 8
0
    def query_index(self, record, resultset=None):
        """See IPluggableIndex.

        o Unpacks record from catalog and maps it onto '_search'.

        o Results for the individual keys are merged with a union when
          the record's operator is 'or' and with an intersection
          otherwise; an empty IISet is returned when nothing matches.
        """
        level = record.get('level', 0)
        combine = union if record.operator == 'or' else intersection

        matches = None
        for key in record.keys:
            matches = combine(matches, self._search(key, level))

        return matches if matches else IISet()
Esempio n. 9
0
    def update(self, href, status):
        """Update link status.

        Does nothing for links that have no index.  A link without an
        entry gets a fresh ``(timestamp, status, IISet())`` record.  An
        existing entry is rewritten only when the status differs from
        the stored one or the stored timestamp is empty.
        """
        checked_at = int(time.mktime(datetime.datetime.now().timetuple()))

        index = self.index.get(href)
        if index is None:
            return

        entry = self.checked.get(index)
        if entry is None:
            self.checked[index] = checked_at, status, IISet()
            return

        previous_status = entry[1]
        if status == previous_status and entry[0]:
            # Same status as before and already checked: nothing to do.
            return

        # If the status was previously good, then we clear the status.
        # What this means is that we'll wait for the next check to
        # declare a bad status (it might be temporary).
        if previous_status == 200:
            status = None

        self.checked[index] = checked_at, status, entry[2]
Esempio n. 10
0
    def _apply_index(self, request, resultset=None):
        """Hook for (Z)Catalog.

        'request' -- mapping type (usually {"path": "..."}).
        The parsed record may additionally carry 'level', 'operator',
        'depth', 'navtree' and 'navtree_start' options, which are
        passed on to search().

        'resultset' is forwarded unchanged to every search() call.

        Returns None when the request holds no keys for this index,
        otherwise a tuple ``(result, (self.id, ))`` where 'result' is
        an empty IISet if nothing matched.
        """

        record = parseIndexRequest(request, self.id, self.query_options)
        # Identity test: None is a singleton and '==' could be hijacked
        # by a custom __eq__ on the keys object.
        if record.keys is None:
            return None

        level = record.get("level", 0)
        operator = record.get('operator', self.useOperator).lower()
        depth = getattr(record, 'depth', -1)  # use getattr to get 0 value
        navtree = record.get('navtree', 0)
        navtree_start = record.get('navtree_start', 0)

        # depending on the operator we use intersection or union
        if operator == "or":
            set_func = union
        else:
            set_func = intersection

        result = None
        for k in record.keys:
            rows = self.search(k,
                               level,
                               depth,
                               navtree,
                               navtree_start,
                               resultset=resultset)
            result = set_func(result, rows)

        if result:
            return (result, (self.id, ))
        else:
            return (IISet(), (self.id, ))
Esempio n. 11
0
    def assignWordId(self, word):
        """Assign a new word id to the provided word, and return it.

        If the word is already in the lexicon its existing id is
        returned.  Otherwise a new id is allocated, recorded in both
        the lexicon and the inverse lexicon, and added to the digram
        map for every digram of the word.
        """

        # Double check it's not in the lexicon already, and if it is, just
        # return it.
        if self._lexicon.has_key(word):
            return self._lexicon[word]

        # Get word id. BBB Backward compat pain.
        inverse = self._inverseLex
        try:
            insert = inverse.insert
        except AttributeError:
            # we have an "old" BTree object
            if inverse:
                wid = inverse.keys()[-1] + 1
            else:
                # Rebind the local as well: otherwise the word would be
                # written into the discarded old object and be missing
                # from the new inverse lexicon.
                inverse = self._inverseLex = IOBTree()
                wid = 1
            inverse[wid] = word
        else:
            # we have a "new" IOBTree object; draw ids until insert()
            # reports success.
            wid = randid()
            while not insert(wid, word):
                wid = randid()

        self._lexicon[word] = wid

        # Now take all the digrams and insert them into the digram map.
        for digram in self.createDigrams(word):
            wids = self._digrams.get(digram, None)
            if wids is None:
                self._digrams[digram] = wids = IISet()
            wids.insert(wid)

        return wid
Esempio n. 12
0
    def query_index(self, record, resultset=None):
        """Search the index with the given IndexQuery object.

        If not `None`, the resultset argument
        indicates that the search result is relevant only on this set,
        i.e. everything outside resultset is of no importance.
        The index can use this information for optimizations.

        The record's keys are replaced by their converted form.  The
        record may carry 'not', 'range' and 'usage' options.  When a
        request cache and cache key are available, 'or' results are
        cached as a set and 'and' results as the list of sets to
        intersect.  Returns the matching set; an empty IISet when
        there is no result.
        """
        index = self._index
        r = None
        opr = None

        # not / exclude parameter
        not_parm = record.get('not', None)

        operator = record.operator

        cachekey = None
        cache = self.getRequestCache()
        if cache is not None:
            cachekey = self.getRequestCacheKey(record)
            if cachekey is not None:
                cached = None
                if operator == 'or':
                    cached = cache.get(cachekey, None)
                else:
                    cached_setlist = cache.get(cachekey, None)
                    if cached_setlist is not None:
                        r = resultset
                        for s in cached_setlist:
                            # the result is bound by the resultset
                            r = intersection(r, s)
                            # If intersection, we can't possibly get a
                            # smaller result
                            if not r:
                                break
                        cached = r

                if cached is not None:
                    if isinstance(cached, int):
                        cached = IISet((cached, ))

                    if not_parm:
                        not_parm = list(map(self._convert, not_parm))
                        exclude = self._apply_not(not_parm, resultset)
                        cached = difference(cached, exclude)

                    return cached

        if not record.keys and not_parm:
            # convert into indexed format
            not_parm = list(map(self._convert, not_parm))
            # we have only a 'not' query
            record.keys = [k for k in index.keys() if k not in not_parm]
        else:
            # convert query arguments into indexed format
            record.keys = list(map(self._convert, record.keys))

        # Range parameter
        range_parm = record.get('range', None)
        if range_parm:
            opr = 'range'
            opr_args = []
            if range_parm.find('min') > -1:
                opr_args.append('min')
            if range_parm.find('max') > -1:
                opr_args.append('max')

        if record.get('usage', None):
            # see if any usage params are sent to field
            opr = record.usage.lower().split(':')
            opr, opr_args = opr[0], opr[1:]

        if opr == 'range':  # range search
            if 'min' in opr_args:
                lo = min(record.keys)
            else:
                lo = None
            if 'max' in opr_args:
                hi = max(record.keys)
            else:
                hi = None
            # Compare against None explicitly: a legitimate upper bound
            # may be falsy (0, '', ...) and must still limit the range.
            if hi is not None:
                setlist = index.values(lo, hi)
            else:
                setlist = index.values(lo)

            # If we only use one key, intersect and return immediately
            if len(setlist) == 1:
                result = setlist[0]
                if isinstance(result, int):
                    result = IISet((result,))

                if cachekey is not None:
                    if operator == 'or':
                        cache[cachekey] = result
                    else:
                        cache[cachekey] = [result]

                if not_parm:
                    exclude = self._apply_not(not_parm, resultset)
                    result = difference(result, exclude)
                return result

            if operator == 'or':
                tmp = []
                for s in setlist:
                    if isinstance(s, int):
                        s = IISet((s,))
                    tmp.append(s)
                r = multiunion(tmp)

                if cachekey is not None:
                    cache[cachekey] = r
            else:
                # For intersection, sort with smallest data set first
                tmp = []
                for s in setlist:
                    if isinstance(s, int):
                        s = IISet((s,))
                    tmp.append(s)
                if len(tmp) > 2:
                    setlist = sorted(tmp, key=len)
                else:
                    setlist = tmp

                # 'r' is not invariant of resultset. Thus, we
                # have to remember 'setlist'
                if cachekey is not None:
                    cache[cachekey] = setlist

                r = resultset
                for s in setlist:
                    # the result is bound by the resultset
                    r = intersection(r, s)
                    # If intersection, we can't possibly get a smaller result
                    if not r:
                        break

        else:  # not a range search
            # Filter duplicates
            setlist = []
            for k in record.keys:
                if k is None:
                    # Prevent None from being looked up. None doesn't
                    # have a valid ordering definition compared to any
                    # other object. BTrees 4.0+ will throw a TypeError
                    # "object has default comparison".
                    continue
                try:
                    s = index.get(k, None)
                except TypeError:
                    # key is not valid for this Btree so the value is None
                    LOG.error(
                        '%(context)s: query_index tried '
                        'to look up key %(key)r from index %(index)r '
                        'but key was of the wrong type.', dict(
                            context=self.__class__.__name__,
                            key=k,
                            index=self.id,
                        )
                    )
                    s = None
                # If None, try to bail early
                if s is None:
                    if operator == 'or':
                        # If union, we can possibly get a bigger result
                        continue
                    # If intersection, we can't possibly get a smaller result
                    if cachekey is not None:
                        # If operator is 'and', we have to cache a list of
                        # IISet objects
                        cache[cachekey] = [IISet()]
                    return IISet()
                elif isinstance(s, int):
                    s = IISet((s,))
                setlist.append(s)

            # If we only use one key return immediately
            if len(setlist) == 1:
                result = setlist[0]
                if isinstance(result, int):
                    result = IISet((result,))

                if cachekey is not None:
                    if operator == 'or':
                        cache[cachekey] = result
                    else:
                        cache[cachekey] = [result]

                if not_parm:
                    exclude = self._apply_not(not_parm, resultset)
                    result = difference(result, exclude)
                return result

            if operator == 'or':
                # If we already get a small result set passed in, intersecting
                # the various indexes with it and doing the union later is
                # faster than creating a multiunion first.

                if resultset is not None and len(resultset) < 200:
                    smalllist = []
                    for s in setlist:
                        smalllist.append(intersection(resultset, s))
                    r = multiunion(smalllist)

                    # 'r' is not invariant of resultset.  Thus, we
                    # have to remember the union of 'setlist'. But
                    # this is maybe a performance killer. So we do not cache.
                    # if cachekey is not None:
                    #    cache[cachekey] = multiunion(setlist)

                else:
                    r = multiunion(setlist)
                    if cachekey is not None:
                        cache[cachekey] = r
            else:
                # For intersection, sort with smallest data set first
                if len(setlist) > 2:
                    setlist = sorted(setlist, key=len)

                # 'r' is not invariant of resultset. Thus, we
                # have to remember the union of 'setlist'
                if cachekey is not None:
                    cache[cachekey] = setlist

                r = resultset
                for s in setlist:
                    r = intersection(r, s)
                    # If intersection, we can't possibly get a smaller result
                    if not r:
                        break

        if isinstance(r, int):
            r = IISet((r, ))
        if r is None:
            return IISet()
        if not_parm:
            exclude = self._apply_not(not_parm, resultset)
            r = difference(r, exclude)
        return r
Esempio n. 13
0
    def query_index(self, record, resultset=None):
        """Search the index with the given IndexQuery object.

        If the query has a key which matches the 'id' of
        the index instance, one of a few things can happen:

          - if the value is a string, turn the value into
            a single-element sequence, and proceed.

          - if the value is a sequence, return a union search.

          - If the value is a dict and contains a key of the form
            '<index>_operator' this overrides the default method
            ('or') to combine search results. Valid values are 'or'
            and 'and'.

        The record's keys are replaced by their converted form.  The
        record may carry 'not', 'range' and 'usage' options.  When a
        request cache and cache key are available, 'or' results are
        cached as a set and 'and' results as the list of sets to
        intersect.  'resultset', when given, bounds 'and' results.
        Returns the matching set; an empty IISet when there is no
        result.
        """
        index = self._index
        r = None
        opr = None

        # not / exclude parameter
        not_parm = record.get('not', None)

        operator = record.operator

        cachekey = None
        cache = self.getRequestCache()
        if cache is not None:
            cachekey = self.getRequestCacheKey(record)
            if cachekey is not None:
                cached = None
                if operator == 'or':
                    cached = cache.get(cachekey, None)
                else:
                    cached_setlist = cache.get(cachekey, None)
                    if cached_setlist is not None:
                        r = resultset
                        for s in cached_setlist:
                            # the result is bound by the resultset
                            r = intersection(r, s)
                            # If intersection, we can't possibly get a
                            # smaller result
                            if not r:
                                break
                        cached = r

                if cached is not None:
                    if isinstance(cached, int):
                        cached = IISet((cached, ))

                    if not_parm:
                        not_parm = list(map(self._convert, not_parm))
                        exclude = self._apply_not(not_parm, resultset)
                        cached = difference(cached, exclude)

                    return cached

        if not record.keys and not_parm:
            # convert into indexed format
            not_parm = list(map(self._convert, not_parm))
            # we have only a 'not' query
            record.keys = [k for k in index.keys() if k not in not_parm]
        else:
            # convert query arguments into indexed format
            record.keys = list(map(self._convert, record.keys))

        # Range parameter
        range_parm = record.get('range', None)
        if range_parm:
            opr = "range"
            opr_args = []
            if range_parm.find("min") > -1:
                opr_args.append("min")
            if range_parm.find("max") > -1:
                opr_args.append("max")

        if record.get('usage', None):
            # see if any usage params are sent to field
            opr = record.usage.lower().split(':')
            opr, opr_args = opr[0], opr[1:]

        if opr == "range":  # range search
            if 'min' in opr_args:
                lo = min(record.keys)
            else:
                lo = None
            if 'max' in opr_args:
                hi = max(record.keys)
            else:
                hi = None
            # Compare against None explicitly: a legitimate upper bound
            # may be falsy (0, '', ...) and must still limit the range.
            if hi is not None:
                setlist = index.values(lo, hi)
            else:
                setlist = index.values(lo)

            # If we only use one key, intersect and return immediately
            if len(setlist) == 1:
                result = setlist[0]
                if isinstance(result, int):
                    result = IISet((result, ))

                if cachekey is not None:
                    if operator == 'or':
                        cache[cachekey] = result
                    else:
                        cache[cachekey] = [result]

                if not_parm:
                    exclude = self._apply_not(not_parm, resultset)
                    result = difference(result, exclude)
                return result

            if operator == 'or':
                tmp = []
                for s in setlist:
                    if isinstance(s, int):
                        s = IISet((s, ))
                    tmp.append(s)
                r = multiunion(tmp)

                if cachekey is not None:
                    cache[cachekey] = r
            else:
                # For intersection, sort with smallest data set first
                tmp = []
                for s in setlist:
                    if isinstance(s, int):
                        s = IISet((s, ))
                    tmp.append(s)
                if len(tmp) > 2:
                    setlist = sorted(tmp, key=len)
                else:
                    setlist = tmp

                # 'r' is not invariant of resultset. Thus, we
                # have to remember 'setlist'
                if cachekey is not None:
                    cache[cachekey] = setlist

                r = resultset
                for s in setlist:
                    # the result is bound by the resultset
                    r = intersection(r, s)
                    # If intersection, we can't possibly get a smaller result
                    if not r:
                        break

        else:  # not a range search
            # Filter duplicates
            setlist = []
            for k in record.keys:
                if k is None:
                    # Prevent None from being looked up. None doesn't
                    # have a valid ordering definition compared to any
                    # other object. BTrees 4.0+ will throw a TypeError
                    # "object has default comparison".
                    continue
                s = index.get(k, None)
                # If None, try to bail early
                if s is None:
                    if operator == 'or':
                        # If union, we can possibly get a bigger result
                        continue
                    # If intersection, we can't possibly get a smaller result
                    if cachekey is not None:
                        # If operator is 'and', we have to cache a list of
                        # IISet objects
                        cache[cachekey] = [IISet()]
                    return IISet()
                elif isinstance(s, int):
                    s = IISet((s, ))
                setlist.append(s)

            # If we only use one key return immediately
            if len(setlist) == 1:
                result = setlist[0]
                if isinstance(result, int):
                    result = IISet((result, ))

                if cachekey is not None:
                    if operator == 'or':
                        cache[cachekey] = result
                    else:
                        cache[cachekey] = [result]

                if not_parm:
                    exclude = self._apply_not(not_parm, resultset)
                    result = difference(result, exclude)
                return result

            if operator == 'or':
                # If we already get a small result set passed in, intersecting
                # the various indexes with it and doing the union later is
                # faster than creating a multiunion first.

                if resultset is not None and len(resultset) < 200:
                    smalllist = []
                    for s in setlist:
                        smalllist.append(intersection(resultset, s))
                    r = multiunion(smalllist)

                    # 'r' is not invariant of resultset.  Thus, we
                    # have to remember the union of 'setlist'. But
                    # this is maybe a performance killer. So we do not cache.
                    # if cachekey is not None:
                    #    cache[cachekey] = multiunion(setlist)

                else:
                    r = multiunion(setlist)
                    if cachekey is not None:
                        cache[cachekey] = r
            else:
                # For intersection, sort with smallest data set first
                if len(setlist) > 2:
                    setlist = sorted(setlist, key=len)

                # 'r' is not invariant of resultset. Thus, we
                # have to remember the union of 'setlist'
                if cachekey is not None:
                    cache[cachekey] = setlist

                r = resultset
                for s in setlist:
                    r = intersection(r, s)
                    # If intersection, we can't possibly get a smaller result
                    if not r:
                        break

        if isinstance(r, int):
            r = IISet((r, ))
        if r is None:
            return IISet()
        if not_parm:
            exclude = self._apply_not(not_parm, resultset)
            r = difference(r, exclude)
        return r
Esempio n. 14
0
    def test_search_inputresult(self):
        """Query the index for both truth values under various resultsets.

        Documents 1 and 3 are indexed as True and document 2 as False;
        every query result is expected to stay within the resultset
        that was passed in.
        """
        index = self._makeOne()
        for docid, truth in ((1, True), (2, False), (3, True)):
            obj = Dummy(docid, truth)
            index._index_object(obj.id, obj, attr='truth')

        # The less common value is indexed
        self.assertEqual(index._index_value, 0)

        # An empty resultset yields nothing for either value; these two
        # queries also verify the reported index id.
        for truth in (True, False):
            res, idx = index._apply_index(
                {'truth': truth}, resultset=IISet([]))
            self.assertEqual(idx, ('truth', ))
            self.assertEqual(list(res), [])

        # (resultset ids, expected for True, expected for False)
        cases = (
            ([1], [1], []),
            ([2], [], [2]),
            ([1, 2], [1], [2]),
            ([1, 3], [1, 3], []),
            ([1, 2, 3], [1, 3], [2]),
            ([1, 2, 99], [1], [2]),
        )
        for docids, want_true, want_false in cases:
            res, idx = index._apply_index(
                {'truth': True}, resultset=IISet(docids))
            self.assertEqual(list(res), want_true)

            res, idx = index._apply_index(
                {'truth': False}, resultset=IISet(docids))
            self.assertEqual(list(res), want_false)
Esempio n. 15
0
    def _apply_index(self, request, resultset=None):
        """Apply the index to query parameters given in the argument

        Normalize the 'query' arguments into integer values at minute
        precision before querying.

        Returns None when the request holds no keys for this index,
        otherwise a tuple ``(result, (self.id, ))`` where 'result' is
        an empty IISet if nothing matched.  Raises RuntimeError when
        the requested operator is not one of ``self.operators``.
        """
        record = parseIndexRequest(request, self.id, self.query_options)
        if record.keys is None:
            return None

        # Materialise the converted keys: on Python 3 ``map`` returns a
        # one-shot iterator, and a 'min:max' range query consumes the
        # keys twice (min() and then max()).
        keys = list(map(self._convert, record.keys))

        index = self._index
        r = None
        opr = None

        # experimental code for specifying the operator
        operator = record.get('operator', self.useOperator)
        if operator not in self.operators:
            raise RuntimeError("operator not valid: %s" % operator)

        # depending on the operator we use intersection or union
        if operator == "or":
            set_func = union
        else:
            set_func = intersection

        # range parameter
        range_arg = record.get('range', None)
        if range_arg:
            opr = "range"
            opr_args = []
            if range_arg.find("min") > -1:
                opr_args.append("min")
            if range_arg.find("max") > -1:
                opr_args.append("max")

        if record.get('usage', None):
            # see if any usage params are sent to field
            opr = record.usage.lower().split(':')
            opr, opr_args = opr[0], opr[1:]

        if opr == "range":  # range search
            if 'min' in opr_args:
                lo = min(keys)
            else:
                lo = None

            if 'max' in opr_args:
                hi = max(keys)
            else:
                hi = None

            # Compare against None explicitly: an upper bound of 0 is
            # falsy but must still limit the range.
            if hi is not None:
                setlist = index.values(lo, hi)
            else:
                setlist = index.values(lo)

            r = multiunion(setlist)

        else:  # not a range search
            for key in keys:
                docids = index.get(key, None)
                if docids is not None:
                    if isinstance(docids, int):
                        # a single document id is stored as a bare int
                        docids = IISet((docids, ))
                    else:
                        # set can't be bigger than resultset
                        docids = intersection(docids, resultset)
                    r = set_func(r, docids)

        if isinstance(r, int):
            r = IISet((r, ))

        if r is None:
            return IISet(), (self.id, )
        else:
            return r, (self.id, )
Esempio n. 16
0
    def sortResults(self,
                    rs,
                    sort_index,
                    reverse=False,
                    limit=None,
                    merge=True,
                    actual_result_count=None,
                    b_start=0,
                    b_size=None):
        # Sort a result set using one or more sort indexes. Both sort_index
        # and reverse can be lists of indexes and reverse specifications.
        # Return a lazy result set in sorted order if merge is true otherwise
        # returns a list of (sortkey, uid, getter_function) tuples, where
        # sortkey can be a tuple on its own.
        #
        # Arguments:
        #   rs -- the result set to sort; when it has a ``keys`` method the
        #       result of calling it is used instead.
        #   sort_index -- one sort index or a list of them (primary first).
        #       Each must provide ``documentToKeyMap()``; the primary one
        #       must also support ``len()`` and ``items()``.
        #   reverse -- one flag, or a list of flags (one per sort index).
        #   limit -- maximum number of results wanted, or None.
        #   merge -- see above.
        #   actual_result_count -- used instead of ``len(rs)`` when given.
        #   b_start, b_size -- offset and size of the wanted batch.
        second_indexes = None
        second_indexes_key_map = None
        sort_index_length = 1
        if isinstance(sort_index, list):
            sort_index_length = len(sort_index)
            if sort_index_length > 1:
                second_indexes = sort_index[1:]
                second_indexes_key_map = [
                    si.documentToKeyMap() for si in second_indexes]
            sort_index = sort_index[0]
        _self__getitem__ = self.__getitem__
        index_key_map = sort_index.documentToKeyMap()
        result = []
        r_append = result.append
        r_insert = result.insert
        if hasattr(rs, 'keys'):
            rs = rs.keys()
        if actual_result_count is None:
            rlen = len(rs)
            actual_result_count = rlen
        else:
            rlen = actual_result_count

        # don't limit to more than what we have
        if limit is not None and limit >= rlen:
            limit = rlen

        # if we want a batch from the end of the result set, reverse sorting
        # order and limit it, then reverse the result set again
        switched_reverse = False
        if b_size and b_start and b_start > rlen / 2:
            if isinstance(reverse, list):
                reverse = [not r for r in reverse]
            else:
                reverse = not reverse
            switched_reverse = True
            b_end = b_start + b_size
            if b_end >= rlen:
                overrun = rlen - b_end
                if b_start >= rlen:
                    # bail out, we are outside the possible range
                    return LazyCat([], 0, actual_result_count)
                else:
                    b_size += overrun
                b_start = 0
            else:
                b_start = rlen - b_end
            limit = b_start + b_size

        # determine sort_spec
        if isinstance(reverse, list):
            sort_spec = [-1 if r else 1 for r in reverse]
            # limit to current maximum of sort indexes
            sort_spec = sort_spec[:sort_index_length]
            # use first sort order for choosing the algorithm
            first_reverse = reverse[0]
        else:
            sort_spec = [-1 if reverse else 1] * sort_index_length
            first_reverse = reverse

        if merge and (rlen > (len(sort_index) * (rlen / 100 + 1))):
            # The result set is much larger than the sorted index,
            # so iterate over the sorted index for speed.
            # TODO: len(sort_index) isn't actually what we want for a keyword
            # index, as it's only the unique values, not the documents.
            length = 0
            try:
                intersection(rs, IISet(()))
            except TypeError:
                # rs is not an object in the IIBTree family.
                # Try to turn rs into an IISet.
                rs = IISet(rs)

            if sort_index_length == 1:
                for k, intset in sort_index.items():
                    # We have an index that has a set of values for
                    # each sort key, so we intersect with each set and
                    # get a sorted sequence of the intersections.
                    intset = intersection(rs, intset)
                    if intset:
                        keys = getattr(intset, 'keys', None)
                        if keys is not None:
                            # Is this ever true?
                            intset = keys()
                        length += len(intset)
                        r_append((k, intset, _self__getitem__))
                # ``reverse`` may be a list even with a single sort index;
                # list.sort() only accepts a flag, so use the first entry.
                result.sort(reverse=first_reverse)
            else:
                for k, intset in sort_index.items():
                    # We have an index that has a set of values for
                    # each sort key, so we intersect with each set and
                    # get a sorted sequence of the intersections.
                    intset = intersection(rs, intset)
                    if intset:
                        keys = getattr(intset, 'keys', None)
                        if keys is not None:
                            # Is this ever true?
                            intset = keys()
                        length += len(intset)
                        # sort on secondary index
                        keysets = defaultdict(list)
                        for i in intset:
                            full_key = (k, )
                            for km in second_indexes_key_map:
                                try:
                                    full_key += (km[i], )
                                except KeyError:
                                    pass
                            keysets[full_key].append(i)
                        for k2, v2 in keysets.items():
                            r_append((k2, v2, _self__getitem__))
                result = multisort(result, sort_spec)
            sequence, slen = self._limit_sequence(result, length, b_start,
                                                  b_size, switched_reverse)
            result = LazyCat(LazyValues(sequence), slen, actual_result_count)
        elif limit is None or (limit * 4 > rlen):
            # Iterate over the result set getting sort keys from the index.
            # If we are interested in at least 25% or more of the result set,
            # the N-Best algorithm is slower, so we iterate over all.
            if sort_index_length == 1:
                for did in rs:
                    try:
                        key = index_key_map[did]
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        pass
                    else:
                        # The reference back to __getitem__ is used in case
                        # we do not merge now and need to intermingle the
                        # results with those of other catalogs while avoiding
                        # the cost of instantiating a LazyMap per result
                        r_append((key, did, _self__getitem__))
                if merge:
                    # see above: list.sort() needs a single flag
                    result.sort(reverse=first_reverse)
            else:
                for did in rs:
                    try:
                        full_key = (index_key_map[did], )
                        for km in second_indexes_key_map:
                            full_key += (km[did], )
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        pass
                    else:
                        r_append((full_key, did, _self__getitem__))
                if merge:
                    result = multisort(result, sort_spec)
            if merge:
                if limit is not None:
                    result = result[:limit]
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                                   switched_reverse)
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                                   switched_reverse)
                return sequence
        elif first_reverse:
            # Limit / sort results using N-Best algorithm
            # This is faster for large sets then a full sort
            # And uses far less memory
            keys = []
            k_insert = keys.insert
            n = 0
            worst = None
            if sort_index_length == 1:
                for did in rs:
                    try:
                        key = index_key_map[did]
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        pass
                    else:
                        if n >= limit and key <= worst:
                            continue
                        i = bisect(keys, key)
                        k_insert(i, key)
                        r_insert(i, (key, did, _self__getitem__))
                        if n == limit:
                            del keys[0], result[0]
                        else:
                            n += 1
                        worst = keys[0]
                result.reverse()
            else:
                for did in rs:
                    try:
                        key = index_key_map[did]
                        full_key = (key, )
                        for km in second_indexes_key_map:
                            full_key += (km[did], )
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        pass
                    else:
                        if n >= limit and key <= worst:
                            continue
                        i = bisect(keys, key)
                        k_insert(i, key)
                        r_insert(i, (full_key, did, _self__getitem__))
                        if n == limit:
                            del keys[0], result[0]
                        else:
                            n += 1
                        worst = keys[0]
                result = multisort(result, sort_spec)
            sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                               switched_reverse)
            if merge:
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                return sequence
        else:
            # Limit / sort results using N-Best algorithm in reverse (N-Worst?)
            keys = []
            k_insert = keys.insert
            n = 0
            best = None
            if sort_index_length == 1:
                for did in rs:
                    try:
                        key = index_key_map[did]
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        pass
                    else:
                        if n >= limit and key >= best:
                            continue
                        i = bisect(keys, key)
                        k_insert(i, key)
                        r_insert(i, (key, did, _self__getitem__))
                        if n == limit:
                            del keys[-1], result[-1]
                        else:
                            n += 1
                        best = keys[-1]
            else:
                for did in rs:
                    try:
                        key = index_key_map[did]
                        full_key = (key, )
                        for km in second_indexes_key_map:
                            full_key += (km[did], )
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        pass
                    else:
                        if n >= limit and key >= best:
                            continue
                        i = bisect(keys, key)
                        k_insert(i, key)
                        r_insert(i, (full_key, did, _self__getitem__))
                        if n == limit:
                            del keys[-1], result[-1]
                        else:
                            n += 1
                        best = keys[-1]
                result = multisort(result, sort_spec)
            sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                               switched_reverse)
            if merge:
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                return sequence

        return LazyMap(self.__getitem__,
                       result,
                       len(result),
                       actual_result_count=actual_result_count)
Esempio n. 17
0
def checkCatalog(path, indexes):
    """Perform some consistency checks on a ZCatalog instance.

    ``path`` is traversed from the Zope application root to locate the
    catalog.  ``indexes`` is a collection of index meta types to check;
    when it is empty every index is considered.

    The findings are printed; nothing is returned.  The process exits
    with status 1 when traversing ``path`` raises AttributeError.
    """
    # NOTE: every print below takes a single pre-formatted string so that
    # the statement produces the same output under Python 2 and Python 3.

    root = Zope2.app()

    try:
        catalog = root.unrestrictedTraverse(path)
    except AttributeError:
        print('Error: catalog object not found')
        sys.exit(1)

    # get Catalog instance
    _cat = catalog._catalog

    # check Catalog internal BTrees: all three views must list the same
    # record ids.
    l_data = sorted(_cat.data.keys())
    l_uids = sorted(_cat.uids.values())
    # Read the paths mapping here; reading ``data`` again would make the
    # data/paths comparison below always succeed.
    l_paths = sorted(_cat.paths.keys())

    print("Checking catalog internal BTrees")
    print("\tINFO: Mapping data:  %d entries" % len(l_data))
    print("\tINFO: Mapping uids:  %d entries" % len(l_uids))
    print("\tINFO: Mapping paths: %d entries" % len(l_paths))

    if l_data == l_uids:
        print("\tOK:  Mapping data equals Mapping uids")
    else:
        print("\tERR: Mapping data does not equal Mapping uids")

    if l_data == l_paths:
        print("\tOK:  Mapping data equals Mapping paths")
    else:
        print("\tERR: Mapping data does not equal Mapping paths")

    # Record ids the catalog knows about; the same for every index.
    known_rids = IISet(_cat.data.keys())

    # check BTrees of indexes

    for index_id, idx in _cat.indexes.items():

        if indexes and idx.meta_type not in indexes:
            continue

        print("Checking index '%s' (type: %s)" % (index_id, idx.meta_type))

        if idx.meta_type in ['FieldIndex', 'KeywordIndex']:

            # check forward entries
            RIDS = IISet()
            for key, rids in idx._index.items():
                if isinstance(rids, int):
                    # a single record id is stored as a bare int
                    RIDS.insert(rids)
                else:
                    for rid in rids.keys():
                        RIDS.insert(rid)

            _reportStrayEntries('forward', RIDS, known_rids)

        elif idx.meta_type in ['PathIndex']:

            RIDS = IISet()

            for rids in idx._index.values():
                # only the first value of each entry is inspected
                for rid in rids.values()[0]:
                    RIDS.insert(rid)

            _reportStrayEntries('forward', RIDS, known_rids)

        if idx.meta_type in ['FieldIndex', 'KeywordIndex', 'PathIndex']:

            # check backward entries
            RIDS = IISet(idx._unindex.keys())
            _reportStrayEntries('backward', RIDS, known_rids)


def _reportStrayEntries(direction, rids, known_rids):
    """Print whether ``rids`` holds record ids missing from ``known_rids``.

    ``direction`` ('forward' or 'backward') only selects the wording.
    """
    diff = difference(rids, known_rids)
    if len(diff) != 0:
        print('\tERR: Problem with %s entries' % direction)
        print('\tERR: too much %s entries: %s' % (direction, diff))
    else:
        print('\tOK:  %s entries (%d entries)'
              % (direction.capitalize(), len(rids)))
Esempio n. 18
0
    def search(self,
               path,
               default_level=0,
               depth=-1,
               navtree=0,
               navtree_start=0):
        """
        Return the set of document ids indexed under ``path``.

        path is either a string representing a
        relative URL or a part of a relative URL or
        a tuple (path,level).

        level >= 0  starts searching at the given level
        level <  0  not implemented yet

        ``depth`` greater than zero is rejected with ValueError.  A
        ValueError is also raised when, after the optional virtual root
        and 'zport' components have been dropped, the path does not start
        with 'dmd'.  ``navtree`` and ``navtree_start`` select the
        navigation-tree and breadcrumb variants handled below.
        """

        if isinstance(path, basestring):
            startlevel = default_level
        else:
            startlevel = int(path[1])
            path = path[0]

        absolute_path = isinstance(path, basestring) and path.startswith('/')
        # split into path components, dropping empty ones
        comps = filter(None, path.split('/'))

        # components as originally requested, with a leading '' for the root
        orig_comps = [''] + comps[:]

        if depth > 0:
            raise ValueError("Can't do depth searches anymore")
        if not comps:
            comps = ['dmd']
            startlevel = 1
        else:
            # drop the configured virtual root and 'zport' prefixes
            # NOTE(review): if dropping a prefix leaves ``comps`` empty, the
            # ``comps[0]`` lookups below raise IndexError -- confirm whether
            # such paths can reach this method.
            if comps[0] == getCSEConf().get('virtualroot',
                                            '').replace('/', ''):
                comps = comps[1:]
            if comps[0] == 'zport':
                comps = comps[1:]

        if comps[0] != 'dmd':
            raise ValueError("Depth searches must start with 'dmd'")
        # This overrides whatever start level was derived from the arguments
        # above.
        startlevel = len(comps)

        # NOTE(review): ``comps[0]`` was just read, so ``comps`` cannot be
        # empty here; this branch looks unreachable -- TODO confirm.
        if len(comps) == 0:
            if depth == -1 and not navtree:
                return IISet(self._unindex.keys())

        # Make sure that we get depth = 1 if in navtree mode
        # unless specified otherwise

        orig_depth = depth
        if depth == -1:
            depth = 0 or navtree

        # Optimized navtree starting with absolute path
        if absolute_path and navtree and depth == 1 and default_level == 0:
            set_list = []
            # Insert root element
            if navtree_start >= len(orig_comps):
                navtree_start = 0
            # create a set of parent paths to search
            for i in range(len(orig_comps), navtree_start, -1):
                parent_path = '/'.join(orig_comps[:i])
                parent_path = parent_path and parent_path or '/'
                try:
                    set_list.append(self._index_parents[parent_path])
                except KeyError:
                    pass
            return multiunion(set_list)
        # Optimized breadcrumbs
        elif absolute_path and navtree and depth == 0 and default_level == 0:
            item_list = IISet()
            # Insert root element
            if navtree_start >= len(orig_comps):
                navtree_start = 0
            # create a set of parent paths to search
            for i in range(len(orig_comps), navtree_start, -1):
                parent_path = '/'.join(orig_comps[:i])
                parent_path = parent_path and parent_path or '/'
                try:
                    item_list.insert(self._index_items[parent_path])
                except KeyError:
                    pass
            return item_list
        # Specific object search
        elif absolute_path and orig_depth == 0 and default_level == 0:
            try:
                return IISet([self._index_items[path]])
            except KeyError:
                return IISet()
        # Single depth search
        # NOTE(review): ``depth > 0`` was rejected above, so ``orig_depth``
        # cannot be 1 here; this branch looks unreachable -- TODO confirm.
        elif absolute_path and orig_depth == 1 and default_level == 0:
            # only get objects contained in requested folder
            try:
                return self._index_parents[path]
            except KeyError:
                return IISet()
        # Sitemaps, relative paths, and depth queries
        elif startlevel >= 0:

            pathset = None  # Same as pathindex
            navset = None  # For collecting siblings along the way
            depthset = None  # For limiting depth

            if navtree and depth and \
                   self._index.has_key(None) and \
                   self._index[None].has_key(startlevel):
                navset = self._index[None][startlevel]
            for level in range(startlevel, startlevel + len(comps)):
                if level <= len(comps):
                    # the index is keyed by the joined path prefix, then by
                    # level
                    comp = "/".join(comps[:level])
                    if (not self._index.has_key(comp)
                            or not self._index[comp].has_key(level)):
                        # Navtree is inverse, keep going even for
                        # nonexisting paths
                        if navtree:
                            pathset = IISet()
                        else:
                            return IISet()
                    else:
                        # a hit ends the search right away, so ``pathset``
                        # is only ever the empty set assigned above
                        return self._index[comp][level]
                    if navtree and depth and \
                           self._index.has_key(None) and \
                           self._index[None].has_key(level+depth):
                        navset = union(
                            navset,
                            intersection(pathset,
                                         self._index[None][level + depth]))
                if level - startlevel >= len(comps) or navtree:
                    if (self._index.has_key(None)
                            and self._index[None].has_key(level)):
                        depthset = union(
                            depthset,
                            intersection(pathset, self._index[None][level]))

            if navtree:
                return union(depthset, navset) or IISet()
            elif depth:
                return depthset or IISet()
            else:
                return pathset or IISet()

        # NOTE(review): ``startlevel`` is ``len(comps)`` and therefore never
        # negative, so this fallback looks unreachable -- TODO confirm.
        else:
            results = IISet()
            for level in range(0, self._depth + 1):
                ids = None
                error = 0
                for cn in range(0, len(comps)):
                    comp = comps[cn]
                    try:
                        ids = intersection(ids, self._index[comp][level + cn])
                    except KeyError:
                        error = 1
                if error == 0:
                    results = union(results, ids)
            return results
Esempio n. 19
0
    def _apply_index(self, request, resultset=None):
        """Apply the index to query parameters given in the request arg.

        The request argument should be a mapping object.

        If the request does not have a key which matches the "id" of
        the index instance, then None is returned.

        If the request *does* have a key which matches the "id" of
        the index instance, one of a few things can happen:

          - if the value is a blank string, None is returned (in
            order to support requests from web forms where
            you can't tell a blank string from empty).

          - if the value is a nonblank string, turn the value into
            a single-element sequence, and proceed.

          - if the value is a sequence, return a union search.

          - If the value is a dict and contains a key of the form
            '<index>_operator' this overrides the default method
            ('or') to combine search results. Valid values are "or"
            and "and".

        If None is not returned as a result of the abovementioned
        constraints, two objects are returned.  The first object is a
        ResultSet containing the record numbers of the matching
        records.  The second object is a tuple containing the names of
        all data fields used.

        A 'not' parameter excludes the documents found by
        ``self._apply_not``; a 'range' (or 'usage') parameter turns the
        keys into the bounds of a range query.  ``resultset``, when
        given, bounds intersection ("and") results.

        Raises RuntimeError when the requested operator is not one of
        ``self.operators``.

        FAQ answer:  to search a Field Index for documents that
        have a blank string as their value, wrap the request value
        up in a tuple ala: request = {'id':('',)}
        """
        record = parseIndexRequest(request, self.id, self.query_options)
        if record.keys is None:
            return None

        index = self._index
        r = None
        opr = None
        opr_args = []

        # not / exclude parameter
        not_parm = record.get('not', None)
        if not record.keys and not_parm:
            # convert into indexed format; this must be a real list because
            # it is searched once per index key below
            not_parm = [self._convert(value) for value in not_parm]
            # we have only a 'not' query
            record.keys = [k for k in index.keys() if k not in not_parm]
        else:
            # convert query arguments into indexed format; a real list,
            # because a range query walks the keys with min() and max()
            record.keys = [self._convert(key) for key in record.keys]

        # experimental code for specifying the operator
        operator = record.get('operator', self.useOperator)
        if operator not in self.operators:
            raise RuntimeError("operator not valid: %s" % escape(operator))

        # Range parameter
        range_parm = record.get('range', None)
        if range_parm:
            opr = "range"
            opr_args = []
            if range_parm.find("min") > -1:
                opr_args.append("min")
            if range_parm.find("max") > -1:
                opr_args.append("max")

        if record.get('usage', None):
            # see if any usage params are sent to field
            usage = record.usage.lower().split(':')
            opr, opr_args = usage[0], usage[1:]

        if opr == "range":  # range search
            if 'min' in opr_args:
                lo = min(record.keys)
            else:
                lo = None
            if 'max' in opr_args:
                hi = max(record.keys)
            else:
                hi = None
            # Test against None explicitly: a falsy upper bound (0, '') is
            # still a bound and must not widen the query to "no maximum".
            if hi is not None:
                setlist = index.values(lo, hi)
            else:
                setlist = index.values(lo)

            if len(setlist) == 1:
                # Only one key matched: use its set as is (it is not
                # intersected with resultset).
                r = setlist[0]
                if isinstance(r, int):
                    r = IISet((r, ))
            else:
                # a single document id is stored as a bare int; wrap those
                # so that every entry is a set
                wrapped = []
                for s in setlist:
                    if isinstance(s, int):
                        s = IISet((s, ))
                    wrapped.append(s)

                if operator == 'or':
                    r = multiunion(wrapped)
                else:
                    # For intersection, sort with smallest data set first
                    if len(wrapped) > 2:
                        wrapped = sorted(wrapped, key=len)
                    r = resultset
                    for s in wrapped:
                        # the result is bound by the resultset
                        r = intersection(r, s)

        else:  # not a range search
            setlist = []
            for k in record.keys:
                s = index.get(k, None)
                # If None, try to bail early
                if s is None:
                    if operator == 'or':
                        # If union, we can't possibly get a bigger result
                        continue
                    # If intersection, we can't possibly get a smaller result
                    return IISet(), (self.id, )
                elif isinstance(s, int):
                    s = IISet((s, ))
                setlist.append(s)

            if len(setlist) == 1:
                # Only one key matched: use its set as is.
                r = setlist[0]
            elif operator == 'or':
                # If we already get a small result set passed in, intersecting
                # the various indexes with it and doing the union later is
                # faster than creating a multiunion first.
                if resultset is not None and len(resultset) < 200:
                    smalllist = [intersection(resultset, s) for s in setlist]
                    r = multiunion(smalllist)
                else:
                    r = multiunion(setlist)
            else:
                # For intersection, sort with smallest data set first
                if len(setlist) > 2:
                    setlist = sorted(setlist, key=len)
                r = resultset
                for s in setlist:
                    r = intersection(r, s)

        if isinstance(r, int):
            r = IISet((r, ))
        if r is None:
            return IISet(), (self.id, )
        if not_parm:
            exclude = self._apply_not(not_parm, resultset)
            r = difference(r, exclude)
        return r, (self.id, )
Esempio n. 20
0
    def count(self, brains, sequence=None):
        """ Intersect results

        Return a dict mapping each value to the number of items in
        ``brains`` that match it.  The keys '' and 'all' hold the total
        number of items.

        ``brains`` is either an object with a ``facet_counts`` attribute
        (assumed to be a Solr response, whose own counts are used) or a
        catalog result.  ``sequence`` lists the values to count; when it
        is empty the keys of ``self.vocabulary()`` are used.

        An empty dict is returned when there is nothing to count: no facet
        fields in the Solr response, no values, no configured index or no
        faceted catalog utility.
        """
        res = {}

        def store(value, normalized_value, num):
            # File the count under a unicode form of the value.
            if isinstance(value, unicode):
                res[value] = num
            elif isinstance(normalized_value, unicode):
                res[normalized_value] = num
            else:
                res[value.decode('utf-8')] = num

        # by checking for facet_counts we assume this is a SolrResponse
        # from collective.solr
        if hasattr(brains, 'facet_counts'):
            facet_fields = brains.facet_counts.get('facet_fields')
            if not facet_fields:
                # no facet counts were returned. we exit anyway because
                # zcatalog methods throw an error on solr responses
                return res
            index_id = self.data.get('index')
            facet_field = facet_fields.get(index_id, {})
            for value, num in facet_field.items():
                store(value, atdx_normalize(value), num)
            res[""] = res['all'] = len(brains)
            return res

        # everything below is handled by the zcatalog

        if not sequence:
            sequence = [key for key, value in self.vocabulary()]

        if not sequence:
            return res

        index_id = self.data.get('index')
        if not index_id:
            return res

        catalog_tool = getToolByName(self.context, 'portal_catalog')
        index = catalog_tool._catalog.getIndex(index_id)
        faceted_catalog = queryUtility(IFacetedCatalog)
        if not faceted_catalog:
            return res

        if isinstance(brains, LazyMap):
            values = brains._seq
            # 75384 seq might be a pair of tuples instead of ints
            # if you upgrade to ZCatalog 3
            try:
                first = values[0]
            except IndexError:
                # an empty result has no first item to inspect
                first = None
            if isinstance(first, tuple):
                values = [v[1] for v in values]
            brains = IISet(values)
        else:
            brains = IISet(brain.getRID() for brain in brains)

        total = len(brains)
        res[""] = res['all'] = total
        for value in sequence:
            if not value:
                # an empty value stands for "everything"
                res[value] = total
                continue
            normalized_value = atdx_normalize(value)
            rset = faceted_catalog.apply_index(
                self.context, index, normalized_value)[0]
            rset = IISet(rset)
            rset = weightedIntersection(brains, rset)[1]
            store(value, normalized_value, len(rset))
        return res
Esempio n. 21
0
 def _makeOne(self):
     """Build and return a new, empty ``IISet``."""
     from BTrees import IIBTree
     return IIBTree.IISet()
def dateindex_apply_index(self, request, cid='', type=type, res=None):
    """Apply the date index to the query found in ``request``.

    The query record is extracted with ``parseIndexRequest`` using
    ``self.id`` and ``self.query_options``; every query key is passed
    through ``self._convert`` before it is looked up in ``self._index``.

    Two kinds of search are supported:

      - range search, selected either by a ``range`` option containing
        ``min`` and/or ``max`` or by a ``usage`` option of the form
        ``range:min[:max]``.  The index values between the smallest
        and/or largest converted key are combined with ``multiunion``.

      - plain search.  The index entry of each key is combined with
        ``union`` (operator ``or``) or ``intersection`` (any other valid
        operator).  Non-integer entries are first intersected with
        ``res``.

    ``cid`` and ``type`` are accepted but not used by this function.

    Returns ``None`` when the parsed record has no keys, otherwise a
    ``(result_set, (self.id,))`` tuple.

    Raises ``RuntimeError`` when the requested operator is not listed in
    ``self.operators``.
    """
    record = parseIndexRequest(request, self.id, self.query_options)
    if record.keys is None:
        return None

    # A real list is needed: it is consumed by min(), max() and the loop.
    keys = [self._convert(key) for key in record.keys]

    index = self._index
    r = None
    opr = None

    # experimental code for specifying the operator
    operator = record.get('operator', self.useOperator)
    if operator not in self.operators:
        raise RuntimeError("operator not valid: %s" % operator)

    # depending on the operator we use intersection or union
    if operator == "or":
        set_func = union
    else:
        set_func = intersection

    # range parameter
    range_arg = record.get('range', None)
    if range_arg:
        opr = "range"
        opr_args = []
        if range_arg.find("min") > -1:
            opr_args.append("min")
        if range_arg.find("max") > -1:
            opr_args.append("max")

    if record.get('usage', None):
        # see if any usage params are sent to field
        opr = record.usage.lower().split(':')
        opr, opr_args = opr[0], opr[1:]

    if opr == "range":   # range search
        if 'min' in opr_args:
            lo = min(keys)
        else:
            lo = None

        if 'max' in opr_args:
            hi = max(keys)
        else:
            hi = None

        # Compare against None instead of relying on truthiness so that an
        # upper bound of 0 is still honoured.
        if hi is not None:
            setlist = index.values(lo, hi)
        else:
            setlist = index.values(lo)

        r = multiunion(setlist)

    else:  # not a range search
        for key in keys:
            docids = index.get(key, None)
            if docids is not None:
                if isinstance(docids, int):
                    # a single document id is stored as a bare integer
                    docids = IISet((docids,))
                else:
                    # the entry can't be bigger than res
                    docids = intersection(docids, res)
                r = set_func(r, docids)

    if isinstance(r, int):
        r = IISet((r,))

    if r is None:
        return IISet(), (self.id,)
    else:
        return r, (self.id,)
Esempio n. 23
0
    def _apply_index(self, request, cid='', type=type):
        """Apply the index to query parameters given in the request arg.

        The request argument should be a mapping object.

        If the request does not have a key which matches the "id" of
        the index instance, then None is returned.

        If the request *does* have a key which matches the "id" of
        the index instance, one of a few things can happen:

          - if the value is a blank string, None is returned (in
            order to support requests from web forms where
            you can't tell a blank string from empty).

          - if the value is a nonblank string, turn the value into
            a single-element sequence, and proceed.

          - if the value is a sequence, return a union search.

        If the request contains a parameter with the name of the
        column + '_usage', it is sniffed for information on how to
        handle applying the index.

        If the request contains a parameter with the name of the
        column + '_operator' this overrides the default method
        ('or') to combine search results. Valid values are "or"
        and "and".

        If None is not returned as a result of the abovementioned
        constraints, two objects are returned.  The first object is a
        ResultSet containing the record numbers of the matching
        records.  The second object is a tuple containing the names of
        all data fields used.

        A RuntimeError is raised when the requested operator is not
        listed in self.operators.

        FAQ answer:  to search a Field Index for documents that
        have a blank string as their value, wrap the request value
        up in a tuple ala: request = {'id':('',)}
        """
        record = parseIndexRequest(request, self.id, self.query_options)
        if record.keys is None:
            return None

        index = self._index
        r = None
        opr = None

        # experimental code for specifying the operator
        operator = record.get('operator', self.useOperator)
        if operator not in self.operators:
            raise RuntimeError("operator not valid: %s" % escape(operator))

        # depending on the operator we use intersection or union
        if operator == "or":
            set_func = union
        else:
            set_func = intersection

        # Range parameter
        range_parm = record.get('range', None)
        if range_parm:
            opr = "range"
            opr_args = []
            if range_parm.find("min") > -1:
                opr_args.append("min")
            if range_parm.find("max") > -1:
                opr_args.append("max")

        if record.get('usage', None):
            # see if any usage params are sent to field
            opr = record.usage.lower().split(':')
            opr, opr_args = opr[0], opr[1:]

        if opr == "range":   # range search
            if 'min' in opr_args:
                lo = min(record.keys)
            else:
                lo = None
            if 'max' in opr_args:
                hi = max(record.keys)
            else:
                hi = None

            # Compare against None instead of relying on truthiness so
            # that falsy upper bounds such as 0 or '' are still honoured.
            if hi is not None:
                setlist = index.items(lo, hi)
            else:
                setlist = index.items(lo)

            for _key, docids in setlist:
                if isinstance(docids, int):
                    # a single record number is stored as a bare integer
                    docids = IISet((docids,))
                r = set_func(r, docids)
        else:  # not a range search
            for key in record.keys:
                docids = index.get(key, None)
                if docids is None:
                    # a missing key contributes an empty set, which
                    # matters when the results are intersected
                    docids = IISet(())
                elif isinstance(docids, int):
                    docids = IISet((docids,))
                r = set_func(r, docids)

        if isinstance(r, int):
            r = IISet((r,))
        if r is None:
            return IISet(), (self.id,)
        else:
            return r, (self.id,)
Esempio n. 24
0
 def _loadAux(self, index, term):
     '''the docId list for *term*.'''
     dl = index.get(term)
     if dl is None: return IISet()
     if isinstance(dl, int): return IISet((dl, ))
     return dl
Esempio n. 25
0
    def sortResults(self,
                    rs,
                    sort_index,
                    reverse=0,
                    limit=None,
                    merge=1,
                    actual_result_count=None,
                    b_start=0,
                    b_size=None):
        """Sort the result set ``rs`` by the keys held in ``sort_index``.

        Arguments:

          rs -- the documents to sort; when it has a ``keys`` attribute,
            ``rs.keys()`` is used instead.

          sort_index -- index object; its ``documentToKeyMap()``,
            ``items()`` and length are used here.

          reverse -- true to sort in descending key order.

          limit -- maximum number of results wanted, or None for no
            limit; values of at least the result count are clamped to it.

          merge -- when true the sorted results are wrapped in a lazy
            sequence; when false the sequence produced by
            ``self._limit_sequence`` is returned directly.

          actual_result_count -- when given it is used as the result
            count instead of ``len(rs)``.

          b_start, b_size -- batch offset and batch size, forwarded to
            ``self._limit_sequence``.  A batch starting in the second half
            of the results is fetched by sorting in the opposite direction.

        One of three strategies is chosen: walking the sort index and
        intersecting each of its sets with ``rs``, a full sort of
        ``(key, did, getter)`` tuples, or an N-best selection that keeps
        at most ``limit`` entries.

        Returns a ``LazyMap`` over the sorted result when ``merge`` is
        true, the plain limited sequence when it is false, or an empty
        ``LazyCat`` when the requested batch starts beyond the results.
        """
        # Sort a result set using a sort index. Return a lazy
        # result set in sorted order if merge is true otherwise
        # returns a list of (sortkey, uid, getter_function) tuples
        #
        # The two 'for' loops in here contribute a significant
        # proportion of the time to perform an indexed search.
        # Try to avoid all non-local attribute lookup inside
        # those loops.
        _intersection = intersection
        _self__getitem__ = self.__getitem__
        index_key_map = sort_index.documentToKeyMap()
        _None = None
        _keyerror = KeyError
        result = []
        append = result.append
        if hasattr(rs, 'keys'):
            rs = rs.keys()
        if actual_result_count is None:
            rlen = len(rs)
            actual_result_count = rlen
        else:
            rlen = actual_result_count

        # don't limit to more than what we have
        if limit is not None and limit >= rlen:
            limit = rlen

        # if we want a batch from the end of the resultset, reverse sorting
        # order and limit it, then reverse the resultset again
        # NOTE(review): ``rlen / 2`` is an integer division only under
        # Python 2 -- confirm before porting.
        switched_reverse = False
        if b_size and b_start and b_start > rlen / 2:
            reverse = not reverse
            switched_reverse = True
            b_end = b_start + b_size
            if b_end >= rlen:
                # overrun is zero or negative here, so adding it to b_size
                # shrinks the batch to what is left in the result set
                overrun = rlen - b_end
                if b_start >= rlen:
                    # bail out, we are outside the possible range
                    return LazyCat([], 0, actual_result_count)
                else:
                    b_size += overrun
                b_start = 0
            else:
                b_start = b_end - b_start
            limit = b_start + b_size

        if merge and limit is None and (rlen >
                                        (len(sort_index) * (rlen / 100 + 1))):
            # The result set is much larger than the sorted index,
            # so iterate over the sorted index for speed.
            # This is rarely exercised in practice...

            length = 0

            try:
                intersection(rs, IISet(()))
            except TypeError:
                # rs is not an object in the IIBTree family.
                # Try to turn rs into an IISet.
                rs = IISet(rs)

            for k, intset in sort_index.items():
                # We have an index that has a set of values for
                # each sort key, so we intersect with each set and
                # get a sorted sequence of the intersections.
                intset = _intersection(rs, intset)
                if intset:
                    keys = getattr(intset, 'keys', _None)
                    if keys is not _None:
                        # Is this ever true?
                        intset = keys()
                    length += len(intset)
                    append((k, intset, _self__getitem__))
                    # Note that sort keys are unique.

            if reverse:
                result.sort(reverse=True)
            else:
                result.sort()
            sequence, slen = self._limit_sequence(result, length, b_start,
                                                  b_size, switched_reverse)
            result = LazyCat(LazyValues(sequence), slen, actual_result_count)
        elif limit is None or (limit * 4 > rlen):
            # Iterate over the result set getting sort keys from the index
            for did in rs:
                try:
                    key = index_key_map[did]
                except _keyerror:
                    # This document is not in the sort key index, skip it.
                    pass
                else:
                    append((key, did, _self__getitem__))
                    # The reference back to __getitem__ is used in case
                    # we do not merge now and need to intermingle the
                    # results with those of other catalogs while avoiding
                    # the cost of instantiating a LazyMap per result
            if merge:
                if reverse:
                    result.sort(reverse=True)
                else:
                    result.sort()
                if limit is not None:
                    result = result[:limit]
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                                   switched_reverse)
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                                   switched_reverse)
                return sequence
        elif reverse:
            # Limit/sort results using N-Best algorithm
            # This is faster for large sets then a full sort
            # And uses far less memory
            # ``keys`` is kept in ascending order through bisect insertion
            # and ``result`` is maintained in the same order alongside it.
            keys = []
            n = 0
            worst = None
            for did in rs:
                try:
                    key = index_key_map[did]
                except _keyerror:
                    # This document is not in the sort key index, skip it.
                    pass
                else:
                    if n >= limit and key <= worst:
                        continue
                    i = bisect(keys, key)
                    keys.insert(i, key)
                    result.insert(i, (key, did, _self__getitem__))
                    if n == limit:
                        # already held ``limit`` entries before this
                        # insert, so drop the smallest key again
                        del keys[0], result[0]
                    else:
                        n += 1
                    worst = keys[0]
            result.reverse()
            if merge:
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                                   switched_reverse)
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                                   switched_reverse)
                return sequence
        elif not reverse:
            # Limit/sort results using N-Best algorithm in reverse (N-Worst?)
            # Same bookkeeping as above, but the largest key is the one
            # that gets dropped once ``limit`` entries are held.
            keys = []
            n = 0
            best = None
            for did in rs:
                try:
                    key = index_key_map[did]
                except _keyerror:
                    # This document is not in the sort key index, skip it.
                    pass
                else:
                    if n >= limit and key >= best:
                        continue
                    i = bisect(keys, key)
                    keys.insert(i, key)
                    result.insert(i, (key, did, _self__getitem__))
                    if n == limit:
                        del keys[-1], result[-1]
                    else:
                        n += 1
                    best = keys[-1]
            if merge:
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                                   switched_reverse)
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                                   switched_reverse)
                return sequence

        # Only the merge paths reach this point; the non-merge paths have
        # already returned their sequence above.
        return LazyMap(self.__getitem__,
                       result,
                       len(result),
                       actual_result_count=actual_result_count)
Esempio n. 26
0
    def index_object(self, documentId, obj, threshold=None):
        """Index ``obj`` under ``documentId``, storing its dates as integers.

        The attribute named ``self.id`` is read from ``obj`` (and called
        when it is callable).  Without a recurrence definition (attribute
        ``self.attr_recurdef``) the single date is indexed; otherwise the
        dates produced by ``recurrence_sequence_ical`` are indexed, bounded
        by the optional ``self.attr_until`` attribute.  Every date is
        converted with ``dt2int``.

        Notes carried over from the original documentation:

           o Normalized value has granularity of one minute.

           o Objects which have 'None' as indexed value are *omitted*,
             by design.

           o Repeat by recurdef - a RFC2445 recurrence definition string

        NOTE(review): the first two points are not enforced in this
        method; presumably ``pydt``/``dt2int`` take care of them -- confirm.

        ``threshold`` is accepted but not used here.

        Returns 1 when index entries were written for the document and 0
        otherwise (attribute missing, values unchanged, or no values).
        """
        returnStatus = 0

        try:
            date_attr = getattr(obj, self.id)
            if safe_callable(date_attr):
                date_attr = date_attr()
        except AttributeError:
            return returnStatus

        recurdef = getattr(obj, self.attr_recurdef, None)
        if safe_callable(recurdef):
            recurdef = recurdef()

        if not recurdef:
            dates = [pydt(date_attr)]
        else:
            until = getattr(obj, self.attr_until, None)
            if safe_callable(until):
                until = until()

            dates = recurrence_sequence_ical(date_attr,
                                             recrule=recurdef,
                                             until=until)

        # newvalues is always an IISet (possibly empty), never _marker.
        newvalues = IISet(map(dt2int, dates))
        oldvalues = self._unindex.get(documentId, _marker)

        if oldvalues is not _marker:
            oldvalues = IISet(oldvalues)

            # difference is calculated relative to first argument, so we
            # have to use it twice here
            if not difference(newvalues, oldvalues)\
                    and not difference(oldvalues, newvalues):
                # nothing changed for this document
                return returnStatus

            for oldvalue in oldvalues:
                self.removeForwardIndexEntry(oldvalue, documentId)

            if not newvalues:
                # Nothing replaces the old values, so the reverse index
                # entry has to go as well or it would point to forward
                # entries that have just been removed.
                try:
                    del self._unindex[documentId]
                except ConflictError:
                    raise
                except Exception:
                    LOG.error("Should not happen: oldvalues was there,"
                              " now it's not, for document with id %s" %
                              documentId)

        for value in newvalues:
            self.insertForwardIndexEntry(value, documentId)

        if newvalues:
            # store tuple values in reverse index entries for sorting
            self._unindex[documentId] = tuple(newvalues)
            returnStatus = 1

        return returnStatus
Esempio n. 27
0
def extendedpathindex_search(self,
                             path,
                             default_level=0,
                             depth=-1,
                             navtree=0,
                             navtree_start=0,
                             tmpres=None):
    """Search the path index and return the set of matching document ids.

    path is either a string representing a
    relative URL or a part of a relative URL or
    a tuple (path,level).

    default_level specifies the level to use when no more specific
    level has been passed in with the path.

    level >= 0  starts searching at the given level
    level <  0  finds matches at *any* level

    depth lets you limit the results to items at most depth levels deeper
    than the matched path. depth == 0 means no subitems are included at all,
    with depth == 1 only direct children are included, etc. depth == -1, the
    default, returns all children at any depth.

    navtree is treated as a boolean; if it evaluates to True, not only the
    query match is returned, but also each container in the path. If depth
    is greater than 0, also all siblings of those containers, as well as the
    siblings of the match are included as well, plus *all* documents at the
    starting level.

    navtree_start limits what containers are included in a navtree search.
    If greater than 0, only containers (and possibly their siblings) at that
    level and up will be included in the resultset.

    tmpres, when given, seeds the per-component intersection of the core
    search. It is not consulted by the shortcut branches that return early.
    """
    if isinstance(path, basestring):
        level = default_level
    else:
        level = int(path[1])
        path = path[0]

    if level < 0:
        # Search at every level, return the union of all results
        return multiunion([
            self.search(path, lvl, depth, navtree, navtree_start)
            for lvl in xrange(self._depth + 1)
        ])

    # Path components without the empty strings left by '/' separators
    comps = [part for part in path.split('/') if part]

    if navtree and depth == -1:  # Navtrees don't do recursive
        depth = 1

    #
    # Optimisations
    #

    pathlength = level + len(comps) - 1
    if navtree and navtree_start > min(pathlength + depth, self._depth):
        # This navtree_start excludes all items that match the depth
        return IISet()
    if pathlength > self._depth:
        # Our search is for a path longer than anything in the index
        return IISet()

    if level == 0 and depth in (0, 1):
        # We have easy indexes for absolute paths where
        # we are looking for depth 0 or 1 result sets
        if navtree:
            # Optimized absolute path navtree and breadcrumbs cases
            if depth == 1:
                # Navtree case, all sibling elements along the path
                convert = multiunion
                index = self._index_parents
            else:
                # Breadcrumbs case, all direct elements along the path
                convert = IISet
                index = self._index_items
            # Collect all results along the path, skipping unknown paths
            result = []
            for i in range(len(comps), navtree_start - 1, -1):
                parent_path = '/' + '/'.join(comps[:i])
                entry = index.get(parent_path)
                if entry is not None:
                    result.append(entry)
            return convert(result)

        if not path.startswith('/'):
            path = '/' + path
        if depth == 0:
            # Specific object search
            res = self._index_items.get(path)
            # Test for None explicitly: a document id of 0 is falsy but
            # still a valid match.
            if res is None:
                return IISet()
            return IISet([res])
        else:
            # Single depth search
            return self._index_parents.get(path, IISet())

    # Avoid using the root set
    # as it is common for all objects anyway and add overhead
    # There is an assumption about all indexed values having the
    # same common base path
    if level == 0:
        indexpath = list(filter(None, self.getPhysicalPath()))
        minlength = min(len(indexpath), len(comps))
        # Truncate path to first different element
        for i in xrange(minlength):
            if indexpath[i] != comps[i]:
                break
            level += 1
        comps = comps[level:]

    if not comps and depth == -1:
        # Recursive search for everything
        return IISet(self._unindex)

    #
    # Core application of the indexes
    #

    pathset = tmpres  # Same as pathindex
    depthset = None  # For limiting depth

    if navtree and depth > 0:
        # Include the elements up to the matching path
        depthset = multiunion([
            self._index.get(None, {}).get(i, IISet())
            for i in range(min(navtree_start, level),
                           max(navtree_start, level) + 1)
        ])

    indexedcomps = enumerate(comps)
    if not navtree:
        # Optimize relative-path searches by starting with the
        # presumed smaller sets at the end of the path first
        # We can't do this for the navtree case because it needs
        # the bigger rootset to include siblings along the way.
        indexedcomps = list(indexedcomps)
        indexedcomps.reverse()

    for i, comp in indexedcomps:
        # Find all paths that have comp at the given level
        res = self._index.get(comp, {}).get(i + level)
        if res is None:  # Non-existing path; navtree is inverse, keep going
            pathset = IISet()
            if not navtree:
                return pathset
        pathset = intersection(pathset, res)

        if navtree and i + level >= navtree_start:
            depthset = union(
                depthset,
                intersection(pathset,
                             self._index.get(None, {}).get(i + level)))

    if depth >= 0:
        # Limit results to those that terminate within depth levels
        start = len(comps) - 1
        if navtree:
            start = max(start, (navtree_start - level))
        depthset = multiunion(
            filter(None, [depthset] + [
                intersection(pathset,
                             self._index.get(None, {}).get(i + level))
                for i in xrange(start, start + depth + 1)
            ]))

    if navtree or depth >= 0:
        return depthset
    return pathset
Esempio n. 28
0
    def __call__(self, context, **query):
        """Run a faceted catalog query for ``context`` and return the brains.

        The search callable is ``portal_faceted.search`` when that tool
        can be found on ``context`` and ``portal_catalog.searchResults``
        otherwise.

        The base query comes from ``context.buildQuery()`` when available,
        or from the collection settings of ``context`` (run through
        ``parseFormquery``).  The keyword arguments in ``query`` are
        applied on top of it, and a ``QueryWillBeExecutedEvent`` is
        notified before searching.

        When the resulting query has a key starting with ``:has_child``,
        its nested ``query`` is run first and the main results are
        restricted to the parents of the matching children.  The sorting
        and batching parameters (``sort_on``, ``sort_order``, ``limit``,
        ``b_start``, ``b_size``) are then applied by this method instead
        of by the catalog search.

        Raises ``Exception`` when more than one ``:has_child`` key is
        present.
        """
        ctool = getToolByName(context, 'portal_faceted', None)
        if ctool:
            search = ctool.search
        else:
            logger.debug('portal_faceted not present, using portal_catalog')
            ctool = getToolByName(context, 'portal_catalog')
            search = ctool.searchResults

        # Also get query from Topic
        buildQuery = getattr(context, 'buildQuery', None)
        newquery = buildQuery and buildQuery() or {}
        formquery = None

        # Get query from Collection
        if HAS_PAT:
            if PACI.ICollection.providedBy(context):
                infos = ICollection_behavior(context)
                sort_order = ('descending'
                              if infos.sort_reversed else 'ascending')
                sort_on = infos.sort_on
                formquery = infos.query

        if ICollection.providedBy(context):
            getRawQuery = getattr(context, 'getRawQuery', lambda: [])
            formquery = getRawQuery()

            getSortOn = getattr(context, 'getSort_on', lambda: None)
            sort_on = getSortOn()

            if sort_on:
                getSortReversed = getattr(context, 'getSort_reversed',
                                          lambda: None)
                sort_order = getSortReversed()
                if sort_order:
                    sort_order = 'descending'
                else:
                    sort_order = 'ascending'
            else:
                sort_order = None

        # formquery is only set by the branches above, which also bind
        # sort_on and sort_order
        if formquery is not None:
            newquery = parseFormquery(context, formquery, sort_on, sort_order)

        if not isinstance(newquery, dict):
            newquery = {}

        # Avoid mixing sorting params from faceted and collection
        if 'sort_on' not in query:
            query.pop('sort_order', None)

        if 'sort_on' in query and 'sort_order' not in query:
            newquery.pop('sort_order', None)

        newquery.update(query)

        notify(QueryWillBeExecutedEvent(context, newquery))

        # code above is unchanged from original code

        # example of query:
        #{'sort_order': 'descending',
        # 'Language': ['fr', ''],
        # 'portal_type': {'query': ['operationalobjective']},
        # 'sort_on': 'created',
        # ':has_child': {'query': {
        #     'portal_type': {'query': ['pstaction']},
        #     'planned_end_date': {'query': DateTime('2016/08/12 12:49:6.218718 GMT+2'), 'range': 'max'}}}}

        has_child_filters = [
            k for k in newquery if k.startswith(':has_child')
        ]
        if len(has_child_filters) > 1:
            raise Exception('We only support one :has_child filter')

        if has_child_filters:
            # Sorting and batching are taken out of the catalog query and
            # applied below, after the parent/child intersection.
            sort_on = newquery.pop('sort_on', None)
            sort_order = newquery.pop('sort_order', None)
            limit = newquery.pop('limit', None)
            b_start = int(newquery.pop('b_start', 0))
            b_size = newquery.pop('b_size', None)
            if b_size is not None:
                b_size = int(b_size)

            if b_size is not None:
                limit = b_start + b_size
            elif limit and b_size is None:
                b_size = limit

            has_child_filter = deepcopy(
                newquery[has_child_filters[0]]['query'])
            # be sure we don't do a useless sort and we have all results
            has_child_filter.pop('sort_on', None)
            has_child_filter.pop('sort_order', None)
            has_child_filter.pop('limit', None)
            has_child_filter.pop('b_size', None)
            children_results = search(**has_child_filter)
            parentRIDs = IISet()
            for b in children_results:
                # TODO: possible optimization is to add parentRID as brain metadata
                parentRID = ctool._catalog.uids.get('/'.join(
                    b.getPath().split('/')[:-1]))
                # A parent that is not catalogued has no record id; it
                # cannot match any brain and an IISet only takes integers.
                if parentRID is not None:
                    parentRIDs.add(parentRID)

            brains = search(**newquery)
            # brains should be a LazyMap, access direct to brains._seq which is a list of rids
            # instead of doing IISet(brain.getRID() for brain in brains)
            rs = weightedIntersection(IISet(brains._seq), parentRIDs)[1]

            rlen = len(rs)
            if sort_on is not None:
                # We only support single sort order
                sort_index = ctool._catalog.indexes[sort_on]
                reverse = 1 if sort_order == 'descending' else 0
                brains = ctool._catalog.sortResults(rs,
                                                    sort_index,
                                                    reverse,
                                                    limit,
                                                    merge=1,
                                                    actual_result_count=rlen,
                                                    b_start=b_start,
                                                    b_size=b_size)
            else:
                # rebuild the same kind of lazy sequence over the
                # restricted record ids
                brains = brains.__class__(brains._func, rs, rlen, rlen)
        else:
            brains = search(**newquery)

        return brains
Esempio n. 29
0
 def test_sortResults_reversed(self):
     """sortResults with reverse=True on the 'num' index yields 99..0."""
     catalog = self._catalog
     record_ids = IISet([brain.getRID()
                         for brain in catalog({'att1': 'att1'})])
     sort_index = catalog.getIndex('num')
     sorted_brains = catalog.sortResults(record_ids, sort_index,
                                         reverse=True)
     actual = [brain.num for brain in sorted_brains]
     expected = list(range(99, -1, -1))
     self.assertEqual(actual, expected)
Esempio n. 30
0
    def search(self,
               path,
               default_level=0,
               depth=-1,
               navtree=0,
               navtree_start=0):
        """
        path is either a string representing a
        relative URL or a part of a relative URL or
        a tuple (path,level).

        level >= 0  starts searching at the given level
        level <  0  not implemented yet
        """

        if isinstance(path, basestring):
            startlevel = default_level
        else:
            startlevel = int(path[1])
            path = path[0]

        absolute_path = isinstance(path, basestring) and path.startswith('/')

        comps = filter(None, path.split('/'))

        orig_comps = [''] + comps[:]
        # Optimization - avoid using the root set
        # as it is common for all objects anyway and add overhead
        # There is an assumption about catalog/index having
        # the same container as content
        if default_level == 0:
            indexpath = list(filter(None, self.getPhysicalPath()))
            while min(len(indexpath), len(comps)):
                if indexpath[0] == comps[0]:
                    del indexpath[0]
                    del comps[0]
                    startlevel += 1
                else:
                    break

        if len(comps) == 0:
            if depth == -1 and not navtree:
                return IISet(self._unindex.keys())

        # Make sure that we get depth = 1 if in navtree mode
        # unless specified otherwise

        orig_depth = depth
        if depth == -1:
            depth = 0 or navtree

        # Optimized navtree starting with absolute path
        if absolute_path and navtree and depth == 1 and default_level == 0:
            set_list = []
            # Insert root element
            if navtree_start >= len(orig_comps):
                navtree_start = 0
            # create a set of parent paths to search
            for i in range(len(orig_comps), navtree_start, -1):
                parent_path = '/'.join(orig_comps[:i])
                parent_path = parent_path and parent_path or '/'
                try:
                    set_list.append(self._index_parents[parent_path])
                except KeyError:
                    pass
            return multiunion(set_list)
        # Optimized breadcrumbs
        elif absolute_path and navtree and depth == 0 and default_level == 0:
            item_list = IISet()
            # Insert root element
            if navtree_start >= len(orig_comps):
                navtree_start = 0
            # create a set of parent paths to search
            for i in range(len(orig_comps), navtree_start, -1):
                parent_path = '/'.join(orig_comps[:i])
                parent_path = parent_path and parent_path or '/'
                try:
                    item_list.insert(self._index_items[parent_path])
                except KeyError:
                    pass
            return item_list
        # Specific object search
        elif absolute_path and orig_depth == 0 and default_level == 0:
            try:
                return IISet([self._index_items[path]])
            except KeyError:
                return IISet()
        # Single depth search
        elif absolute_path and orig_depth == 1 and default_level == 0:
            # only get objects contained in requested folder
            try:
                return self._index_parents[path]
            except KeyError:
                return IISet()
        # Sitemaps, relative paths, and depth queries
        elif startlevel >= 0:

            pathset = None  # Same as pathindex
            navset = None  # For collecting siblings along the way
            depthset = None  # For limiting depth

            if navtree and depth and \
                   self._index.has_key(None) and \
                   self._index[None].has_key(startlevel):
                navset = self._index[None][startlevel]

            for level in range(startlevel, startlevel + len(comps) + depth):
                if level - startlevel < len(comps):
                    comp = comps[level - startlevel]
                    if not self._index.has_key(
                            comp) or not self._index[comp].has_key(level):
                        # Navtree is inverse, keep going even for
                        # nonexisting paths
                        if navtree:
                            pathset = IISet()
                        else:
                            return IISet()
                    else:
                        pathset = intersection(pathset,
                                               self._index[comp][level])
                    if navtree and depth and \
                           self._index.has_key(None) and \
                           self._index[None].has_key(level+depth):
                        navset = union(
                            navset,
                            intersection(pathset,
                                         self._index[None][level + depth]))
                if level - startlevel >= len(comps) or navtree:
                    if self._index.has_key(None) and self._index[None].has_key(
                            level):
                        depthset = union(
                            depthset,
                            intersection(pathset, self._index[None][level]))

            if navtree:
                return union(depthset, navset) or IISet()
            elif depth:
                return depthset or IISet()
            else:
                return pathset or IISet()

        else:
            results = IISet()
            for level in range(0, self._depth + 1):
                ids = None
                error = 0
                for cn in range(0, len(comps)):
                    comp = comps[cn]
                    try:
                        ids = intersection(ids, self._index[comp][level + cn])
                    except KeyError:
                        error = 1
                if error == 0:
                    results = union(results, ids)
            return results